4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2014, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
53 #include <lustre_ioctl.h>
55 #include "cl_object.h"
57 #include "llite_internal.h"
58 #include "vvp_internal.h"
/* Forward declarations for helpers defined later in this file. */
61 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
63 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
66 static enum llioc_iter
67 ll_iocontrol_call(struct inode *inode, struct file *file,
68 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate a per-open ll_file_data from its dedicated slab.  GFP_NOFS is
 * used to avoid recursing back into the filesystem during reclaim.
 * NOTE(review): the allocation-failure check and RETURN are elided in this
 * excerpt — confirm against the full source. */
70 static struct ll_file_data *ll_file_data_get(void)
72 struct ll_file_data *fd;
74 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Start each open with a clean write-failure state. */
78 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab; counterpart of ll_file_data_get(). */
83 static void ll_file_data_put(struct ll_file_data *fd)
86 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
90 * Packs all the attributes into @op_data for the CLOSE rpc.
92 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
93 struct obd_client_handle *och)
/* Initialize op_data for this inode, then copy the current inode
 * attributes (mode, times, size, blocks, flags) so the MDT can
 * update its copy when the file is closed. */
97 ll_prep_md_op_data(op_data, inode, NULL, NULL,
98 0, 0, LUSTRE_OPC_ANY, NULL);
100 op_data->op_attr.ia_mode = inode->i_mode;
101 op_data->op_attr.ia_atime = inode->i_atime;
102 op_data->op_attr.ia_mtime = inode->i_mtime;
103 op_data->op_attr.ia_ctime = inode->i_ctime;
104 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark every copied attribute as valid for the close request. */
105 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
106 ATTR_MTIME | ATTR_MTIME_SET |
107 ATTR_CTIME | ATTR_CTIME_SET;
108 op_data->op_attr_blocks = inode->i_blocks;
109 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* Identify which open handle on the MDT is being closed. */
110 op_data->op_handle = och->och_fh;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
114 /* For HSM: if inode data has been modified, pack it so that
115 * MDT can set data dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
126 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
129 static int ll_close_inode_openhandle(struct obd_export *md_exp,
130 struct obd_client_handle *och,
132 enum mds_op_bias bias,
135 struct obd_export *exp = ll_i2mdexp(inode);
136 struct md_op_data *op_data;
137 struct ptlrpc_request *req = NULL;
138 struct obd_device *obd = class_exp2obd(exp);
144 * XXX: in case of LMV, is this correct to access
/* Invalid export handle: log and bail (error path partly elided here). */
147 CERROR("Invalid MDC connection handle "LPX64"\n",
148 ll_i2mdexp(inode)->exp_handle.h_cookie);
152 OBD_ALLOC_PTR(op_data);
154 /* XXX We leak openhandle and request here. */
155 GOTO(out, rc = -ENOMEM);
/* Pack current inode attributes into the close request. */
157 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing: data's meaning depends on bias (see header). */
159 case MDS_CLOSE_LAYOUT_SWAP:
160 LASSERT(data != NULL);
161 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
162 op_data->op_data_version = 0;
163 op_data->op_lease_handle = och->och_lease_handle;
/* data is the peer inode whose layout we swap with. */
164 op_data->op_fid2 = *ll_inode2fid(data);
167 case MDS_HSM_RELEASE:
168 LASSERT(data != NULL);
169 op_data->op_bias |= MDS_HSM_RELEASE;
/* data is a pointer to the data version to release. */
170 op_data->op_data_version = *(__u64 *)data;
171 op_data->op_lease_handle = och->och_lease_handle;
172 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
176 LASSERT(data == NULL);
/* Send the close to the MDT. */
180 rc = md_close(md_exp, op_data, och->och_mod, &req);
182 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
183 ll_i2mdexp(inode)->exp_obd->obd_name,
184 PFID(ll_inode2fid(inode)), rc);
/* For biased closes, check whether the MDT actually executed the
 * intent (release/swap); if not, the server reported it in the body. */
188 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
189 struct mdt_body *body;
191 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
192 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
196 ll_finish_md_op_data(op_data);
/* Drop replay data and poison the file handle cookie so reuse is
 * detectable. */
200 md_clear_open_replay_data(md_exp, och);
201 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
204 if (req) /* This is close request */
205 ptlrpc_req_finished(req);
/* Close the MDS open handle matching @fmode (write/exec/read) if no other
 * local opens still reference it.  Returns 0 or a negative errno (the
 * RETURN paths are elided in this excerpt). */
209 int ll_md_real_close(struct inode *inode, fmode_t fmode)
211 struct ll_inode_info *lli = ll_i2info(inode);
212 struct obd_client_handle **och_p;
213 struct obd_client_handle *och;
/* Pick the per-mode open handle slot and its use count. */
218 if (fmode & FMODE_WRITE) {
219 och_p = &lli->lli_mds_write_och;
220 och_usecount = &lli->lli_open_fd_write_count;
221 } else if (fmode & FMODE_EXEC) {
222 och_p = &lli->lli_mds_exec_och;
223 och_usecount = &lli->lli_open_fd_exec_count;
225 LASSERT(fmode & FMODE_READ);
226 och_p = &lli->lli_mds_read_och;
227 och_usecount = &lli->lli_open_fd_read_count;
230 mutex_lock(&lli->lli_och_mutex);
231 if (*och_usecount > 0) {
232 /* There are still users of this handle, so skip
234 mutex_unlock(&lli->lli_och_mutex);
240 mutex_unlock(&lli->lli_och_mutex);
243 /* There might be a race and this handle may already
/* No biased close here: bias 0, no data. */
245 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
246 och, inode, 0, NULL);
/* Per-file-descriptor close: release group lock, lease and extra open
 * handle, drop this fd's open-mode refcount, and close the MDS handle
 * unless a matching OPEN lock lets us skip the RPC.  Frees @fd. */
252 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
/* Match only the OPEN inodebit; TEST_LOCK avoids taking a reference. */
255 ldlm_policy_data_t policy = {
256 .l_inodebits = { MDS_INODELOCK_OPEN },
258 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
259 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
260 struct ll_inode_info *lli = ll_i2info(inode);
261 struct lustre_handle lockh;
266 /* clear group lock, if present */
267 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
268 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
270 if (fd->fd_lease_och != NULL) {
273 /* Usually the lease is not released when the
274 * application crashed, we need to release here. */
275 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
276 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
277 PFID(&lli->lli_fid), rc, lease_broken);
279 fd->fd_lease_och = NULL;
/* Close the per-fd open handle taken for lease purposes, if any. */
282 if (fd->fd_och != NULL) {
283 rc = ll_close_inode_openhandle(md_exp, fd->fd_och, inode, 0,
289 /* Let's see if we have good enough OPEN lock on the file and if
290 we can skip talking to MDS */
291 mutex_lock(&lli->lli_och_mutex);
292 if (fd->fd_omode & FMODE_WRITE) {
294 LASSERT(lli->lli_open_fd_write_count);
295 lli->lli_open_fd_write_count--;
296 } else if (fd->fd_omode & FMODE_EXEC) {
298 LASSERT(lli->lli_open_fd_exec_count);
299 lli->lli_open_fd_exec_count--;
302 LASSERT(lli->lli_open_fd_read_count);
303 lli->lli_open_fd_read_count--;
305 mutex_unlock(&lli->lli_och_mutex);
/* Without a cached OPEN lock we must do a real close RPC to the MDS. */
307 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
308 LDLM_IBITS, &policy, lockmode, &lockh))
309 rc = ll_md_real_close(inode, fd->fd_omode);
/* Detach and free the per-fd private data. */
312 LUSTRE_FPRIVATE(file) = NULL;
313 ll_file_data_put(fd);
318 /* While this returns an error code, fput() the caller does not, so we need
319 * to make every effort to clean up all of our state here. Also, applications
320 * rarely check close errors and even if an error is returned they will not
321 * re-try the close call.
323 int ll_file_release(struct inode *inode, struct file *file)
325 struct ll_file_data *fd;
326 struct ll_sb_info *sbi = ll_i2sbi(inode);
327 struct ll_inode_info *lli = ll_i2info(inode);
331 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
332 PFID(ll_inode2fid(inode)), inode);
334 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping is only done on the root inode. */
335 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
336 inode == inode->i_sb->s_root->d_inode) {
337 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
340 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
341 fd->fd_flags &= ~LL_FILE_RMTACL;
342 rct_del(&sbi->ll_rct, current_pid());
343 et_search_free(&sbi->ll_et, current_pid());
/* Don't count releases of the root dentry in the stats. */
348 if (inode->i_sb->s_root != file->f_path.dentry)
349 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
350 fd = LUSTRE_FPRIVATE(file);
353 /* The last ref on @file, maybe not the owner pid of statahead,
354 * because parent and child process can share the same file handle. */
355 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
356 ll_deauthorize_statahead(inode, fd);
/* Root dentry never had an MDS open handle: just free fd and return. */
358 if (inode->i_sb->s_root == file->f_path.dentry) {
359 LUSTRE_FPRIVATE(file) = NULL;
360 ll_file_data_put(fd);
/* For regular files, fold any async write errors into lli_async_rc
 * so they can be reported on this close. */
364 if (!S_ISDIR(inode->i_mode)) {
365 if (lli->lli_clob != NULL)
366 lov_read_and_clear_async_rc(lli->lli_clob);
367 lli->lli_async_rc = 0;
370 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Fault-injection hook for log dumping in testing. */
372 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
373 libcfs_debug_dumplog();
/* Send an intent-OPEN request to the MDS for @file, optionally packing
 * striping info (@lmm/@lmmsize), and install the returned lock data on
 * the dentry's inode. */
378 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
379 struct lookup_intent *itp)
381 struct dentry *de = file->f_path.dentry;
382 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
383 struct dentry *parent = de->d_parent;
384 const char *name = NULL;
386 struct md_op_data *op_data;
387 struct ptlrpc_request *req = NULL;
391 LASSERT(parent != NULL);
392 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
394 /* if server supports open-by-fid, or file name is invalid, don't pack
395 * name in open request */
396 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
397 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
398 name = de->d_name.name;
399 len = de->d_name.len;
402 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
403 name, len, 0, LUSTRE_OPC_ANY, NULL);
405 RETURN(PTR_ERR(op_data));
/* lmm/lmmsize carry optional striping info for the open. */
406 op_data->op_data = lmm;
407 op_data->op_data_size = lmmsize;
409 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
410 &ll_md_blocking_ast, 0);
411 ll_finish_md_op_data(op_data);
413 /* reason for keeping own exit path - don't flood log
414 * with messages with -ESTALE errors.
/* If the open actually happened on the MDS, release the now-unwanted
 * open handle before erroring out. */
416 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
417 it_open_error(DISP_OPEN_OPEN, itp))
419 ll_release_openhandle(de, itp);
423 if (it_disposition(itp, DISP_LOOKUP_NEG))
424 GOTO(out, rc = -ENOENT);
426 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
427 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
428 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update the in-core inode from the reply and attach lock data. */
432 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
433 if (!rc && itp->d.lustre.it_lock_mode)
434 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
437 ptlrpc_req_finished(req);
438 ll_intent_drop_lock(itp);
/* Fill an obd_client_handle from the MDS reply carried by @it and
 * register it for open replay. */
443 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
444 struct obd_client_handle *och)
446 struct ptlrpc_request *req = it->d.lustre.it_data;
447 struct mdt_body *body;
449 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
450 och->och_fh = body->mbo_handle;
451 och->och_fid = body->mbo_fid1;
/* Lease handle doubles as the lock handle from the intent. */
452 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
453 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
454 och->och_flags = it->it_flags;
/* Record replay data so the open can be replayed after MDS recovery. */
456 return md_set_open_replay_data(md_exp, och, it);
/* Finish a local open: optionally fill @och from the intent reply, then
 * attach @fd to the file and initialize its readahead/lock state. */
459 static int ll_local_open(struct file *file, struct lookup_intent *it,
460 struct ll_file_data *fd, struct obd_client_handle *och)
462 struct inode *inode = file->f_path.dentry->d_inode;
465 LASSERT(!LUSTRE_FPRIVATE(file));
472 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
477 LUSTRE_FPRIVATE(file) = fd;
478 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the effective open mode for the matching close. */
479 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
481 /* ll_cl_context initialize */
482 rwlock_init(&fd->fd_lock);
483 INIT_LIST_HEAD(&fd->fd_lccs);
488 /* Open a file, and (for the very first open) create objects on the OSTs at
489 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
490 * creation or open until ll_lov_setstripe() ioctl is called.
492 * If we already have the stripe MD locally then we don't request it in
493 * md_open(), by passing a lmm_size = 0.
495 * It is up to the application to ensure no other processes open this file
496 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
497 * used. We might be able to avoid races of that sort by getting lli_open_sem
498 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
499 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
501 int ll_file_open(struct inode *inode, struct file *file)
503 struct ll_inode_info *lli = ll_i2info(inode);
504 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
505 .it_flags = file->f_flags };
506 struct obd_client_handle **och_p = NULL;
507 __u64 *och_usecount = NULL;
508 struct ll_file_data *fd;
512 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
513 PFID(ll_inode2fid(inode)), inode, file->f_flags);
515 it = file->private_data; /* XXX: compat macro */
516 file->private_data = NULL; /* prevent ll_local_open assertion */
518 fd = ll_file_data_get();
520 GOTO(out_openerr, rc = -ENOMEM);
523 if (S_ISDIR(inode->i_mode))
524 ll_authorize_statahead(inode, fd);
/* Root dentry needs no MDS open handle. */
526 if (inode->i_sb->s_root == file->f_path.dentry) {
527 LUSTRE_FPRIVATE(file) = fd;
/* No prior intent (e.g. NFS export path): build one from f_flags. */
531 if (!it || !it->d.lustre.it_disposition) {
532 /* Convert f_flags into access mode. We cannot use file->f_mode,
533 * because everything but O_ACCMODE mask was stripped from
535 if ((oit.it_flags + 1) & O_ACCMODE)
537 if (file->f_flags & O_TRUNC)
538 oit.it_flags |= FMODE_WRITE;
540 /* kernel only call f_op->open in dentry_open. filp_open calls
541 * dentry_open after call to open_namei that checks permissions.
542 * Only nfsd_open call dentry_open directly without checking
543 * permissions and because of that this code below is safe. */
544 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
545 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
547 /* We do not want O_EXCL here, presumably we opened the file
548 * already? XXX - NFS implications? */
549 oit.it_flags &= ~O_EXCL;
551 /* bug20584, if "it_flags" contains O_CREAT, the file will be
552 * created if necessary, then "IT_CREAT" should be set to keep
553 * consistent with it */
554 if (oit.it_flags & O_CREAT)
555 oit.it_op |= IT_CREAT;
561 /* Let's see if we have file open on MDS already. */
562 if (it->it_flags & FMODE_WRITE) {
563 och_p = &lli->lli_mds_write_och;
564 och_usecount = &lli->lli_open_fd_write_count;
565 } else if (it->it_flags & FMODE_EXEC) {
566 och_p = &lli->lli_mds_exec_och;
567 och_usecount = &lli->lli_open_fd_exec_count;
569 och_p = &lli->lli_mds_read_och;
570 och_usecount = &lli->lli_open_fd_read_count;
573 mutex_lock(&lli->lli_och_mutex);
574 if (*och_p) { /* Open handle is present */
575 if (it_disposition(it, DISP_OPEN_OPEN)) {
576 /* Well, there's extra open request that we do not need,
577 let's close it somehow. This will decref request. */
578 rc = it_open_error(DISP_OPEN_OPEN, it);
580 mutex_unlock(&lli->lli_och_mutex);
581 GOTO(out_openerr, rc);
584 ll_release_openhandle(file->f_path.dentry, it);
/* Reuse the existing MDS handle; och == NULL means "shared". */
588 rc = ll_local_open(file, it, fd, NULL);
591 mutex_unlock(&lli->lli_och_mutex);
592 GOTO(out_openerr, rc);
595 LASSERT(*och_usecount == 0);
596 if (!it->d.lustre.it_disposition) {
597 /* We cannot just request lock handle now, new ELC code
598 means that one of other OPEN locks for this file
599 could be cancelled, and since blocking ast handler
600 would attempt to grab och_mutex as well, that would
601 result in a deadlock */
602 mutex_unlock(&lli->lli_och_mutex);
604 * Normally called under two situations:
606 * 2. A race/condition on MDS resulting in no open
607 * handle to be returned from LOOKUP|OPEN request,
608 * for example if the target entry was a symlink.
610 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
612 * Always specify MDS_OPEN_BY_FID because we don't want
613 * to get file with different fid.
615 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
616 rc = ll_intent_file_open(file, NULL, 0, it);
618 GOTO(out_openerr, rc);
622 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
624 GOTO(out_och_free, rc = -ENOMEM);
628 /* md_intent_lock() didn't get a request ref if there was an
629 * open error, so don't do cleanup on the request here
631 /* XXX (green): Should not we bail out on any error here, not
632 * just open error? */
633 rc = it_open_error(DISP_OPEN_OPEN, it);
635 GOTO(out_och_free, rc);
637 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
638 "inode %p: disposition %x, status %d\n", inode,
639 it_disposition(it, ~0), it->d.lustre.it_status);
641 rc = ll_local_open(file, it, fd, *och_p);
643 GOTO(out_och_free, rc);
645 mutex_unlock(&lli->lli_och_mutex);
648 /* Must do this outside lli_och_mutex lock to prevent deadlock where
649 different kind of OPEN lock for this same inode gets cancelled
650 by ldlm_cancel_lru */
651 if (!S_ISREG(inode->i_mode))
652 GOTO(out_och_free, rc);
654 cl_lov_delay_create_clear(&file->f_flags);
655 GOTO(out_och_free, rc);
/* Error/common exit: free the handle slot, undo statahead, drop fd. */
659 if (och_p && *och_p) {
660 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
661 *och_p = NULL; /* OBD_FREE writes some magic there */
664 mutex_unlock(&lli->lli_och_mutex);
667 if (lli->lli_opendir_key == fd)
668 ll_deauthorize_statahead(inode, fd);
670 ll_file_data_put(fd);
672 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra intent request reference taken for the open. */
675 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
676 ptlrpc_req_finished(it->d.lustre.it_data);
677 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: cancel the lock asynchronously when a
 * conflicting request arrives.  The CANCELING branch body is elided in
 * this excerpt. */
683 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
684 struct ldlm_lock_desc *desc, void *data, int flag)
687 struct lustre_handle lockh;
691 case LDLM_CB_BLOCKING:
692 ldlm_lock2handle(lock, &lockh);
693 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
695 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
699 case LDLM_CB_CANCELING:
707 * Acquire a lease and open the file.
709 static struct obd_client_handle *
710 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
713 struct lookup_intent it = { .it_op = IT_OPEN };
714 struct ll_sb_info *sbi = ll_i2sbi(inode);
715 struct md_op_data *op_data;
716 struct ptlrpc_request *req = NULL;
717 struct lustre_handle old_handle = { 0 };
718 struct obd_client_handle *och = NULL;
/* Leases are only meaningful for pure read or pure write opens. */
723 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
724 RETURN(ERR_PTR(-EINVAL));
727 struct ll_inode_info *lli = ll_i2info(inode);
728 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
729 struct obd_client_handle **och_p;
/* The requested lease mode must match how the file was opened. */
732 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
733 RETURN(ERR_PTR(-EPERM));
735 /* Get the openhandle of the file */
737 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor. */
738 if (fd->fd_lease_och != NULL) {
739 mutex_unlock(&lli->lli_och_mutex);
743 if (fd->fd_och == NULL) {
744 if (file->f_mode & FMODE_WRITE) {
745 LASSERT(lli->lli_mds_write_och != NULL);
746 och_p = &lli->lli_mds_write_och;
747 och_usecount = &lli->lli_open_fd_write_count;
749 LASSERT(lli->lli_mds_read_och != NULL);
750 och_p = &lli->lli_mds_read_och;
751 och_usecount = &lli->lli_open_fd_read_count;
/* A lease can only be taken over when this fd is the sole opener. */
753 if (*och_usecount == 1) {
760 mutex_unlock(&lli->lli_och_mutex);
761 if (rc < 0) /* more than 1 opener */
764 LASSERT(fd->fd_och != NULL);
765 old_handle = fd->fd_och->och_fh;
770 RETURN(ERR_PTR(-ENOMEM));
772 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
773 LUSTRE_OPC_ANY, NULL);
775 GOTO(out, rc = PTR_ERR(op_data));
777 /* To tell the MDT this openhandle is from the same owner */
778 op_data->op_handle = old_handle;
780 it.it_flags = fmode | open_flags;
781 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
782 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
783 &ll_md_blocking_lease_ast,
784 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
785 * it can be cancelled which may mislead applications that the lease is
787 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
788 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
789 * doesn't deal with openhandle, so normal openhandle will be leaked. */
790 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
791 ll_finish_md_op_data(op_data);
792 ptlrpc_req_finished(req);
794 GOTO(out_release_it, rc);
796 if (it_disposition(&it, DISP_LOOKUP_NEG))
797 GOTO(out_release_it, rc = -ENOENT);
799 rc = it_open_error(DISP_OPEN_OPEN, &it);
801 GOTO(out_release_it, rc);
803 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
804 ll_och_fill(sbi->ll_md_exp, &it, och);
806 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
807 GOTO(out_close, rc = -EOPNOTSUPP);
809 /* already get lease, handle lease lock */
810 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
811 if (it.d.lustre.it_lock_mode == 0 ||
812 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
813 /* open lock must return for lease */
814 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
815 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
816 it.d.lustre.it_lock_bits);
817 GOTO(out_close, rc = -EPROTO);
820 ll_intent_release(&it);
/* Error unwind: cancel the open lock and close the handle. */
824 /* Cancel open lock */
825 if (it.d.lustre.it_lock_mode != 0) {
826 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
827 it.d.lustre.it_lock_mode);
828 it.d.lustre.it_lock_mode = 0;
829 och->och_lease_handle.cookie = 0ULL;
831 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, och, inode, 0, NULL);
833 CERROR("%s: error closing file "DFID": %d\n",
834 ll_get_fsname(inode->i_sb, NULL, 0),
835 PFID(&ll_i2info(inode)->lli_fid), rc2);
836 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
838 ll_intent_release(&it);
846 * Check whether a layout swap can be done between two inodes.
848 * \param[in] inode1 First inode to check
849 * \param[in] inode2 Second inode to check
851 * \retval 0 on success, layout swap can be performed between both inodes
852 * \retval negative error code if requirements are not met
854 static int ll_check_swap_layouts_validity(struct inode *inode1,
855 struct inode *inode2)
/* Both must be regular files, writable by the caller, and on the same
 * filesystem.  (Return statements are elided in this excerpt.) */
857 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
860 if (inode_permission(inode1, MAY_WRITE) ||
861 inode_permission(inode2, MAY_WRITE))
864 if (inode1->i_sb != inode2->i_sb)
/* Close @och with a MDS_CLOSE_LAYOUT_SWAP bias so the MDT swaps the
 * layouts of @inode and @inode2 atomically with the close. */
870 static int ll_swap_layouts_close(struct obd_client_handle *och,
871 struct inode *inode, struct inode *inode2)
873 const struct lu_fid *fid1 = ll_inode2fid(inode);
874 const struct lu_fid *fid2;
878 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
879 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
881 rc = ll_check_swap_layouts_validity(inode, inode2);
883 GOTO(out_free_och, rc);
885 /* We now know that inode2 is a lustre inode */
886 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself is meaningless. */
888 rc = lu_fid_cmp(fid1, fid2);
890 GOTO(out_free_och, rc = -EINVAL);
892 /* Close the file and swap layouts between inode & inode2.
893 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
894 * because we still need it to pack l_remote_handle to MDT. */
895 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
896 MDS_CLOSE_LAYOUT_SWAP, inode2);
898 och = NULL; /* freed in ll_close_inode_openhandle() */
908 * Release lease and close the file.
909 * It will check if the lease has ever broken.
911 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
914 struct ldlm_lock *lock;
915 bool cancelled = true;
/* Inspect the lease lock's state to learn whether it was already
 * broken (cancelled) by a conflicting access. */
919 lock = ldlm_handle2lock(&och->och_lease_handle);
921 lock_res_and_lock(lock);
922 cancelled = ldlm_is_cancel(lock);
923 unlock_res_and_lock(lock);
927 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
928 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* If still intact, cancel the lease lock ourselves before closing. */
931 ldlm_cli_cancel(&och->och_lease_handle, 0);
932 if (lease_broken != NULL)
933 *lease_broken = cancelled;
935 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
/* Merge MDS-provided timestamps (cached in lli) with OST object
 * attributes (size, blocks, times) into the in-core inode, keeping the
 * most recent timestamp of each kind.  Serialized by the inode size lock. */
941 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
943 struct ll_inode_info *lli = ll_i2info(inode);
944 struct cl_object *obj = lli->lli_clob;
945 struct cl_attr *attr = vvp_env_thread_attr(env);
953 ll_inode_size_lock(inode);
955 /* merge timestamps the most recently obtained from mds with
956 timestamps obtained from osts */
957 LTIME_S(inode->i_atime) = lli->lli_atime;
958 LTIME_S(inode->i_mtime) = lli->lli_mtime;
959 LTIME_S(inode->i_ctime) = lli->lli_ctime;
961 atime = LTIME_S(inode->i_atime);
962 mtime = LTIME_S(inode->i_mtime);
963 ctime = LTIME_S(inode->i_ctime);
/* Fetch the aggregated OST attributes under the cl_object attr lock. */
965 cl_object_attr_lock(obj);
966 rc = cl_object_attr_get(env, obj, attr);
967 cl_object_attr_unlock(obj);
970 GOTO(out_size_unlock, rc);
/* Keep the newer of MDS vs OST timestamps for each field. */
972 if (atime < attr->cat_atime)
973 atime = attr->cat_atime;
975 if (ctime < attr->cat_ctime)
976 ctime = attr->cat_ctime;
978 if (mtime < attr->cat_mtime)
979 mtime = attr->cat_mtime;
981 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
982 PFID(&lli->lli_fid), attr->cat_size);
984 i_size_write(inode, attr->cat_size);
985 inode->i_blocks = attr->cat_blocks;
987 LTIME_S(inode->i_atime) = atime;
988 LTIME_S(inode->i_mtime) = mtime;
989 LTIME_S(inode->i_ctime) = ctime;
992 ll_inode_size_unlock(inode);
/* Decide whether atime updates should be skipped for @file, mirroring
 * the kernel's own checks in file_accessed()/touch_atime(): per-open
 * flag, per-inode flags, and per-mount/per-sb options.  (The "return
 * true/false" lines are elided in this excerpt.) */
997 static bool file_is_noatime(const struct file *file)
999 const struct vfsmount *mnt = file->f_path.mnt;
1000 const struct inode *inode = file->f_path.dentry->d_inode;
1002 /* Adapted from file_accessed() and touch_atime().*/
1003 if (file->f_flags & O_NOATIME)
1006 if (inode->i_flags & S_NOATIME)
1009 if (IS_NOATIME(inode))
1012 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1015 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1018 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io from the file's open flags: nonblocking/append/sync
 * behavior, lock-request policy, and noatime handling. */
1024 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1026 struct inode *inode = file->f_path.dentry->d_inode;
1028 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1030 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1031 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1032 file->f_flags & O_DIRECT ||
1035 io->ci_obj = ll_i2info(inode)->lli_clob;
/* Default: let the cl_io layer decide whether a lock is needed. */
1036 io->ci_lockreq = CILR_MAYBE;
1037 if (ll_file_nolock(file)) {
1038 io->ci_lockreq = CILR_NEVER;
1039 io->ci_no_srvlock = 1;
1040 } else if (file->f_flags & O_APPEND) {
/* Appends must be serialized: always take the lock. */
1041 io->ci_lockreq = CILR_MANDATORY;
1044 io->ci_noatime = file_is_noatime(file);
/* Common read/write engine: set up a cl_io for @iot, take the per-file
 * range lock where needed (writes, and O_DIRECT reads — see LU-6227),
 * run the IO loop, handle restarts, and tally statistics. */
1048 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1049 struct file *file, enum cl_io_type iot,
1050 loff_t *ppos, size_t count)
1052 struct vvp_io *vio = vvp_env_io(env);
1053 struct inode *inode = file->f_path.dentry->d_inode;
1054 struct ll_inode_info *lli = ll_i2info(inode);
1055 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1059 struct range_lock range;
1063 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1064 file->f_path.dentry->d_name.name, iot, *ppos, count);
1067 io = vvp_env_thread_io(env);
1068 ll_io_init(io, file, iot == CIT_WRITE);
1070 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1071 bool range_locked = false;
/* O_APPEND position is decided late, so lock to EOF. */
1073 if (file->f_flags & O_APPEND)
1074 range_lock_init(&range, 0, LUSTRE_EOF);
1076 range_lock_init(&range, *ppos, *ppos + count - 1);
1078 vio->vui_fd = LUSTRE_FPRIVATE(file);
1079 vio->vui_io_subtype = args->via_io_subtype;
1081 switch (vio->vui_io_subtype) {
1083 vio->vui_iov = args->u.normal.via_iov;
1084 vio->vui_nrsegs = args->u.normal.via_nrsegs;
1085 vio->vui_tot_nrsegs = vio->vui_nrsegs;
1086 vio->vui_iocb = args->u.normal.via_iocb;
1087 /* Direct IO reads must also take range lock,
1088 * or multiple reads will try to work on the same pages
1089 * See LU-6227 for details. */
1090 if (((iot == CIT_WRITE) ||
1091 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1092 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1093 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1095 rc = range_lock(&lli->lli_write_tree, &range);
1099 range_locked = true;
/* Splice subtype: pass the pipe and flags straight through. */
1103 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1104 vio->u.splice.vui_flags = args->u.splice.via_flags;
1107 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
/* Publish the IO context so page-level code can find it, run it,
 * then remove the context again. */
1111 ll_cl_add(file, env, io);
1112 rc = cl_io_loop(env, io);
1113 ll_cl_remove(file, env);
1116 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1118 range_unlock(&lli->lli_write_tree, &range);
1121 /* cl_io_rw_init() handled IO */
1125 if (io->ci_nob > 0) {
/* Account partial progress and advance the file position. */
1126 result += io->ci_nob;
1127 count -= io->ci_nob;
1128 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1130 /* prepare IO restart */
1131 if (count > 0 && args->via_io_subtype == IO_NORMAL) {
1132 args->u.normal.via_iov = vio->vui_iov;
1133 args->u.normal.via_nrsegs = vio->vui_tot_nrsegs;
1138 cl_io_fini(env, io);
/* Layout changes can require the whole IO to be retried. */
1140 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1142 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1143 file->f_path.dentry->d_name.name,
1144 iot == CIT_READ ? "read" : "write",
1145 *ppos, count, result);
1149 if (iot == CIT_READ) {
1151 ll_stats_ops_tally(ll_i2sbi(inode),
1152 LPROC_LL_READ_BYTES, result);
1153 } else if (iot == CIT_WRITE) {
1155 ll_stats_ops_tally(ll_i2sbi(inode),
1156 LPROC_LL_WRITE_BYTES, result);
1157 fd->fd_write_failed = false;
1158 } else if (rc != -ERESTARTSYS) {
/* Remember the failure so fsync/close can report it. */
1159 fd->fd_write_failed = true;
1163 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1165 return result > 0 ? result : rc;
1169 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array and compute the total byte count, truncating
 * nr_segs at the first inaccessible segment (as the kernel original does). */
1171 static int ll_file_get_iov_count(const struct iovec *iov,
1172 unsigned long *nr_segs, size_t *count)
1177 for (seg = 0; seg < *nr_segs; seg++) {
1178 const struct iovec *iv = &iov[seg];
1181 * If any segment has a negative length, or the cumulative
1182 * length ever wraps negative then return -EINVAL.
1185 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1187 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1192 cnt -= iv->iov_len; /* This segment is no good */
/* Vectored async read entry point: validate the iovec, copy it to a
 * local buffer (single-segment fast path uses the per-env iovec), and
 * dispatch to ll_file_io_generic() as CIT_READ. */
1199 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1200 unsigned long nr_segs, loff_t pos)
1203 struct vvp_io_args *args;
1204 struct iovec *local_iov;
1210 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1214 env = cl_env_get(&refcheck);
1216 RETURN(PTR_ERR(env));
/* Fast path: reuse the per-env iovec for a single segment. */
1219 local_iov = &ll_env_info(env)->lti_local_iov;
1222 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1223 if (local_iov == NULL) {
1224 cl_env_put(env, &refcheck);
/* Copy the caller's iovec: the IO engine may modify it on restart. */
1228 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1231 args = ll_env_args(env, IO_NORMAL);
1232 args->u.normal.via_iov = local_iov;
1233 args->u.normal.via_nrsegs = nr_segs;
1234 args->u.normal.via_iocb = iocb;
1236 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1237 &iocb->ki_pos, count);
1239 cl_env_put(env, &refcheck);
1242 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/* Synchronous read(): wrap the user buffer in a single-segment iovec and
 * a sync kiocb, then delegate to ll_file_aio_read(). */
1247 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1251 struct iovec iov = { .iov_base = buf, .iov_len = count };
1252 struct kiocb *kiocb;
1257 env = cl_env_get(&refcheck);
1259 RETURN(PTR_ERR(env));
1261 kiocb = &ll_env_info(env)->lti_kiocb;
1262 init_sync_kiocb(kiocb, file);
1263 kiocb->ki_pos = *ppos;
/* Kernel-version compatibility: the "remaining bytes" field was renamed. */
1264 #ifdef HAVE_KIOCB_KI_LEFT
1265 kiocb->ki_left = count;
1266 #elif defined(HAVE_KI_NBYTES)
1267 kiocb->ki_nbytes = count;
1270 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
/* Propagate the advanced position back to the caller. */
1271 *ppos = kiocb->ki_pos;
1273 cl_env_put(env, &refcheck);
1278 * Write to a file (through the page cache).
/* Vectored async write entry point; mirrors ll_file_aio_read() but
 * dispatches as CIT_WRITE. */
1281 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1282 unsigned long nr_segs, loff_t pos)
1285 struct vvp_io_args *args;
1286 struct iovec *local_iov;
1292 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1296 env = cl_env_get(&refcheck);
1298 RETURN(PTR_ERR(env));
/* Fast path: reuse the per-env iovec for a single segment. */
1301 local_iov = &ll_env_info(env)->lti_local_iov;
1304 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1305 if (local_iov == NULL) {
1306 cl_env_put(env, &refcheck);
1310 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1313 args = ll_env_args(env, IO_NORMAL);
1314 args->u.normal.via_iov = local_iov;
1315 args->u.normal.via_nrsegs = nr_segs;
1316 args->u.normal.via_iocb = iocb;
1318 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1319 &iocb->ki_pos, count);
1320 cl_env_put(env, &refcheck);
1323 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * Synchronous write entry point: single-segment iovec + sync kiocb,
 * delegated to ll_file_aio_write().  NOTE(review): some lines are
 * elided in this extract.
 */
1328 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1329 size_t count, loff_t *ppos)
1332 struct iovec iov = { .iov_base = (void __user *)buf,
1334 struct kiocb *kiocb;
1339 env = cl_env_get(&refcheck);
1341 RETURN(PTR_ERR(env));
1343 kiocb = &ll_env_info(env)->lti_kiocb;
1344 init_sync_kiocb(kiocb, file);
1345 kiocb->ki_pos = *ppos;
/* kernel-version compat: the remaining-bytes field has changed names */
1346 #ifdef HAVE_KIOCB_KI_LEFT
1347 kiocb->ki_left = count;
1348 #elif defined(HAVE_KI_NBYTES)
1349 kiocb->ki_nbytes = count;
1352 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
/* propagate the advanced file position back to the caller */
1353 *ppos = kiocb->ki_pos;
1355 cl_env_put(env, &refcheck);
/*
 * splice_read implementation: feed pagecache content into a pipe via
 * the generic cl_io read path with IO_SPLICE arguments.
 */
1360 * Send file content (through pagecache) somewhere with helper
1362 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1363 struct pipe_inode_info *pipe, size_t count,
1367 struct vvp_io_args *args;
1372 env = cl_env_get(&refcheck);
1374 RETURN(PTR_ERR(env));
1376 args = ll_env_args(env, IO_SPLICE);
1377 args->u.splice.via_pipe = pipe;
1378 args->u.splice.via_flags = flags;
1380 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1381 cl_env_put(env, &refcheck);
/*
 * Set striping info (LOV EA) on a file by re-opening it by FID with the
 * given lum attached to the open intent.  Serializes against size
 * changes via the inode size lock.  NOTE(review): error-path and return
 * lines are elided in this extract.
 */
1385 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1386 __u64 flags, struct lov_user_md *lum,
1389 struct lookup_intent oit = {
1391 .it_flags = flags | MDS_OPEN_BY_FID,
1396 ll_inode_size_lock(inode);
1397 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1399 GOTO(out_unlock, rc);
/* the open handle is only needed to carry the setstripe; close it */
1401 ll_release_openhandle(file->f_path.dentry, &oit);
1404 ll_inode_size_unlock(inode);
1405 ll_intent_release(&oit);
1406 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA for @filename from the MDS via md_getattr_name() and
 * return it (byte-swapped to host endian when needed) in *lmmp, keeping
 * the request pinned via *request so the buffer stays valid.
 * NOTE(review): several lines are elided in this extract.
 */
1411 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1412 struct lov_mds_md **lmmp, int *lmm_size,
1413 struct ptlrpc_request **request)
1415 struct ll_sb_info *sbi = ll_i2sbi(inode);
1416 struct mdt_body *body;
1417 struct lov_mds_md *lmm = NULL;
1418 struct ptlrpc_request *req = NULL;
1419 struct md_op_data *op_data;
1422 rc = ll_get_default_mdsize(sbi, &lmmsize);
1426 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1427 strlen(filename), lmmsize,
1428 LUSTRE_OPC_ANY, NULL);
1429 if (IS_ERR(op_data))
1430 RETURN(PTR_ERR(op_data));
1432 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1433 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1434 ll_finish_md_op_data(op_data);
1436 CDEBUG(D_INFO, "md_getattr_name failed "
1437 "on %s: rc %d\n", filename, rc);
1441 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1442 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1444 lmmsize = body->mbo_eadatasize;
/* no EA present means no striping to report */
1446 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1448 GOTO(out, rc = -ENODATA);
1451 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1452 LASSERT(lmm != NULL);
/* only plain V1/V3 layouts are handled here */
1454 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1455 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1456 GOTO(out, rc = -EPROTO);
1460 * This is coming from the MDS, so is probably in
1461 * little endian. We convert it to host endian before
1462 * passing it to userspace.
/* swab only needed on big-endian hosts */
1464 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1467 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1468 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1471 /* if function called for directory - we should
1472 * avoid swab not existent lsm objects */
1473 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1474 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1475 if (S_ISREG(body->mbo_mode))
1476 lustre_swab_lov_user_md_objects(
1477 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1479 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1480 lustre_swab_lov_user_md_v3(
1481 (struct lov_user_md_v3 *)lmm);
1482 if (S_ISREG(body->mbo_mode))
1483 lustre_swab_lov_user_md_objects(
1484 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1491 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: root-only; copies a lov_user_md (with one
 * trailing OST object entry) from userspace and applies it via
 * ll_lov_setstripe_ea_info().  NOTE(review): some lines are elided.
 */
1496 static int ll_lov_setea(struct inode *inode, struct file *file,
1499 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1500 struct lov_user_md *lump;
1501 int lum_size = sizeof(struct lov_user_md) +
1502 sizeof(struct lov_user_ost_data);
/* privileged operation: setting explicit objects requires admin */
1506 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1509 OBD_ALLOC_LARGE(lump, lum_size);
1513 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1514 OBD_FREE_LARGE(lump, lum_size);
1518 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1520 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to userspace via the cl_object
 * layer (cl_object_getstripe fills the user buffer directly).
 */
1524 static int ll_file_getstripe(struct inode *inode,
1525 struct lov_user_md __user *lum)
1532 env = cl_env_get(&refcheck);
1534 RETURN(PTR_ERR(env));
1536 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1537 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md, apply the
 * new striping, then refresh the layout and echo the resulting stripe
 * info back to the caller.  NOTE(review): some lines are elided.
 */
1541 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1544 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1545 struct lov_user_md *klum;
1547 __u64 flags = FMODE_WRITE;
1550 rc = ll_copy_user_md(lum, &klum);
1555 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
/* zero the user's stripe_count so getstripe returns full info below */
1559 put_user(0, &lum->lmm_stripe_count);
1561 ll_layout_refresh(inode, &gen);
1562 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1565 OBD_FREE(klum, lum_size);
/*
 * Take a group lock (gid in @arg) on the file.  At most one group lock
 * per file descriptor; lli_lock guards the fd_flags/fd_grouplock pair,
 * and the lock is acquired outside the spinlock, so a lost race is
 * detected and resolved afterwards.  NOTE(review): some lines elided.
 */
1570 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1572 struct ll_inode_info *lli = ll_i2info(inode);
1573 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1574 struct ll_grouplock grouplock;
1579 CWARN("group id for group lock must not be 0\n");
1583 if (ll_file_nolock(file))
1584 RETURN(-EOPNOTSUPP);
1586 spin_lock(&lli->lli_lock);
1587 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1588 CWARN("group lock already existed with gid %lu\n",
1589 fd->fd_grouplock.lg_gid);
1590 spin_unlock(&lli->lli_lock);
1593 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1594 spin_unlock(&lli->lli_lock);
/* may block; therefore taken with lli_lock dropped */
1596 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1597 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1601 spin_lock(&lli->lli_lock);
1602 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1603 spin_unlock(&lli->lli_lock);
1604 CERROR("another thread just won the race\n");
1605 cl_put_grouplock(&grouplock);
1609 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1610 fd->fd_grouplock = grouplock;
1611 spin_unlock(&lli->lli_lock);
1613 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock held on this file descriptor; the gid in @arg
 * must match the one it was taken with.  The fd state is cleared under
 * lli_lock; the actual lock release happens after the spinlock is
 * dropped.  NOTE(review): some lines are elided in this extract.
 */
1617 static int ll_put_grouplock(struct inode *inode, struct file *file,
1620 struct ll_inode_info *lli = ll_i2info(inode);
1621 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1622 struct ll_grouplock grouplock;
1625 spin_lock(&lli->lli_lock);
1626 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1627 spin_unlock(&lli->lli_lock);
1628 CWARN("no group lock held\n");
1632 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1634 if (fd->fd_grouplock.lg_gid != arg) {
1635 CWARN("group lock %lu doesn't match current id %lu\n",
1636 arg, fd->fd_grouplock.lg_gid);
1637 spin_unlock(&lli->lli_lock);
/* detach from the fd first, then release outside the spinlock */
1641 grouplock = fd->fd_grouplock;
1642 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1643 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1644 spin_unlock(&lli->lli_lock);
1646 cl_put_grouplock(&grouplock);
1647 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * Close the MDS open handle carried by a lookup intent (e.g. after a
 * setstripe-by-open), dropping the enqueue open reference if one is
 * held.  NOTE(review): some lines are elided in this extract.
 */
1652 * Close inode open handle
1654 * \param dentry [in] dentry which contains the inode
1655 * \param it [in,out] intent which contains open info and result
1658 * \retval <0 failure
1660 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1662 struct inode *inode = dentry->d_inode;
1663 struct obd_client_handle *och;
1669 /* Root ? Do nothing. */
1670 if (dentry->d_inode->i_sb->s_root == dentry)
1673 /* No open handle to close? Move away */
1674 if (!it_disposition(it, DISP_OPEN_OPEN))
1677 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1679 OBD_ALLOC(och, sizeof(*och));
1681 GOTO(out, rc = -ENOMEM);
1683 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1685 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1686 och, inode, 0, NULL);
1688 /* this one is in place of ll_file_open */
1689 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1690 ptlrpc_req_finished(it->d.lustre.it_data);
1691 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Perform the FIEMAP extent-mapping request against the OSTs via
 * cl_object_fiemap(), after validating the flags and making sure the
 * inode size is known.  NOTE(review): some lines are elided.
 */
1697 * Get size for inode for which FIEMAP mapping is requested.
1698 * Make the FIEMAP get_info call and returns the result.
1699 * \param fiemap kernel buffer to hold extens
1700 * \param num_bytes kernel buffer size
1702 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1708 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1711 /* Checks for fiemap flags */
1712 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do support, per fiemap convention */
1713 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1717 /* Check for FIEMAP_FLAG_SYNC */
1718 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1719 rc = filemap_fdatawrite(inode->i_mapping);
1724 env = cl_env_get(&refcheck);
1726 RETURN(PTR_ERR(env));
/* size unknown locally: glimpse it from the OSTs first */
1728 if (i_size_read(inode) == 0) {
1729 rc = ll_glimpse_size(inode);
1734 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1735 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1736 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1738 /* If filesize is 0, then there would be no objects for mapping */
1739 if (fmkey.lfik_oa.o_size == 0) {
1740 fiemap->fm_mapped_extents = 0;
1744 fmkey.lfik_fiemap = *fiemap;
1746 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1747 &fmkey, fiemap, &num_bytes);
1749 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * The user supplies the path buffer length; the result is copied back
 * in the same getinfo_fid2path structure.  NOTE(review): some lines
 * are elided in this extract.
 */
1753 int ll_fid2path(struct inode *inode, void __user *arg)
1755 struct obd_export *exp = ll_i2mdexp(inode);
1756 const struct getinfo_fid2path __user *gfin = arg;
1758 struct getinfo_fid2path *gfout;
/* restricted unless admin or the fs allows user fid2path */
1764 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1765 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1768 /* Only need to get the buflen */
1769 if (get_user(pathlen, &gfin->gf_pathlen))
1772 if (pathlen > PATH_MAX)
1775 outsize = sizeof(*gfout) + pathlen;
1776 OBD_ALLOC(gfout, outsize);
1780 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1781 GOTO(gf_free, rc = -EFAULT);
1783 /* Call mdc_iocontrol */
1784 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1788 if (copy_to_user(arg, gfout, outsize))
1792 OBD_FREE(gfout, outsize);
/*
 * Compute the file's data version through a CIT_DATA_VERSION cl_io,
 * retrying if the io layer asks for a restart.  NOTE(review): some
 * lines (including the restart goto target) are elided.
 */
1797 * Read the data_version for inode.
1799 * This value is computed using stripe object version on OST.
1800 * Version is computed using server side locking.
1802 * @param flags if do sync on the OST side;
1804 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1805 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1807 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1809 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1817 /* If no file object initialized, we consider its version is 0. */
1823 env = cl_env_get(&refcheck);
1825 RETURN(PTR_ERR(env));
1827 io = vvp_env_thread_io(env);
1829 io->u.ci_data_version.dv_data_version = 0;
1830 io->u.ci_data_version.dv_flags = flags;
1833 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1834 result = cl_io_loop(env, io);
1836 result = io->ci_result;
1838 *data_version = io->u.ci_data_version.dv_data_version;
1840 cl_io_fini(env, io);
/* layout change etc. may require re-running the whole io */
1842 if (unlikely(io->ci_need_restart))
1845 cl_env_put(env, &refcheck);
/*
 * HSM release: take a write lease, flush and record the data version,
 * merge attributes, then close with MDS_HSM_RELEASE so the MDT can
 * verify the copy is up to date before freeing OST objects.
 * NOTE(review): some lines are elided in this extract.
 */
1851 * Trigger a HSM release request for the provided inode.
1853 int ll_hsm_release(struct inode *inode)
1855 struct cl_env_nest nest;
1857 struct obd_client_handle *och = NULL;
1858 __u64 data_version = 0;
1862 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1863 ll_get_fsname(inode->i_sb, NULL, 0),
1864 PFID(&ll_i2info(inode)->lli_fid));
1866 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1868 GOTO(out, rc = PTR_ERR(och));
1870 /* Grab latest data_version and [am]time values */
1871 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1875 env = cl_env_nested_get(&nest);
1877 GOTO(out, rc = PTR_ERR(env));
1879 ll_merge_attr(env, inode);
1880 cl_env_nested_put(&nest, env);
1882 /* Release the file.
1883 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1884 * we still need it to pack l_remote_handle to MDT. */
1885 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
1886 MDS_HSM_RELEASE, &data_version);
1891 if (och != NULL && !IS_ERR(och)) /* close the file */
1892 ll_lease_close(och, inode, NULL);
/* State bundle for ll_swap_layouts(); members may be swapped in place
 * to sequentialize the two files.  NOTE(review): other members (dv1,
 * dv2, check_dv1, check_dv2) are elided in this extract. */
1897 struct ll_swap_stack {
1900 struct inode *inode1;
1901 struct inode *inode2;
/*
 * Swap the layouts of two files: order the pair by FID to avoid
 * deadlocks, optionally flush via a shared group lock, optionally
 * verify data versions, then send the swap to the MDT.
 * NOTE(review): some lines are elided in this extract.
 */
1906 static int ll_swap_layouts(struct file *file1, struct file *file2,
1907 struct lustre_swap_layouts *lsl)
1909 struct mdc_swap_layouts msl;
1910 struct md_op_data *op_data;
1913 struct ll_swap_stack *llss = NULL;
1916 OBD_ALLOC_PTR(llss);
1920 llss->inode1 = file1->f_path.dentry->d_inode;
1921 llss->inode2 = file2->f_path.dentry->d_inode;
1923 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
1927 /* we use 2 bool because it is easier to swap than 2 bits */
1928 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1929 llss->check_dv1 = true;
1931 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1932 llss->check_dv2 = true;
1934 /* we cannot use lsl->sl_dvX directly because we may swap them */
1935 llss->dv1 = lsl->sl_dv1;
1936 llss->dv2 = lsl->sl_dv2;
1938 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1939 if (rc == 0) /* same file, done! */
1942 if (rc < 0) { /* sequentialize it */
1943 swap(llss->inode1, llss->inode2);
1945 swap(llss->dv1, llss->dv2);
1946 swap(llss->check_dv1, llss->check_dv2);
1950 if (gid != 0) { /* application asks to flush dirty cache */
1951 rc = ll_get_grouplock(llss->inode1, file1, gid);
1955 rc = ll_get_grouplock(llss->inode2, file2, gid);
1957 ll_put_grouplock(llss->inode1, file1, gid);
1962 /* ultimate check, before swaping the layouts we check if
1963 * dataversion has changed (if requested) */
1964 if (llss->check_dv1) {
1965 rc = ll_data_version(llss->inode1, &dv, 0);
1968 if (dv != llss->dv1)
1969 GOTO(putgl, rc = -EAGAIN);
1972 if (llss->check_dv2) {
1973 rc = ll_data_version(llss->inode2, &dv, 0);
1976 if (dv != llss->dv2)
1977 GOTO(putgl, rc = -EAGAIN);
1980 /* struct md_op_data is used to send the swap args to the mdt
1981 * only flags is missing, so we use struct mdc_swap_layouts
1982 * through the md_op_data->op_data */
1983 /* flags from user space have to be converted before they are send to
1984 * server, no flag is sent today, they are only used on the client */
1987 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1988 0, LUSTRE_OPC_ANY, &msl);
1989 if (IS_ERR(op_data))
1990 GOTO(free, rc = PTR_ERR(op_data));
1992 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1993 sizeof(*op_data), op_data, NULL);
1994 ll_finish_md_op_data(op_data);
/* putgl: release group locks in reverse acquisition order */
2001 ll_put_grouplock(llss->inode2, file2, gid);
2002 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on a file after validating the masks and
 * archive id, then forward the request to the MDT.  NOTE(review): some
 * lines are elided in this extract.
 */
2012 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2014 struct md_op_data *op_data;
2018 /* Detect out-of range masks */
2019 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2022 /* Non-root users are forbidden to set or clear flags which are
2023 * NOT defined in HSM_USER_MASK. */
2024 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2025 !cfs_capable(CFS_CAP_SYS_ADMIN))
2028 /* Detect out-of range archive id */
2029 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2030 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2033 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2034 LUSTRE_OPC_ANY, hss);
2035 if (IS_ERR(op_data))
2036 RETURN(PTR_ERR(op_data));
2038 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2039 sizeof(*op_data), op_data, NULL);
2041 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file as archived+released on the MDT,
 * then restore the archived attributes (mode, owner, size, times)
 * through setattr.  NOTE(review): some lines are elided.
 */
2046 static int ll_hsm_import(struct inode *inode, struct file *file,
2047 struct hsm_user_import *hui)
2049 struct hsm_state_set *hss = NULL;
2050 struct iattr *attr = NULL;
2054 if (!S_ISREG(inode->i_mode))
2060 GOTO(out, rc = -ENOMEM);
2062 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2063 hss->hss_archive_id = hui->hui_archive_id;
2064 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2065 rc = ll_hsm_state_set(inode, hss);
2069 OBD_ALLOC_PTR(attr);
2071 GOTO(out, rc = -ENOMEM);
/* force-restore the imported file's attributes from the request */
2073 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2074 attr->ia_mode |= S_IFREG;
2075 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2076 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2077 attr->ia_size = hui->hui_size;
2078 attr->ia_mtime.tv_sec = hui->hui_mtime;
2079 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2080 attr->ia_atime.tv_sec = hui->hui_atime;
2081 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2083 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2084 ATTR_UID | ATTR_GID |
2085 ATTR_MTIME | ATTR_MTIME_SET |
2086 ATTR_ATIME | ATTR_ATIME_SET;
2088 mutex_lock(&inode->i_mutex);
2090 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2094 mutex_unlock(&inode->i_mutex);
/* Map an fmode_t into the LL_LEASE_{RD,WR}LCK bitmask reported to
 * userspace by the lease ioctls. */
2106 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2108 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2109 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main per-file ioctl dispatcher for the llite client: flags, striping,
 * layout swap, group locks, FID/path translation, data version, HSM
 * state and leases; unknown commands fall through to the generic
 * iocontrol hooks and the data export.  NOTE(review): many lines
 * (case braces, declarations, returns) are elided in this extract.
 */
2113 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2115 struct inode *inode = file->f_path.dentry->d_inode;
2116 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2120 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2121 PFID(ll_inode2fid(inode)), inode, cmd);
2122 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2124 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2125 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2129 case LL_IOC_GETFLAGS:
2130 /* Get the current value of the file flags */
2131 return put_user(fd->fd_flags, (int __user *)arg);
2132 case LL_IOC_SETFLAGS:
2133 case LL_IOC_CLRFLAGS:
2134 /* Set or clear specific file flags */
2135 /* XXX This probably needs checks to ensure the flags are
2136 * not abused, and to handle any flag side effects.
2138 if (get_user(flags, (int __user *) arg))
2141 if (cmd == LL_IOC_SETFLAGS) {
2142 if ((flags & LL_FILE_IGNORE_LOCK) &&
2143 !(file->f_flags & O_DIRECT)) {
2144 CERROR("%s: unable to disable locking on "
2145 "non-O_DIRECT file\n", current->comm);
2149 fd->fd_flags |= flags;
2151 fd->fd_flags &= ~flags;
2154 case LL_IOC_LOV_SETSTRIPE:
2155 RETURN(ll_lov_setstripe(inode, file, arg));
2156 case LL_IOC_LOV_SETEA:
2157 RETURN(ll_lov_setea(inode, file, arg));
2158 case LL_IOC_LOV_SWAP_LAYOUTS: {
2160 struct lustre_swap_layouts lsl;
2162 if (copy_from_user(&lsl, (char __user *)arg,
2163 sizeof(struct lustre_swap_layouts)))
/* both files must be writable for a layout swap */
2166 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2169 file2 = fget(lsl.sl_fd);
2173 /* O_WRONLY or O_RDWR */
2174 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2175 GOTO(out, rc = -EPERM);
2177 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2178 struct inode *inode2;
2179 struct ll_inode_info *lli;
2180 struct obd_client_handle *och = NULL;
2182 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2183 GOTO(out, rc = -EINVAL);
2185 lli = ll_i2info(inode);
2186 mutex_lock(&lli->lli_och_mutex);
2187 if (fd->fd_lease_och != NULL) {
2188 och = fd->fd_lease_och;
2189 fd->fd_lease_och = NULL;
2191 mutex_unlock(&lli->lli_och_mutex);
2193 GOTO(out, rc = -ENOLCK);
2194 inode2 = file2->f_path.dentry->d_inode;
2195 rc = ll_swap_layouts_close(och, inode, inode2);
2197 rc = ll_swap_layouts(file, file2, &lsl);
2203 case LL_IOC_LOV_GETSTRIPE:
2204 RETURN(ll_file_getstripe(inode,
2205 (struct lov_user_md __user *)arg));
2206 case FSFILT_IOC_GETFLAGS:
2207 case FSFILT_IOC_SETFLAGS:
2208 RETURN(ll_iocontrol(inode, file, cmd, arg));
2209 case FSFILT_IOC_GETVERSION_OLD:
2210 case FSFILT_IOC_GETVERSION:
2211 RETURN(put_user(inode->i_generation, (int __user *)arg));
2212 case LL_IOC_GROUP_LOCK:
2213 RETURN(ll_get_grouplock(inode, file, arg));
2214 case LL_IOC_GROUP_UNLOCK:
2215 RETURN(ll_put_grouplock(inode, file, arg));
2216 case IOC_OBD_STATFS:
2217 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2219 /* We need to special case any other ioctls we want to handle,
2220 * to send them to the MDS/OST as appropriate and to properly
2221 * network encode the arg field.
2222 case FSFILT_IOC_SETVERSION_OLD:
2223 case FSFILT_IOC_SETVERSION:
2225 case LL_IOC_FLUSHCTX:
2226 RETURN(ll_flush_ctx(inode));
2227 case LL_IOC_PATH2FID: {
2228 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2229 sizeof(struct lu_fid)))
2234 case LL_IOC_GETPARENT:
2235 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2237 case OBD_IOC_FID2PATH:
2238 RETURN(ll_fid2path(inode, (void __user *)arg));
2239 case LL_IOC_DATA_VERSION: {
2240 struct ioc_data_version idv;
2243 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* only the flush-mode bits are meaningful from userspace */
2246 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2247 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2250 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2256 case LL_IOC_GET_MDTIDX: {
2259 mdtidx = ll_get_mdt_idx(inode);
2263 if (put_user((int)mdtidx, (int __user *)arg))
2268 case OBD_IOC_GETDTNAME:
2269 case OBD_IOC_GETMDNAME:
2270 RETURN(ll_get_obd_name(inode, cmd, arg));
2271 case LL_IOC_HSM_STATE_GET: {
2272 struct md_op_data *op_data;
2273 struct hsm_user_state *hus;
2280 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2281 LUSTRE_OPC_ANY, hus);
2282 if (IS_ERR(op_data)) {
2284 RETURN(PTR_ERR(op_data));
2287 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2290 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2293 ll_finish_md_op_data(op_data);
2297 case LL_IOC_HSM_STATE_SET: {
2298 struct hsm_state_set *hss;
2305 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2310 rc = ll_hsm_state_set(inode, hss);
2315 case LL_IOC_HSM_ACTION: {
2316 struct md_op_data *op_data;
2317 struct hsm_current_action *hca;
2324 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2325 LUSTRE_OPC_ANY, hca);
2326 if (IS_ERR(op_data)) {
2328 RETURN(PTR_ERR(op_data));
2331 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2334 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2337 ll_finish_md_op_data(op_data);
2341 case LL_IOC_SET_LEASE: {
2342 struct ll_inode_info *lli = ll_i2info(inode);
2343 struct obd_client_handle *och = NULL;
/* requested lease mode must be compatible with the open mode */
2348 case LL_LEASE_WRLCK:
2349 if (!(file->f_mode & FMODE_WRITE))
2351 fmode = FMODE_WRITE;
2353 case LL_LEASE_RDLCK:
2354 if (!(file->f_mode & FMODE_READ))
2358 case LL_LEASE_UNLCK:
2359 mutex_lock(&lli->lli_och_mutex);
2360 if (fd->fd_lease_och != NULL) {
2361 och = fd->fd_lease_och;
2362 fd->fd_lease_och = NULL;
2364 mutex_unlock(&lli->lli_och_mutex);
2369 fmode = och->och_flags;
2370 rc = ll_lease_close(och, inode, &lease_broken);
2377 RETURN(ll_lease_type_from_fmode(fmode));
2382 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2384 /* apply for lease */
2385 och = ll_lease_open(inode, file, fmode, 0);
2387 RETURN(PTR_ERR(och));
2390 mutex_lock(&lli->lli_och_mutex);
2391 if (fd->fd_lease_och == NULL) {
2392 fd->fd_lease_och = och;
2395 mutex_unlock(&lli->lli_och_mutex);
2397 /* impossible now that only excl is supported for now */
2398 ll_lease_close(och, inode, &lease_broken);
2403 case LL_IOC_GET_LEASE: {
2404 struct ll_inode_info *lli = ll_i2info(inode);
2405 struct ldlm_lock *lock = NULL;
2408 mutex_lock(&lli->lli_och_mutex);
2409 if (fd->fd_lease_och != NULL) {
2410 struct obd_client_handle *och = fd->fd_lease_och;
2412 lock = ldlm_handle2lock(&och->och_lease_handle);
2414 lock_res_and_lock(lock);
/* a cancelled lease lock no longer grants anything */
2415 if (!ldlm_is_cancel(lock))
2416 fmode = och->och_flags;
2418 unlock_res_and_lock(lock);
2419 LDLM_LOCK_PUT(lock);
2422 mutex_unlock(&lli->lli_och_mutex);
2424 RETURN(ll_lease_type_from_fmode(fmode));
2426 case LL_IOC_HSM_IMPORT: {
2427 struct hsm_user_import *hui;
2433 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2438 rc = ll_hsm_import(inode, file, hui);
/* default: try registered iocontrol handlers, then the data export */
2448 ll_iocontrol_call(inode, file, cmd, arg, &err))
2451 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2452 (void __user *)arg));
/*
 * Compat copies of llseek_execute()/generic_file_llseek_size() for
 * kernels that do not export them; SEEK_END/SEEK_HOLE/SEEK_DATA are
 * handled against the supplied eof.  NOTE(review): some lines are
 * elided in this extract.
 */
2457 #ifndef HAVE_FILE_LLSEEK_SIZE
2458 static inline loff_t
2459 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2461 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2463 if (offset > maxsize)
2466 if (offset != file->f_pos) {
2467 file->f_pos = offset;
2468 file->f_version = 0;
2474 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2475 loff_t maxsize, loff_t eof)
2477 struct inode *inode = file->f_path.dentry->d_inode;
2485 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2486 * position-querying operation. Avoid rewriting the "same"
2487 * f_pos value back to the file because a concurrent read(),
2488 * write() or lseek() might have altered it
2493 * f_lock protects against read/modify/write race with other
2494 * SEEK_CURs. Note that parallel writes and reads behave
2497 mutex_lock(&inode->i_mutex);
2498 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2499 mutex_unlock(&inode->i_mutex);
2503 * In the generic case the entire file is data, so as long as
2504 * offset isn't at the end of the file then the offset is data.
2511 * There is a virtual hole at the end of the file, so as long as
2512 * offset isn't i_size or larger, return i_size.
2520 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA the cluster-wide
 * file size must be glimpsed from the OSTs before delegating to the
 * generic llseek helper.  NOTE(review): some lines are elided.
 */
2524 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2526 struct inode *inode = file->f_path.dentry->d_inode;
2527 loff_t retval, eof = 0;
2530 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2531 (origin == SEEK_CUR) ? file->f_pos : 0);
2532 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2533 PFID(ll_inode2fid(inode)), inode, retval, retval,
2535 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2537 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2538 retval = ll_glimpse_size(inode);
2541 eof = i_size_read(inode);
2544 retval = ll_generic_file_llseek_size(file, offset, origin,
2545 ll_file_maxbytes(inode), eof);
/*
 * flush (close-time) handler: report any async writeback errors that
 * were recorded for this mapping, but avoid reporting the same failure
 * twice to the application.  NOTE(review): some lines are elided.
 */
2549 static int ll_flush(struct file *file, fl_owner_t id)
2551 struct inode *inode = file->f_path.dentry->d_inode;
2552 struct ll_inode_info *lli = ll_i2info(inode);
2553 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2556 LASSERT(!S_ISDIR(inode->i_mode));
2558 /* catch async errors that were recorded back when async writeback
2559 * failed for pages in this mapping. */
2560 rc = lli->lli_async_rc;
2561 lli->lli_async_rc = 0;
2562 if (lli->lli_clob != NULL) {
2563 err = lov_read_and_clear_async_rc(lli->lli_clob);
2568 /* The application has been told write failure already.
2569 * Do not report failure again. */
2570 if (fd->fd_write_failed)
2572 return rc ? -EIO : 0;
/*
 * Run a CIT_FSYNC cl_io over [start, end] with the given mode; on
 * success the return value is the number of pages written.
 * NOTE(review): some lines are elided in this extract.
 */
2576 * Called to make sure a portion of file has been written out.
2577 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2579 * Return how many pages have been written.
2581 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2582 enum cl_fsync_mode mode, int ignore_layout)
2584 struct cl_env_nest nest;
2587 struct cl_fsync_io *fio;
2591 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2592 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2595 env = cl_env_nested_get(&nest);
2597 RETURN(PTR_ERR(env));
2599 io = vvp_env_thread_io(env);
2600 io->ci_obj = ll_i2info(inode)->lli_clob;
2601 io->ci_ignore_layout = ignore_layout;
2603 /* initialize parameters for sync */
2604 fio = &io->u.ci_fsync;
2605 fio->fi_start = start;
2607 fio->fi_fid = ll_inode2fid(inode);
2608 fio->fi_mode = mode;
2609 fio->fi_nr_written = 0;
2611 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2612 result = cl_io_loop(env, io);
2614 result = io->ci_result;
/* on success, surface the page count gathered by the fsync io */
2616 result = fio->fi_nr_written;
2617 cl_io_fini(env, io);
2618 cl_env_nested_put(&nest, env);
/*
 * fsync handler (three kernel-API variants selected by configure):
 * wait for in-flight writeback, surface recorded async errors, sync
 * metadata via md_fsync() and data via cl_sync_file_range(), tracking
 * write failure state on the fd.  NOTE(review): some lines are elided.
 */
2624 * When dentry is provided (the 'else' case), *file->f_path.dentry may be
2625 * null and dentry must be used directly rather than pulled from
2626 * *file->f_path.dentry as is done otherwise.
2629 #ifdef HAVE_FILE_FSYNC_4ARGS
2630 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2632 struct dentry *dentry = file->f_path.dentry;
2633 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2634 int ll_fsync(struct file *file, int datasync)
2636 struct dentry *dentry = file->f_path.dentry;
2638 loff_t end = LLONG_MAX;
2640 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2643 loff_t end = LLONG_MAX;
2645 struct inode *inode = dentry->d_inode;
2646 struct ll_inode_info *lli = ll_i2info(inode);
2647 struct ptlrpc_request *req;
2651 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2652 PFID(ll_inode2fid(inode)), inode);
2653 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2655 #ifdef HAVE_FILE_FSYNC_4ARGS
2656 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2657 mutex_lock(&inode->i_mutex);
2659 /* fsync's caller has already called _fdata{sync,write}, we want
2660 * that IO to finish before calling the osc and mdc sync methods */
2661 rc = filemap_fdatawait(inode->i_mapping);
2664 /* catch async errors that were recorded back when async writeback
2665 * failed for pages in this mapping. */
2666 if (!S_ISDIR(inode->i_mode)) {
2667 err = lli->lli_async_rc;
2668 lli->lli_async_rc = 0;
2671 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* metadata sync to the MDT */
2676 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2680 ptlrpc_req_finished(req);
2682 if (S_ISREG(inode->i_mode)) {
2683 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2685 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2686 if (rc == 0 && err < 0)
2689 fd->fd_write_failed = true;
2691 fd->fd_write_failed = false;
2694 #ifdef HAVE_FILE_FSYNC_4ARGS
2695 mutex_unlock(&inode->i_mutex);
/*
 * flock/POSIX lock handler: translate the VFS file_lock into an LDLM
 * flock enqueue against the MDT, then mirror the result into the local
 * lock tables; a failed local update triggers a compensating LCK_NL
 * unlock on the server.  NOTE(review): many lines (case labels, braces,
 * returns) are elided in this extract.
 */
2701 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2703 struct inode *inode = file->f_path.dentry->d_inode;
2704 struct ll_sb_info *sbi = ll_i2sbi(inode);
2705 struct ldlm_enqueue_info einfo = {
2706 .ei_type = LDLM_FLOCK,
2707 .ei_cb_cp = ldlm_flock_completion_ast,
2708 .ei_cbdata = file_lock,
2710 struct md_op_data *op_data;
2711 struct lustre_handle lockh = {0};
2712 ldlm_policy_data_t flock = {{0}};
2713 int fl_type = file_lock->fl_type;
2719 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2720 PFID(ll_inode2fid(inode)), file_lock);
2722 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2724 if (file_lock->fl_flags & FL_FLOCK) {
2725 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2726 /* flocks are whole-file locks */
2727 flock.l_flock.end = OFFSET_MAX;
2728 /* For flocks owner is determined by the local file desctiptor*/
2729 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2730 } else if (file_lock->fl_flags & FL_POSIX) {
2731 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2732 flock.l_flock.start = file_lock->fl_start;
2733 flock.l_flock.end = file_lock->fl_end;
2737 flock.l_flock.pid = file_lock->fl_pid;
2739 /* Somewhat ugly workaround for svc lockd.
2740 * lockd installs custom fl_lmops->lm_compare_owner that checks
2741 * for the fl_owner to be the same (which it always is on local node
2742 * I guess between lockd processes) and then compares pid.
2743 * As such we assign pid to the owner field to make it all work,
2744 * conflict with normal locks is unlikely since pid space and
2745 * pointer space for current->files are not intersecting */
2746 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2747 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock type to an LDLM mode */
2751 einfo.ei_mode = LCK_PR;
2754 /* An unlock request may or may not have any relation to
2755 * existing locks so we may not be able to pass a lock handle
2756 * via a normal ldlm_lock_cancel() request. The request may even
2757 * unlock a byte range in the middle of an existing lock. In
2758 * order to process an unlock request we need all of the same
2759 * information that is given with a normal read or write record
2760 * lock request. To avoid creating another ldlm unlock (cancel)
2761 * message we'll treat a LCK_NL flock request as an unlock. */
2762 einfo.ei_mode = LCK_NL;
2765 einfo.ei_mode = LCK_PW;
2768 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
2783 flags = LDLM_FL_BLOCK_NOWAIT;
2789 flags = LDLM_FL_TEST_LOCK;
2792 CERROR("unknown fcntl lock command: %d\n", cmd);
2796 /* Save the old mode so that if the mode in the lock changes we
2797 * can decrement the appropriate reader or writer refcount. */
2798 file_lock->fl_type = einfo.ei_mode;
2800 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2801 LUSTRE_OPC_ANY, NULL);
2802 if (IS_ERR(op_data))
2803 RETURN(PTR_ERR(op_data));
2805 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2806 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2807 flock.l_flock.pid, flags, einfo.ei_mode,
2808 flock.l_flock.start, flock.l_flock.end);
2810 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2813 /* Restore the file lock type if not TEST lock. */
2814 if (!(flags & LDLM_FL_TEST_LOCK))
2815 file_lock->fl_type = fl_type;
/* mirror the server-side result into the local VFS lock tables */
2817 if ((file_lock->fl_flags & FL_FLOCK) &&
2818 (rc == 0 || file_lock->fl_type == F_UNLCK))
2819 rc2 = flock_lock_file_wait(file, file_lock);
2820 if ((file_lock->fl_flags & FL_POSIX) &&
2821 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2822 !(flags & LDLM_FL_TEST_LOCK))
2823 rc2 = posix_lock_file_wait(file, file_lock);
2825 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* local bookkeeping failed: undo the server lock with LCK_NL */
2826 einfo.ei_mode = LCK_NL;
2827 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2832 ll_finish_md_op_data(op_data);
/* Look up the FID of a child entry by name via a getattr-by-name RPC
 * to the MDS (used when no local dentry/inode is available).
 *
 * \param parent  parent directory inode
 * \param name    child entry name (need not be NUL-terminated)
 * \param namelen length of \a name
 * \param fid     [OUT] FID of the child on success
 *
 * \retval 0 on success, negative errno on failure
 */
2837 int ll_get_fid_by_name(struct inode *parent, const char *name,
2838 int namelen, struct lu_fid *fid)
2840 struct md_op_data *op_data = NULL;
2841 struct mdt_body *body;
2842 struct ptlrpc_request *req;
2846 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2847 LUSTRE_OPC_ANY, NULL);
2848 if (IS_ERR(op_data))
2849 RETURN(PTR_ERR(op_data));
/* only the FID is needed from the reply */
2851 op_data->op_valid = OBD_MD_FLID;
2852 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2853 ll_finish_md_op_data(op_data);
2857 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2859 GOTO(out_req, rc = -EFAULT);
2861 *fid = body->mbo_fid1;
2863 ptlrpc_req_finished(req);
/* Migrate the named child of \a parent to MDT \a mdtidx by issuing a
 * same-name rename with CLI_MIGRATE set.  The child FID is resolved
 * from the dcache if possible, otherwise via ll_get_fid_by_name();
 * local aliases of the child are invalidated so stale metadata is not
 * used after the move.
 *
 * \param parent  parent directory inode
 * \param file    open file for the parent (used for the dcache lookup)
 * \param mdtidx  target MDT index
 * \param name    entry name to migrate
 * \param namelen length of \a name
 *
 * \retval 0 on success (including already on the target MDT),
 *         negative errno on failure
 */
2867 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
2868 const char *name, int namelen)
2870 struct dentry *dchild = NULL;
2871 struct inode *child_inode = NULL;
2872 struct md_op_data *op_data;
2873 struct ptlrpc_request *request = NULL;
2878 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
2879 name, PFID(ll_inode2fid(parent)), mdtidx);
2881 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
2882 0, LUSTRE_OPC_ANY, NULL);
2883 if (IS_ERR(op_data))
2884 RETURN(PTR_ERR(op_data));
2886 /* Get child FID first */
2887 qstr.hash = full_name_hash(name, namelen);
2890 dchild = d_lookup(file->f_path.dentry, &qstr);
2891 if (dchild != NULL) {
2892 if (dchild->d_inode != NULL) {
/* hold the child inode and serialize against other users while
 * the entry is moved between MDTs */
2893 child_inode = igrab(dchild->d_inode);
2894 if (child_inode != NULL) {
2895 mutex_lock(&child_inode->i_mutex);
2896 op_data->op_fid3 = *ll_inode2fid(child_inode);
2897 ll_invalidate_aliases(child_inode);
/* no cached inode: ask the MDS for the FID by name */
2902 rc = ll_get_fid_by_name(parent, name, namelen,
2908 if (!fid_is_sane(&op_data->op_fid3)) {
2909 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
2910 ll_get_fsname(parent->i_sb, NULL, 0), name,
2911 PFID(&op_data->op_fid3));
2912 GOTO(out_free, rc = -EINVAL);
2915 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* nothing to do if the entry already lives on the target MDT */
2920 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
2921 PFID(&op_data->op_fid3), mdtidx);
2922 GOTO(out_free, rc = 0);
/* a rename onto itself with CLI_MIGRATE performs the actual move */
2925 op_data->op_mds = mdtidx;
2926 op_data->op_cli_flags = CLI_MIGRATE;
2927 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
2928 namelen, name, namelen, &request);
2930 ll_update_times(request, parent);
2932 ptlrpc_req_finished(request);
2937 if (child_inode != NULL) {
/* old inode is stale after migration; drop its link count and let
 * the next lookup fetch the new one */
2938 clear_nlink(child_inode);
2939 mutex_unlock(&child_inode->i_mutex);
2943 ll_finish_md_op_data(op_data);
/* Lock handler installed for "-o noflock" mounts: rejects all flock/
 * fcntl lock requests (see ll_file_operations_noflock below). */
2948 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2956 * test if some locks matching bits and l_req_mode are acquired
2957 * - bits can be in different locks
2958 * - if found clear the common lock bits in *bits
2959 * - the bits not found, are kept in *bits
2961 * \param bits [IN] searched lock bits
2962 * \param l_req_mode [IN] searched lock mode
2963 * \retval boolean, true iff all bits are found
2965 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2967 struct lustre_handle lockh;
2968 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all regular modes */
2969 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2970 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2979 fid = &ll_i2info(inode)->lli_fid;
2980 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2981 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, do not take references on matched locks */
2983 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested inodebit separately; bits may be covered by
 * different locks, so clear from *bits whatever each match covers */
2984 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2985 policy.l_inodebits.bits = *bits & (1 << i);
2986 if (policy.l_inodebits.bits == 0)
2989 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2990 &policy, mode, &lockh)) {
2991 struct ldlm_lock *lock;
2993 lock = ldlm_handle2lock(&lockh);
2996 ~(lock->l_policy_data.l_inodebits.bits);
2997 LDLM_LOCK_PUT(lock);
2999 *bits &= ~policy.l_inodebits.bits;
/* Match (and take a reference on) a granted MD lock covering \a bits
 * on \a inode.  Unlike ll_have_md_lock() this is not a TEST match: on
 * success the caller owns a reference via \a lockh and must release it.
 *
 * \retval the matched lock mode, or 0 if no lock was found
 */
3006 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3007 struct lustre_handle *lockh, __u64 flags,
3010 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3015 fid = &ll_i2info(inode)->lli_fid;
3016 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3018 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3019 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process the result of a revalidate RPC: -ENOENT on a file that
 * was unlinked remotely is handled locally rather than reported as a
 * hard error; other failures are logged. */
3024 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3026 /* Already unlinked. Just update nlink and return success */
3027 if (rc == -ENOENT) {
3029 /* This path cannot be hit for regular files unless in
3030 * case of obscure races, so no need to validate
3032 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3034 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected permission-style failures: log them
 * at INFO; anything else is an error */
3035 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3036 "%s: revalidate FID "DFID" error: rc = %d\n",
3037 ll_get_fsname(inode->i_sb, NULL, 0),
3038 PFID(ll_inode2fid(inode)), rc);
/* Revalidate the MD attributes of \a dentry's inode against the MDS,
 * taking (or re-using) the ibits locks named in \a ibits.  Two paths:
 * with OBD_CONNECT_ATTRFID an intent getattr-by-FID is used; otherwise
 * a plain md_getattr() is issued unless a matching MD lock is already
 * cached locally.
 *
 * \retval 0 on success, negative errno on failure
 */
3044 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3046 struct inode *inode = dentry->d_inode;
3047 struct ptlrpc_request *req = NULL;
3048 struct obd_export *exp;
3052 LASSERT(inode != NULL);
3054 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3055 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3057 exp = ll_i2mdexp(inode);
3059 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3060 * But under CMD case, it caused some lock issues, should be fixed
3061 * with new CMD ibits lock. See bug 12718 */
3062 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3063 struct lookup_intent oit = { .it_op = IT_GETATTR };
3064 struct md_op_data *op_data;
/* a pure LOOKUP-bit request only needs an IT_LOOKUP intent */
3066 if (ibits == MDS_INODELOCK_LOOKUP)
3067 oit.it_op = IT_LOOKUP;
3069 /* Call getattr by fid, so do not provide name at all. */
3070 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3071 dentry->d_inode, NULL, 0, 0,
3072 LUSTRE_OPC_ANY, NULL);
3073 if (IS_ERR(op_data))
3074 RETURN(PTR_ERR(op_data));
3076 rc = md_intent_lock(exp, op_data, &oit, &req,
3077 &ll_md_blocking_ast, 0);
3078 ll_finish_md_op_data(op_data);
3080 rc = ll_inode_revalidate_fini(inode, rc);
3084 rc = ll_revalidate_it_finish(req, &oit, dentry);
3086 ll_intent_release(&oit);
3090 /* Unlinked? Unhash dentry, so it is not picked up later by
3091 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3092 here to preserve get_cwd functionality on 2.6.
3094 if (!dentry->d_inode->i_nlink)
3095 d_lustre_invalidate(dentry, 0);
3097 ll_lookup_finish_locks(&oit, dentry);
/* no ATTRFID support: only issue an RPC if no cached MD lock
 * already covers the requested ibits */
3098 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3099 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3100 u64 valid = OBD_MD_FLGETATTR;
3101 struct md_op_data *op_data;
/* for regular files also fetch the striping EA, sized to the
 * default MD size for this mount */
3104 if (S_ISREG(inode->i_mode)) {
3105 rc = ll_get_default_mdsize(sbi, &ealen);
3108 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3111 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3112 0, ealen, LUSTRE_OPC_ANY,
3114 if (IS_ERR(op_data))
3115 RETURN(PTR_ERR(op_data));
3117 op_data->op_valid = valid;
3118 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3119 ll_finish_md_op_data(op_data);
3121 rc = ll_inode_revalidate_fini(inode, rc);
3125 rc = ll_prep_inode(&inode, req, NULL, NULL);
3128 ptlrpc_req_finished(req);
/* For a striped directory, merge the per-stripe MD attributes
 * (nlink, blocks, size, a/m/ctime) into the master inode. */
3132 static int ll_merge_md_attr(struct inode *inode)
3134 struct cl_attr attr = { 0 };
3137 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3138 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3139 &attr, ll_md_blocking_ast);
3143 set_nlink(inode, attr.cat_nlink);
3144 inode->i_blocks = attr.cat_blocks;
3145 i_size_write(inode, attr.cat_size);
/* timestamps are kept in lli_* and copied to the inode by the
 * revalidate path */
3147 ll_i2info(inode)->lli_atime = attr.cat_atime;
3148 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3149 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Full inode revalidation: refresh MD attributes via
 * __ll_inode_revalidate(), then for regular files glimpse the size
 * from the OSTs (unless an HSM restore is running, in which case the
 * MDT-provided size is already authoritative). */
3155 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3157 struct inode *inode = dentry->d_inode;
3161 rc = __ll_inode_revalidate(dentry, ibits);
3165 /* if object isn't regular file, don't validate size */
3166 if (!S_ISREG(inode->i_mode)) {
/* striped directory: aggregate attributes from all stripes */
3167 if (S_ISDIR(inode->i_mode) &&
3168 ll_i2info(inode)->lli_lsm_md != NULL) {
3169 rc = ll_merge_md_attr(inode);
3174 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3175 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3176 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3178 /* In case of restore, the MDT has the right size and has
3179 * already sent it back without granting the layout lock,
3180 * inode is up-to-date so glimpse is useless.
3181 * Also to glimpse we need the layout, in case of a running
3182 * restore the MDT holds the layout lock so the glimpse will
3183 * block up to the end of restore (getattr will block)
3185 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3186 rc = ll_glimpse_size(inode);
/* VFS ->getattr: revalidate UPDATE|LOOKUP ibits then copy the inode
 * attributes into \a stat.  With a 32-bit API client the inode number
 * is built from the FID to stay within 32 bits. */
3191 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3193 struct inode *inode = de->d_inode;
3194 struct ll_sb_info *sbi = ll_i2sbi(inode);
3195 struct ll_inode_info *lli = ll_i2info(inode);
3198 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3199 MDS_INODELOCK_LOOKUP);
3200 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3205 stat->dev = inode->i_sb->s_dev;
3206 if (ll_need_32bit_api(sbi))
3207 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3209 stat->ino = inode->i_ino;
3210 stat->mode = inode->i_mode;
3211 stat->uid = inode->i_uid;
3212 stat->gid = inode->i_gid;
3213 stat->rdev = inode->i_rdev;
3214 stat->atime = inode->i_atime;
3215 stat->mtime = inode->i_mtime;
3216 stat->ctime = inode->i_ctime;
3217 stat->blksize = 1 << inode->i_blkbits;
3219 stat->nlink = inode->i_nlink;
3220 stat->size = i_size_read(inode);
3221 stat->blocks = inode->i_blocks;
/* VFS ->fiemap: translate the kernel's fiemap_extent_info into the
 * Lustre fiemap structure, run ll_do_fiemap(), and copy the mapped
 * extents back to the user buffer referenced by \a fieinfo. */
3226 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3227 __u64 start, __u64 len)
3231 struct fiemap *fiemap;
3232 unsigned int extent_count = fieinfo->fi_extents_max;
/* one header plus room for all requested extents */
3234 num_bytes = sizeof(*fiemap) + (extent_count *
3235 sizeof(struct fiemap_extent));
3236 OBD_ALLOC_LARGE(fiemap, num_bytes);
3241 fiemap->fm_flags = fieinfo->fi_flags;
3242 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3243 fiemap->fm_start = start;
3244 fiemap->fm_length = len;
/* copy in the first user extent if one was supplied (may carry a
 * continuation cookie from a previous call) */
3245 if (extent_count > 0 &&
3246 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3247 sizeof(struct fiemap_extent)) != 0)
3248 GOTO(out, rc = -EFAULT);
3250 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3252 fieinfo->fi_flags = fiemap->fm_flags;
3253 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3254 if (extent_count > 0 &&
3255 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3256 fiemap->fm_mapped_extents *
3257 sizeof(struct fiemap_extent)) != 0)
3258 GOTO(out, rc = -EFAULT);
3260 OBD_FREE_LARGE(fiemap, num_bytes);
/* VFS ->get_acl: return a referenced copy of the cached POSIX ACL.
 * \a type is the ACL type requested by the VFS; only the cached
 * access ACL is held in lli_posix_acl here. */
3264 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3266 struct ll_inode_info *lli = ll_i2info(inode);
3267 struct posix_acl *acl = NULL;
/* lli_lock protects lli_posix_acl against concurrent update */
3270 spin_lock(&lli->lli_lock);
3271 /* VFS' acl_permission_check->check_acl will release the refcount */
3272 acl = posix_acl_dup(lli->lli_posix_acl);
3273 spin_unlock(&lli->lli_lock);
/* ACL-check callback passed to generic_permission() on kernels whose
 * generic_permission() takes more than two arguments.  Signature
 * varies with the kernel API (3 vs 4 argument form). */
3278 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3280 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3281 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3283 ll_check_acl(struct inode *inode, int mask)
3286 # ifdef CONFIG_FS_POSIX_ACL
3287 struct posix_acl *acl;
3291 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* cannot block in RCU-walk mode; caller will retry in ref-walk */
3292 if (flags & IPERM_FLAG_RCU)
3295 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3300 rc = posix_acl_permission(inode, acl, mask);
3301 posix_acl_release(acl);
3304 # else /* !CONFIG_FS_POSIX_ACL */
3306 # endif /* CONFIG_FS_POSIX_ACL */
3308 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* VFS ->permission: permission check with Lustre extras —
 * revalidates the root inode on first access, applies root squashing
 * by temporarily overriding the task credentials, and supports remote
 * permission checks for remote (RMT) clients.  The prototype varies
 * with the kernel's inode_permission() signature. */
3310 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3311 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3313 # ifdef HAVE_INODE_PERMISION_2ARGS
3314 int ll_inode_permission(struct inode *inode, int mask)
3316 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3321 struct ll_sb_info *sbi;
3322 struct root_squash_info *squash;
3323 struct cred *cred = NULL;
3324 const struct cred *old_cred = NULL;
3326 bool squash_id = false;
/* RCU-walk mode must not block: bail out so the VFS retries in
 * ref-walk mode */
3329 #ifdef MAY_NOT_BLOCK
3330 if (mask & MAY_NOT_BLOCK)
3332 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3333 if (flags & IPERM_FLAG_RCU)
3337 /* as root inode are NOT getting validated in lookup operation,
3338 * need to do it before permission check. */
3340 if (inode == inode->i_sb->s_root->d_inode) {
3341 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3342 MDS_INODELOCK_LOOKUP);
3347 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3348 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3350 /* squash fsuid/fsgid if needed */
3351 sbi = ll_i2sbi(inode);
3352 squash = &sbi->ll_squash;
3353 if (unlikely(squash->rsi_uid != 0 &&
3354 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3355 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3359 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3360 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3361 squash->rsi_uid, squash->rsi_gid);
3363 /* update current process's credentials
3364 * and FS capability */
3365 cred = prepare_creds();
3369 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3370 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop all filesystem-related capabilities while squashed */
3371 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3372 if ((1 << cap) & CFS_CAP_FS_MASK)
3373 cap_lower(cred->cap_effective, cap);
3375 old_cred = override_creds(cred);
3378 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3380 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3381 rc = lustre_check_remote_perm(inode, mask);
3383 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3385 /* restore current process's credentials and FS capability */
3387 revert_creds(old_cred);
3394 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock handlers, so flock/fcntl
 * locks fall back to the kernel's local-only implementation. */
3395 struct file_operations ll_file_operations = {
3396 .read = ll_file_read,
3397 .aio_read = ll_file_aio_read,
3398 .write = ll_file_write,
3399 .aio_write = ll_file_aio_write,
3400 .unlocked_ioctl = ll_file_ioctl,
3401 .open = ll_file_open,
3402 .release = ll_file_release,
3403 .mmap = ll_file_mmap,
3404 .llseek = ll_file_seek,
3405 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock" mounts: same as the default table
 * but with cluster-coherent .flock/.lock via ll_file_flock(). */
3410 struct file_operations ll_file_operations_flock = {
3411 .read = ll_file_read,
3412 .aio_read = ll_file_aio_read,
3413 .write = ll_file_write,
3414 .aio_write = ll_file_aio_write,
3415 .unlocked_ioctl = ll_file_ioctl,
3416 .open = ll_file_open,
3417 .release = ll_file_release,
3418 .mmap = ll_file_mmap,
3419 .llseek = ll_file_seek,
3420 .splice_read = ll_file_splice_read,
3423 .flock = ll_file_flock,
3424 .lock = ll_file_flock
3427 /* These are for -o noflock - to return ENOSYS on flock calls */
3428 struct file_operations ll_file_operations_noflock = {
3429 .read = ll_file_read,
3430 .aio_read = ll_file_aio_read,
3431 .write = ll_file_write,
3432 .aio_write = ll_file_aio_write,
3433 .unlocked_ioctl = ll_file_ioctl,
3434 .open = ll_file_open,
3435 .release = ll_file_release,
3436 .mmap = ll_file_mmap,
3437 .llseek = ll_file_seek,
3438 .splice_read = ll_file_splice_read,
3441 .flock = ll_file_noflock,
3442 .lock = ll_file_noflock
/* inode_operations for regular Lustre files. */
3445 struct inode_operations ll_file_inode_operations = {
3446 .setattr = ll_setattr,
3447 .getattr = ll_getattr,
3448 .permission = ll_inode_permission,
3449 .setxattr = ll_setxattr,
3450 .getxattr = ll_getxattr,
3451 .listxattr = ll_listxattr,
3452 .removexattr = ll_removexattr,
3453 .fiemap = ll_fiemap,
/* only on kernels where inode_operations has ->get_acl */
3454 #ifdef HAVE_IOP_GET_ACL
3455 .get_acl = ll_get_acl,
3459 /* dynamic ioctl number support routins */
/* Global registry of dynamically registered ioctl handlers; the list
 * is protected by the rw-semaphore (writers add/remove registrations,
 * readers dispatch ioctls). */
3460 static struct llioc_ctl_data {
3461 struct rw_semaphore ioc_sem;
3462 struct list_head ioc_head;
3464 __RWSEM_INITIALIZER(llioc.ioc_sem),
3465 LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: a callback plus the ioctl commands it handles.
 * iocd_cmd is a flexible trailing array of iocd_count entries. */
3470 struct list_head iocd_list;
3471 unsigned int iocd_size;
3472 llioc_callback_t iocd_cb;
3473 unsigned int iocd_count;
3474 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler for \a count commands in \a cmd.
 *
 * \retval an opaque handle ("magic") to pass to
 *         ll_iocontrol_unregister(), or NULL on bad arguments or
 *         allocation failure
 */
3477 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3480 struct llioc_data *in_data = NULL;
3483 if (cb == NULL || cmd == NULL ||
3484 count > LLIOC_MAX_CMD || count < 0)
3487 size = sizeof(*in_data) + count * sizeof(unsigned int);
3488 OBD_ALLOC(in_data, size);
3489 if (in_data == NULL)
/* NOTE(review): memset looks redundant if OBD_ALLOC already zeroes
 * the buffer — harmless, kept as-is */
3492 memset(in_data, 0, sizeof(*in_data));
3493 in_data->iocd_size = size;
3494 in_data->iocd_cb = cb;
3495 in_data->iocd_count = count;
3496 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3498 down_write(&llioc.ioc_sem);
3499 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3500 up_write(&llioc.ioc_sem);
/* Remove and free the registration identified by \a magic (the handle
 * returned by ll_iocontrol_register()).  Warns if no matching
 * registration is found. */
3505 void ll_iocontrol_unregister(void *magic)
3507 struct llioc_data *tmp;
3512 down_write(&llioc.ioc_sem);
3513 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* remember the allocation size before unlinking so the free below
 * matches the original OBD_ALLOC */
3515 unsigned int size = tmp->iocd_size;
3517 list_del(&tmp->iocd_list);
3518 up_write(&llioc.ioc_sem);
3520 OBD_FREE(tmp, size);
3524 up_write(&llioc.ioc_sem);
3526 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3529 EXPORT_SYMBOL(ll_iocontrol_register);
3530 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch ioctl \a cmd through the dynamic registry: each registered
 * callback that claims \a cmd is invoked until one returns LLIOC_STOP.
 *
 * \param rcp [OUT] result code from the handling callback
 * \retval LLIOC_STOP if a callback consumed the ioctl, LLIOC_CONT
 *         otherwise
 */
3532 static enum llioc_iter
3533 ll_iocontrol_call(struct inode *inode, struct file *file,
3534 unsigned int cmd, unsigned long arg, int *rcp)
3536 enum llioc_iter ret = LLIOC_CONT;
3537 struct llioc_data *data;
3538 int rc = -EINVAL, i;
3540 down_read(&llioc.ioc_sem);
3541 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3542 for (i = 0; i < data->iocd_count; i++) {
3543 if (cmd != data->iocd_cmd[i])
3546 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3550 if (ret == LLIOC_STOP)
3553 up_read(&llioc.ioc_sem);
/* Push a layout configuration into the cl_object stack for \a inode.
 * For OBJECT_CONF_SET the layout comes from the layout lock's LVB;
 * after applying it the lock is allowed to match and the cached layout
 * generation is refreshed.
 *
 * \retval 0 on success, negative errno on failure
 */
3560 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3562 struct ll_inode_info *lli = ll_i2info(inode);
3563 struct cl_object *obj = lli->lli_clob;
3564 struct cl_env_nest nest;
3572 env = cl_env_nested_get(&nest);
3574 RETURN(PTR_ERR(env));
3576 rc = cl_conf_set(env, lli->lli_clob, conf);
3580 if (conf->coc_opc == OBJECT_CONF_SET) {
3581 struct ldlm_lock *lock = conf->coc_lock;
3582 struct cl_layout cl = {
3586 LASSERT(lock != NULL);
3587 LASSERT(ldlm_has_layout(lock));
3589 /* it can only be allowed to match after layout is
3590 * applied to inode otherwise false layout would be
3591 * seen. Applying layout should happen before dropping
3592 * the intent lock. */
3593 ldlm_lock_allow_match(lock);
3595 rc = cl_object_layout_get(env, obj, &cl);
3600 DFID": layout version change: %u -> %u\n",
3601 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3603 ll_layout_version_set(lli, cl.cl_layout_gen);
3607 cl_env_nested_put(&nest, env);
3612 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3613 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3616 struct ll_sb_info *sbi = ll_i2sbi(inode);
3617 struct ptlrpc_request *req;
3618 struct mdt_body *body;
3625 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3626 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3627 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already populated: nothing to fetch */
3629 if (lock->l_lvb_data != NULL)
3632 /* if layout lock was granted right away, the layout is returned
3633 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3634 * blocked and then granted via completion ast, we have to fetch
3635 * layout here. Please note that we can't use the LVB buffer in
3636 * completion AST because it doesn't have a large enough buffer */
3637 rc = ll_get_default_mdsize(sbi, &lmmsize);
3639 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3640 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3645 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3647 GOTO(out, rc = -EPROTO);
3649 lmmsize = body->mbo_eadatasize;
3650 if (lmmsize == 0) /* empty layout */
3653 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3655 GOTO(out, rc = -EFAULT);
3657 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3658 if (lvbdata == NULL)
3659 GOTO(out, rc = -ENOMEM);
/* attach the fetched layout to the lock's LVB under the resource
 * lock; if another thread raced us and installed one first, free
 * ours below */
3661 memcpy(lvbdata, lmm, lmmsize);
3662 lock_res_and_lock(lock);
3663 if (unlikely(lock->l_lvb_data == NULL)) {
3664 lock->l_lvb_type = LVB_T_LAYOUT;
3665 lock->l_lvb_data = lvbdata;
3666 lock->l_lvb_len = lmmsize;
3669 unlock_res_and_lock(lock);
3671 if (lvbdata != NULL)
3672 OBD_FREE_LARGE(lvbdata, lmmsize);
3677 ptlrpc_req_finished(req);
3682 * Apply the layout to the inode. Layout lock is held and will be released
3685 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3686 struct inode *inode)
3688 struct ll_inode_info *lli = ll_i2info(inode);
3689 struct ll_sb_info *sbi = ll_i2sbi(inode);
3690 struct ldlm_lock *lock;
3691 struct cl_object_conf conf;
3694 bool wait_layout = false;
3697 LASSERT(lustre_handle_is_used(lockh));
3699 lock = ldlm_handle2lock(lockh);
3700 LASSERT(lock != NULL);
3701 LASSERT(ldlm_has_layout(lock));
3703 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
3704 PFID(&lli->lli_fid), inode);
3706 /* in case this is a caching lock and reinstate with new inode */
3707 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3709 lock_res_and_lock(lock);
3710 lvb_ready = ldlm_is_lvb_ready(lock);
3711 unlock_res_and_lock(lock);
3712 /* checking lvb_ready is racy but this is okay. The worst case is
3713 * that multi processes may configure the file on the same time. */
3718 rc = ll_layout_fetch(inode, lock);
3722 /* for layout lock, lmm is stored in lock's lvb.
3723 * lvb_data is immutable if the lock is held so it's safe to access it
3726 * set layout to file. Unlikely this will fail as old layout was
3727 * surely eliminated */
3728 memset(&conf, 0, sizeof conf);
3729 conf.coc_opc = OBJECT_CONF_SET;
3730 conf.coc_inode = inode;
3731 conf.coc_lock = lock;
3732 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
3733 conf.u.coc_layout.lb_len = lock->l_lvb_len;
3734 rc = ll_layout_conf(inode, &conf);
3736 /* refresh layout failed, need to wait */
3737 wait_layout = rc == -EBUSY;
/* done with the lock: drop our reference and the enqueue ref */
3741 LDLM_LOCK_PUT(lock);
3742 ldlm_lock_decref(lockh, mode);
3744 /* wait for IO to complete if it's still being used. */
3746 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3747 ll_get_fsname(inode->i_sb, NULL, 0),
3748 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until in-flight IO against the old
 * layout drains */
3750 memset(&conf, 0, sizeof conf);
3751 conf.coc_opc = OBJECT_CONF_WAIT;
3752 conf.coc_inode = inode;
3753 rc = ll_layout_conf(inode, &conf);
3757 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3758 ll_get_fsname(inode->i_sb, NULL, 0),
3759 PFID(&lli->lli_fid), rc);
/* Core of ll_layout_refresh(): first try to match a cached layout
 * lock; if none, enqueue a new IT_LAYOUT intent lock on the MDS and
 * apply the resulting layout via ll_layout_lock_set().  Caller must
 * hold lli_layout_mutex. */
3764 static int ll_layout_refresh_locked(struct inode *inode)
3766 struct ll_inode_info *lli = ll_i2info(inode);
3767 struct ll_sb_info *sbi = ll_i2sbi(inode);
3768 struct md_op_data *op_data;
3769 struct lookup_intent it;
3770 struct lustre_handle lockh;
3772 struct ldlm_enqueue_info einfo = {
3773 .ei_type = LDLM_IBITS,
3775 .ei_cb_bl = &ll_md_blocking_ast,
3776 .ei_cb_cp = &ldlm_completion_ast,
3782 /* mostly layout lock is caching on the local side, so try to match
3783 * it before grabbing layout lock mutex. */
3784 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3785 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3786 if (mode != 0) { /* hit cached lock */
3787 rc = ll_layout_lock_set(&lockh, mode, inode);
3794 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3795 0, 0, LUSTRE_OPC_ANY, NULL);
3796 if (IS_ERR(op_data))
3797 RETURN(PTR_ERR(op_data));
3799 /* have to enqueue one */
3800 memset(&it, 0, sizeof(it));
3801 it.it_op = IT_LAYOUT;
3802 lockh.cookie = 0ULL;
3804 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3805 ll_get_fsname(inode->i_sb, NULL, 0),
3806 PFID(&lli->lli_fid), inode);
3808 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* the reply request is not needed once the intent result has been
 * consumed */
3809 if (it.d.lustre.it_data != NULL)
3810 ptlrpc_req_finished(it.d.lustre.it_data);
3811 it.d.lustre.it_data = NULL;
3813 ll_finish_md_op_data(op_data);
/* take ownership of the granted lock mode out of the intent before
 * dropping the intent's own reference */
3815 mode = it.d.lustre.it_lock_mode;
3816 it.d.lustre.it_lock_mode = 0;
3817 ll_intent_drop_lock(&it);
3820 /* set lock data in case this is a new lock */
3821 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3822 rc = ll_layout_lock_set(&lockh, mode, inode);
3831 * This function checks if there exists a LAYOUT lock on the client side,
3832 * or enqueues it if it doesn't have one in cache.
3834 * This function will not hold layout lock so it may be revoked any time after
3835 * this function returns. Any operations depend on layout should be redone
3838 * This function should be called before lov_io_init() to get an uptodate
3839 * layout version, the caller should save the version number and after IO
3840 * is finished, this function should be called again to verify that layout
3841 * is not changed during IO time.
3843 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3845 struct ll_inode_info *lli = ll_i2info(inode);
3846 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* fast path: layout generation already known, or layout locks are
 * disabled for this mount */
3850 *gen = ll_layout_version_get(lli);
3851 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
3855 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3856 LASSERT(S_ISREG(inode->i_mode));
3858 /* take layout lock mutex to enqueue layout lock exclusively. */
3859 mutex_lock(&lli->lli_layout_mutex);
3861 rc = ll_layout_refresh_locked(inode);
3865 *gen = ll_layout_version_get(lli);
3867 mutex_unlock(&lli->lli_layout_mutex);
3873 * This function send a restore request to the MDT
3875 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
3877 struct hsm_user_request *hur;
3881 len = sizeof(struct hsm_user_request) +
3882 sizeof(struct hsm_user_item);
3883 OBD_ALLOC(hur, len);
3887 hur->hur_request.hr_action = HUA_RESTORE;
3888 hur->hur_request.hr_archive_id = 0;
3889 hur->hur_request.hr_flags = 0;
3890 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3891 sizeof(hur->hur_user_item[0].hui_fid));
3892 hur->hur_user_item[0].hui_extent.offset = offset;
3893 hur->hur_user_item[0].hui_extent.length = length;
3894 hur->hur_request.hr_itemcount = 1;
3895 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,