4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2015, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
54 #include <lustre_ioctl.h>
55 #include <lustre_swab.h>
57 #include "cl_object.h"
58 #include "llite_internal.h"
59 #include "vvp_internal.h"
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static enum llioc_iter
68 ll_iocontrol_call(struct inode *inode, struct file *file,
69 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate a per-open-file ll_file_data from ll_file_data_slab (GFP_NOFS to
 * avoid fs recursion during reclaim).
 * NOTE(review): extraction elided lines here (numbering jumps 75->79); the
 * NULL check after allocation and the RETURN are presumably among them. */
71 static struct ll_file_data *ll_file_data_get(void)
73 struct ll_file_data *fd;
75 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Fresh descriptor starts with no recorded write failure. */
79 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache; pairs with ll_file_data_get(). */
84 static void ll_file_data_put(struct ll_file_data *fd)
87 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
91 * Packs all the attributes into @op_data for the CLOSE rpc.
/* NOTE(review): extraction elided lines (numbering jumps); kept byte-identical,
 * comments only added. */
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
98 ll_prep_md_op_data(op_data, inode, NULL, NULL,
99 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the client's current view of the inode attributes so the MDT
 * gets them on close. */
101 op_data->op_attr.ia_mode = inode->i_mode;
102 op_data->op_attr.ia_atime = inode->i_atime;
103 op_data->op_attr.ia_mtime = inode->i_mtime;
104 op_data->op_attr.ia_ctime = inode->i_ctime;
105 op_data->op_attr.ia_size = i_size_read(inode);
106 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
107 ATTR_MTIME | ATTR_MTIME_SET |
108 ATTR_CTIME | ATTR_CTIME_SET;
109 op_data->op_attr_blocks = inode->i_blocks;
110 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* Identify which open handle this CLOSE applies to. */
111 op_data->op_handle = och->och_fh;
113 if (och->och_flags & FMODE_WRITE &&
114 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
115 /* For HSM: if inode data has been modified, pack it so that
116 * MDT can set data dirty flag in the archive. */
117 op_data->op_bias |= MDS_DATA_MODIFIED;
123 * Perform a close, possibly with a bias.
124 * The meaning of "data" depends on the value of "bias".
126 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
127 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/* NOTE(review): extraction elided lines (e.g. the switch(bias) statement and
 * several GOTO/label lines); code kept byte-identical, comments only added. */
130 static int ll_close_inode_openhandle(struct inode *inode,
131 struct obd_client_handle *och,
132 enum mds_op_bias bias, void *data)
134 struct obd_export *md_exp = ll_i2mdexp(inode);
135 const struct ll_inode_info *lli = ll_i2info(inode);
136 struct md_op_data *op_data;
137 struct ptlrpc_request *req = NULL;
/* Guard against a dead/invalid MDC export before attempting the RPC. */
141 if (class_exp2obd(md_exp) == NULL) {
142 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
143 ll_get_fsname(inode->i_sb, NULL, 0),
144 PFID(&lli->lli_fid));
148 OBD_ALLOC_PTR(op_data);
149 /* We leak openhandle and request here on error, but not much to be
150 * done in OOM case since app won't retry close on error either. */
152 GOTO(out, rc = -ENOMEM);
154 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing: the meaning of @data depends on @bias (see the
 * function comment above). */
156 case MDS_CLOSE_LAYOUT_SWAP:
157 LASSERT(data != NULL);
158 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
159 op_data->op_data_version = 0;
160 op_data->op_lease_handle = och->och_lease_handle;
161 op_data->op_fid2 = *ll_inode2fid(data);
164 case MDS_HSM_RELEASE:
165 LASSERT(data != NULL);
166 op_data->op_bias |= MDS_HSM_RELEASE;
167 op_data->op_data_version = *(__u64 *)data;
168 op_data->op_lease_handle = och->och_lease_handle;
169 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
173 LASSERT(data == NULL);
177 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR close failures are expected (signal); don't spam the console. */
178 if (rc != 0 && rc != -EINTR)
179 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
180 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For intent closes (HSM release / layout swap) verify the server actually
 * executed the intent via the reply body flag. */
183 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
184 struct mdt_body *body;
186 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
191 ll_finish_md_op_data(op_data);
/* Handle is dead from now on; poison it so stale users trip an assert. */
195 md_clear_open_replay_data(md_exp, och);
196 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
199 ptlrpc_req_finished(req); /* This is close request */
/* Close the MDS open handle of the given mode class (write/exec/read) if this
 * was the last user of it.
 * NOTE(review): extraction elided lines (the *och_p fetch/clear under the
 * mutex is not visible); code kept byte-identical, comments only added. */
203 int ll_md_real_close(struct inode *inode, fmode_t fmode)
205 struct ll_inode_info *lli = ll_i2info(inode);
206 struct obd_client_handle **och_p;
207 struct obd_client_handle *och;
/* Pick the handle slot and use counter matching the open mode. */
212 if (fmode & FMODE_WRITE) {
213 och_p = &lli->lli_mds_write_och;
214 och_usecount = &lli->lli_open_fd_write_count;
215 } else if (fmode & FMODE_EXEC) {
216 och_p = &lli->lli_mds_exec_och;
217 och_usecount = &lli->lli_open_fd_exec_count;
219 LASSERT(fmode & FMODE_READ);
220 och_p = &lli->lli_mds_read_och;
221 och_usecount = &lli->lli_open_fd_read_count;
224 mutex_lock(&lli->lli_och_mutex);
225 if (*och_usecount > 0) {
226 /* There are still users of this handle, so skip
228 mutex_unlock(&lli->lli_och_mutex);
234 mutex_unlock(&lli->lli_och_mutex);
237 /* There might be a race and this handle may already
239 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-struct-file close: drop group lock / lease / extra open handle held by
 * this file descriptor, decrement the mode use count, and close the MDS open
 * handle unless a cached OPEN lock lets us skip the RPC.
 * NOTE(review): extraction elided lines (numbering jumps); code kept
 * byte-identical, comments only added. */
245 static int ll_md_close(struct inode *inode, struct file *file)
247 union ldlm_policy_data policy = {
248 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a matching granted OPEN lock, don't take a ref. */
250 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
251 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
252 struct ll_inode_info *lli = ll_i2info(inode);
253 struct lustre_handle lockh;
254 enum ldlm_mode lockmode;
258 /* clear group lock, if present */
259 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
260 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
262 if (fd->fd_lease_och != NULL) {
265 /* Usually the lease is not released when the
266 * application crashed, we need to release here. */
267 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
268 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
269 PFID(&lli->lli_fid), rc, lease_broken);
271 fd->fd_lease_och = NULL;
/* fd_och holds ownership of an MDS open handle taken for a lease; close it. */
274 if (fd->fd_och != NULL) {
275 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
280 /* Let's see if we have good enough OPEN lock on the file and if
281 we can skip talking to MDS */
282 mutex_lock(&lli->lli_och_mutex);
283 if (fd->fd_omode & FMODE_WRITE) {
285 LASSERT(lli->lli_open_fd_write_count);
286 lli->lli_open_fd_write_count--;
287 } else if (fd->fd_omode & FMODE_EXEC) {
289 LASSERT(lli->lli_open_fd_exec_count);
290 lli->lli_open_fd_exec_count--;
293 LASSERT(lli->lli_open_fd_read_count);
294 lli->lli_open_fd_read_count--;
296 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must do a real close RPC to the MDS. */
298 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
299 LDLM_IBITS, &policy, lockmode, &lockh))
300 rc = ll_md_real_close(inode, fd->fd_omode);
303 LUSTRE_FPRIVATE(file) = NULL;
304 ll_file_data_put(fd);
309 /* While this returns an error code, fput() the caller does not, so we need
310 * to make every effort to clean up all of our state here. Also, applications
311 * rarely check close errors and even if an error is returned they will not
312 * re-try the close call.
/* VFS ->release() entry point for regular files and directories.
 * NOTE(review): extraction elided lines (e.g. the fd NULL check and RETURNs);
 * code kept byte-identical, comments only added. */
314 int ll_file_release(struct inode *inode, struct file *file)
316 struct ll_file_data *fd;
317 struct ll_sb_info *sbi = ll_i2sbi(inode);
318 struct ll_inode_info *lli = ll_i2info(inode);
322 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
323 PFID(ll_inode2fid(inode)), inode);
/* Don't account a RELEASE op for the filesystem root itself. */
325 if (inode->i_sb->s_root != file_dentry(file))
326 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
327 fd = LUSTRE_FPRIVATE(file);
330 /* The last ref on @file, maybe not the the owner pid of statahead,
331 * because parent and child process can share the same file handle. */
332 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
333 ll_deauthorize_statahead(inode, fd);
/* Root has no MDS open handle to close; just free the private data. */
335 if (inode->i_sb->s_root == file_dentry(file)) {
336 LUSTRE_FPRIVATE(file) = NULL;
337 ll_file_data_put(fd);
/* For regular files, fold any deferred async write errors into lli so they
 * can be reported from this close. */
341 if (!S_ISDIR(inode->i_mode)) {
342 if (lli->lli_clob != NULL)
343 lov_read_and_clear_async_rc(lli->lli_clob);
344 lli->lli_async_rc = 0;
347 rc = ll_md_close(inode, file);
/* Fault-injection hook: optionally dump the debug log on close. */
349 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
350 libcfs_debug_dumplog();
/* Send an IT_OPEN intent lock request to the MDS for @file, filling @itp with
 * the open disposition/lock and updating the dentry's inode from the reply.
 * NOTE(review): extraction elided lines (numbering jumps); code kept
 * byte-identical, comments only added. */
355 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
356 struct lookup_intent *itp)
358 struct dentry *de = file_dentry(file);
359 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
360 struct dentry *parent = de->d_parent;
361 const char *name = NULL;
363 struct md_op_data *op_data;
364 struct ptlrpc_request *req = NULL;
368 LASSERT(parent != NULL);
369 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
371 /* if server supports open-by-fid, or file name is invalid, don't pack
372 * name in open request */
373 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
374 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
375 name = de->d_name.name;
376 len = de->d_name.len;
379 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
380 name, len, 0, LUSTRE_OPC_ANY, NULL);
382 RETURN(PTR_ERR(op_data));
/* Pass the (optional) striping layout through to md_intent_lock(). */
383 op_data->op_data = lmm;
384 op_data->op_data_size = lmmsize;
386 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
387 &ll_md_blocking_ast, 0);
388 ll_finish_md_op_data(op_data);
390 /* reason for keep own exit path - don`t flood log
391 * with messages with -ESTALE errors.
/* If the MDS granted an open we can't use, release the handle so it isn't
 * leaked on the server. */
393 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
394 it_open_error(DISP_OPEN_OPEN, itp))
396 ll_release_openhandle(de, itp);
400 if (it_disposition(itp, DISP_LOOKUP_NEG))
401 GOTO(out, rc = -ENOENT);
403 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
404 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
405 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the intent reply and attach the granted lock. */
409 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
410 if (!rc && itp->it_lock_mode)
411 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
414 ptlrpc_req_finished(req);
415 ll_intent_drop_lock(itp);
/* Populate an obd_client_handle from the MDT_BODY in the intent reply and
 * register it for open replay (so the open survives MDS recovery). */
420 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
421 struct obd_client_handle *och)
423 struct mdt_body *body;
425 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
426 och->och_fh = body->mbo_handle;
427 och->och_fid = body->mbo_fid1;
/* Remember the lock handle: reused as the lease handle for lease opens. */
428 och->och_lease_handle.cookie = it->it_lock_handle;
429 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
430 och->och_flags = it->it_flags;
432 return md_set_open_replay_data(md_exp, och, it);
/* Finish a local open: optionally fill @och from the intent reply, then
 * install @fd as the file's private data and initialize its readahead,
 * open-mode and cl_context state.
 * NOTE(review): numbering jump 441->448 suggests the "och != NULL" guard
 * around ll_och_fill() was elided by extraction. */
435 static int ll_local_open(struct file *file, struct lookup_intent *it,
436 struct ll_file_data *fd, struct obd_client_handle *och)
438 struct inode *inode = file_inode(file);
441 LASSERT(!LUSTRE_FPRIVATE(file));
448 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
453 LUSTRE_FPRIVATE(file) = fd;
454 ll_readahead_init(inode, &fd->fd_ras);
455 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
457 /* ll_cl_context initialize */
458 rwlock_init(&fd->fd_lock);
459 INIT_LIST_HEAD(&fd->fd_lccs);
464 /* Open a file, and (for the very first open) create objects on the OSTs at
465 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
466 * creation or open until ll_lov_setstripe() ioctl is called.
468 * If we already have the stripe MD locally then we don't request it in
469 * md_open(), by passing a lmm_size = 0.
471 * It is up to the application to ensure no other processes open this file
472 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
473 * used. We might be able to avoid races of that sort by getting lli_open_sem
474 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
475 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point.
 * NOTE(review): extraction elided many lines (numbering jumps throughout);
 * code kept byte-identical, comments only added. */
477 int ll_file_open(struct inode *inode, struct file *file)
479 struct ll_inode_info *lli = ll_i2info(inode);
480 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
481 .it_flags = file->f_flags };
482 struct obd_client_handle **och_p = NULL;
483 __u64 *och_usecount = NULL;
484 struct ll_file_data *fd;
488 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
489 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by the lookup path (atomic_open). */
491 it = file->private_data; /* XXX: compat macro */
492 file->private_data = NULL; /* prevent ll_local_open assertion */
494 fd = ll_file_data_get();
496 GOTO(out_openerr, rc = -ENOMEM);
499 if (S_ISDIR(inode->i_mode))
500 ll_authorize_statahead(inode, fd);
/* Root is special: no MDS open handle, just attach the private data. */
502 if (inode->i_sb->s_root == file_dentry(file)) {
503 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent -> build our own IT_OPEN intent from f_flags. */
507 if (!it || !it->it_disposition) {
508 /* Convert f_flags into access mode. We cannot use file->f_mode,
509 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: (flags + 1) maps O_RDONLY/O_WRONLY/O_RDWR to
 * FMODE_READ/FMODE_WRITE bits. */
511 if ((oit.it_flags + 1) & O_ACCMODE)
513 if (file->f_flags & O_TRUNC)
514 oit.it_flags |= FMODE_WRITE;
516 /* kernel only call f_op->open in dentry_open. filp_open calls
517 * dentry_open after call to open_namei that checks permissions.
518 * Only nfsd_open call dentry_open directly without checking
519 * permissions and because of that this code below is safe. */
520 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
521 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
523 /* We do not want O_EXCL here, presumably we opened the file
524 * already? XXX - NFS implications? */
525 oit.it_flags &= ~O_EXCL;
527 /* bug20584, if "it_flags" contains O_CREAT, the file will be
528 * created if necessary, then "IT_CREAT" should be set to keep
529 * consistent with it */
530 if (oit.it_flags & O_CREAT)
531 oit.it_op |= IT_CREAT;
537 /* Let's see if we have file open on MDS already. */
538 if (it->it_flags & FMODE_WRITE) {
539 och_p = &lli->lli_mds_write_och;
540 och_usecount = &lli->lli_open_fd_write_count;
541 } else if (it->it_flags & FMODE_EXEC) {
542 och_p = &lli->lli_mds_exec_och;
543 och_usecount = &lli->lli_open_fd_exec_count;
545 och_p = &lli->lli_mds_read_och;
546 och_usecount = &lli->lli_open_fd_read_count;
549 mutex_lock(&lli->lli_och_mutex);
550 if (*och_p) { /* Open handle is present */
551 if (it_disposition(it, DISP_OPEN_OPEN)) {
552 /* Well, there's extra open request that we do not need,
553 let's close it somehow. This will decref request. */
554 rc = it_open_error(DISP_OPEN_OPEN, it);
556 mutex_unlock(&lli->lli_och_mutex);
557 GOTO(out_openerr, rc);
560 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached handle for this open; no new MDS open needed. */
564 rc = ll_local_open(file, it, fd, NULL);
567 mutex_unlock(&lli->lli_och_mutex);
568 GOTO(out_openerr, rc);
571 LASSERT(*och_usecount == 0);
572 if (!it->it_disposition) {
573 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
574 /* We cannot just request lock handle now, new ELC code
575 means that one of other OPEN locks for this file
576 could be cancelled, and since blocking ast handler
577 would attempt to grab och_mutex as well, that would
578 result in a deadlock */
579 mutex_unlock(&lli->lli_och_mutex);
581 * Normally called under two situations:
583 * 2. A race/condition on MDS resulting in no open
584 * handle to be returned from LOOKUP|OPEN request,
585 * for example if the target entry was a symlink.
587 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
588 * marked by a bit set in ll_iget_for_nfs. Clear the
589 * bit so that it's not confusing later callers.
591 * NB; when ldd is NULL, it must have come via normal
592 * lookup path only, since ll_iget_for_nfs always calls
595 if (ldd && ldd->lld_nfs_dentry) {
596 ldd->lld_nfs_dentry = 0;
597 it->it_flags |= MDS_OPEN_LOCK;
601 * Always specify MDS_OPEN_BY_FID because we don't want
602 * to get file with different fid.
604 it->it_flags |= MDS_OPEN_BY_FID;
605 rc = ll_intent_file_open(file, NULL, 0, it);
607 GOTO(out_openerr, rc);
611 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
613 GOTO(out_och_free, rc = -ENOMEM);
617 /* md_intent_lock() didn't get a request ref if there was an
618 * open error, so don't do cleanup on the request here
620 /* XXX (green): Should not we bail out on any error here, not
621 * just open error? */
622 rc = it_open_error(DISP_OPEN_OPEN, it);
624 GOTO(out_och_free, rc);
626 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
627 "inode %p: disposition %x, status %d\n", inode,
628 it_disposition(it, ~0), it->it_status);
630 rc = ll_local_open(file, it, fd, *och_p);
632 GOTO(out_och_free, rc);
634 mutex_unlock(&lli->lli_och_mutex);
637 /* Must do this outside lli_och_mutex lock to prevent deadlock where
638 different kind of OPEN lock for this same inode gets cancelled
639 by ldlm_cancel_lru */
640 if (!S_ISREG(inode->i_mode))
641 GOTO(out_och_free, rc);
643 cl_lov_delay_create_clear(&file->f_flags);
644 GOTO(out_och_free, rc);
/* Error/cleanup path: free an unused handle and undo statahead/fd state. */
648 if (och_p && *och_p) {
649 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
650 *och_p = NULL; /* OBD_FREE writes some magic there */
653 mutex_unlock(&lli->lli_och_mutex);
656 if (lli->lli_opendir_key == fd)
657 ll_deauthorize_statahead(inode, fd);
659 ll_file_data_put(fd);
661 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
664 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
665 ptlrpc_req_finished(it->it_request);
666 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on conflict, cancel the lease lock
 * asynchronously (no openhandle cleanup here — see ll_lease_open()). */
672 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
673 struct ldlm_lock_desc *desc, void *data, int flag)
676 struct lustre_handle lockh;
680 case LDLM_CB_BLOCKING:
681 ldlm_lock2handle(lock, &lockh);
682 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
684 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
/* LDLM_CB_CANCELING: nothing to do for a lease lock (elided by extraction). */
688 case LDLM_CB_CANCELING:
696 * When setting a lease on a file, we take ownership of the lli_mds_*_och
697 * and save it as fd->fd_och so as to force client to reopen the file even
698 * if it has an open lock in cache already.
/* NOTE(review): extraction elided lines (the transfer of *och_p into
 * fd->fd_och around original lines 728-733 is not visible); code kept
 * byte-identical, comments only added. */
700 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
701 struct lustre_handle *old_handle)
703 struct ll_inode_info *lli = ll_i2info(inode);
704 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
705 struct obd_client_handle **och_p;
710 /* Get the openhandle of the file */
711 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor at a time. */
712 if (fd->fd_lease_och != NULL)
713 GOTO(out_unlock, rc = -EBUSY);
715 if (fd->fd_och == NULL) {
716 if (file->f_mode & FMODE_WRITE) {
717 LASSERT(lli->lli_mds_write_och != NULL);
718 och_p = &lli->lli_mds_write_och;
719 och_usecount = &lli->lli_open_fd_write_count;
721 LASSERT(lli->lli_mds_read_och != NULL);
722 och_p = &lli->lli_mds_read_och;
723 och_usecount = &lli->lli_open_fd_read_count;
/* Can't steal the shared handle while other opens still use it. */
726 if (*och_usecount > 1)
727 GOTO(out_unlock, rc = -EBUSY);
/* Report the existing open handle so the MDT can match the lease owner. */
734 *old_handle = fd->fd_och->och_fh;
738 mutex_unlock(&lli->lli_och_mutex);
743 * Release ownership on lli_mds_*_och when putting back a file lease.
/* NOTE(review): extraction elided lines (the branch that re-installs
 * fd->fd_och into *och_p when no other open exists); code kept
 * byte-identical, comments only added. */
745 static int ll_lease_och_release(struct inode *inode, struct file *file)
747 struct ll_inode_info *lli = ll_i2info(inode);
748 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
749 struct obd_client_handle **och_p;
750 struct obd_client_handle *old_och = NULL;
755 mutex_lock(&lli->lli_och_mutex);
756 if (file->f_mode & FMODE_WRITE) {
757 och_p = &lli->lli_mds_write_och;
758 och_usecount = &lli->lli_open_fd_write_count;
760 och_p = &lli->lli_mds_read_och;
761 och_usecount = &lli->lli_open_fd_read_count;
764 /* The file may have been open by another process (broken lease) so
765 * *och_p is not NULL. In this case we should simply increase usecount
768 if (*och_p != NULL) {
769 old_och = fd->fd_och;
776 mutex_unlock(&lli->lli_och_mutex);
/* Close the superseded handle outside the mutex (RPC may block). */
779 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
785 * Acquire a lease and open the file.
/* Returns the lease obd_client_handle on success or an ERR_PTR.
 * NOTE(review): extraction elided lines (och allocation, several rc checks,
 * RETURN statements); code kept byte-identical, comments only added. */
787 static struct obd_client_handle *
788 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
791 struct lookup_intent it = { .it_op = IT_OPEN };
792 struct ll_sb_info *sbi = ll_i2sbi(inode);
793 struct md_op_data *op_data;
794 struct ptlrpc_request *req = NULL;
795 struct lustre_handle old_handle = { 0 };
796 struct obd_client_handle *och = NULL;
/* Leases are exactly read or write, never both or exec. */
801 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
802 RETURN(ERR_PTR(-EINVAL));
805 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
806 RETURN(ERR_PTR(-EPERM));
808 rc = ll_lease_och_acquire(inode, file, &old_handle);
815 RETURN(ERR_PTR(-ENOMEM));
817 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
818 LUSTRE_OPC_ANY, NULL);
820 GOTO(out, rc = PTR_ERR(op_data));
822 /* To tell the MDT this openhandle is from the same owner */
823 op_data->op_handle = old_handle;
825 it.it_flags = fmode | open_flags;
826 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
827 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
828 &ll_md_blocking_lease_ast,
829 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
830 * it can be cancelled which may mislead applications that the lease is
832 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
833 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
834 * doesn't deal with openhandle, so normal openhandle will be leaked. */
835 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
836 ll_finish_md_op_data(op_data);
837 ptlrpc_req_finished(req);
839 GOTO(out_release_it, rc);
841 if (it_disposition(&it, DISP_LOOKUP_NEG))
842 GOTO(out_release_it, rc = -ENOENT);
844 rc = it_open_error(DISP_OPEN_OPEN, &it);
846 GOTO(out_release_it, rc);
848 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
849 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Server must confirm the lease disposition; older servers can't. */
851 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
852 GOTO(out_close, rc = -EOPNOTSUPP);
854 /* already get lease, handle lease lock */
855 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
856 if (it.it_lock_mode == 0 ||
857 it.it_lock_bits != MDS_INODELOCK_OPEN) {
858 /* open lock must return for lease */
859 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
860 PFID(ll_inode2fid(inode)), it.it_lock_mode,
862 GOTO(out_close, rc = -EPROTO);
865 ll_intent_release(&it);
/* Error path: undo lock and open handle acquired so far. */
869 /* Cancel open lock */
870 if (it.it_lock_mode != 0) {
871 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
874 och->och_lease_handle.cookie = 0ULL;
876 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
878 CERROR("%s: error closing file "DFID": %d\n",
879 ll_get_fsname(inode->i_sb, NULL, 0),
880 PFID(&ll_i2info(inode)->lli_fid), rc2);
881 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
883 ll_intent_release(&it);
891 * Check whether a layout swap can be done between two inodes.
893 * \param[in] inode1 First inode to check
894 * \param[in] inode2 Second inode to check
896 * \retval 0 on success, layout swap can be performed between both inodes
897 * \retval negative error code if requirements are not met
/* NOTE(review): the returned error codes for each failed check were elided
 * by extraction; code kept byte-identical, comments only added. */
899 static int ll_check_swap_layouts_validity(struct inode *inode1,
900 struct inode *inode2)
/* Both must be regular files. */
902 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* Caller needs write permission on both inodes. */
905 if (inode_permission(inode1, MAY_WRITE) ||
906 inode_permission(inode2, MAY_WRITE))
/* Both must live on the same Lustre filesystem. */
909 if (inode1->i_sb != inode2->i_sb)
/* Close @och with a MDS_CLOSE_LAYOUT_SWAP bias, swapping layouts between
 * @inode and @inode2 atomically with the close.
 * NOTE(review): extraction elided lines (och free label/RETURN); code kept
 * byte-identical, comments only added. */
915 static int ll_swap_layouts_close(struct obd_client_handle *och,
916 struct inode *inode, struct inode *inode2)
918 const struct lu_fid *fid1 = ll_inode2fid(inode);
919 const struct lu_fid *fid2;
923 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
924 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
926 rc = ll_check_swap_layouts_validity(inode, inode2);
928 GOTO(out_free_och, rc);
930 /* We now know that inode2 is a lustre inode */
931 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself is meaningless. */
933 rc = lu_fid_cmp(fid1, fid2);
935 GOTO(out_free_och, rc = -EINVAL);
937 /* Close the file and swap layouts between inode & inode2.
938 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
939 * because we still need it to pack l_remote_handle to MDT. */
940 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
943 och = NULL; /* freed in ll_close_inode_openhandle() */
953 * Release lease and close the file.
954 * It will check if the lease has ever broken.
956 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
959 struct ldlm_lock *lock;
/* Assume broken if the lock is gone entirely. */
960 bool cancelled = true;
964 lock = ldlm_handle2lock(&och->och_lease_handle);
966 lock_res_and_lock(lock);
967 cancelled = ldlm_is_cancel(lock);
968 unlock_res_and_lock(lock);
972 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
973 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* If the lease is still intact, cancel its lock ourselves before closing. */
976 ldlm_cli_cancel(&och->och_lease_handle, 0);
978 if (lease_broken != NULL)
979 *lease_broken = cancelled;
981 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-sourced attributes cached in lli with OST-sourced attributes
 * (size, blocks, timestamps) obtained from the cl_object, under the inode
 * size lock.
 * NOTE(review): extraction elided lines (atime/mtime/ctime declarations,
 * RETURN); code kept byte-identical, comments only added. */
985 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
987 struct ll_inode_info *lli = ll_i2info(inode);
988 struct cl_object *obj = lli->lli_clob;
989 struct cl_attr *attr = vvp_env_thread_attr(env);
997 ll_inode_size_lock(inode);
999 /* Merge timestamps the most recently obtained from MDS with
1000 * timestamps obtained from OSTs.
1002 * Do not overwrite atime of inode because it may be refreshed
1003 * by file_accessed() function. If the read was served by cache
1004 * data, there is no RPC to be sent so that atime may not be
1005 * transferred to OSTs at all. MDT only updates atime at close time
1006 * if it's at least 'mdd.*.atime_diff' older.
1007 * All in all, the atime in Lustre does not strictly comply with
1008 * POSIX. Solving this problem needs to send an RPC to MDT for each
1009 * read, this will hurt performance. */
1010 if (LTIME_S(inode->i_atime) < lli->lli_atime)
1011 LTIME_S(inode->i_atime) = lli->lli_atime;
1012 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1013 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1015 atime = LTIME_S(inode->i_atime);
1016 mtime = LTIME_S(inode->i_mtime);
1017 ctime = LTIME_S(inode->i_ctime);
/* Fetch OST-side attributes from the cl_object layer. */
1019 cl_object_attr_lock(obj);
1020 rc = cl_object_attr_get(env, obj, attr);
1021 cl_object_attr_unlock(obj);
1024 GOTO(out_size_unlock, rc);
/* Keep the most recent of MDS and OST timestamps. */
1026 if (atime < attr->cat_atime)
1027 atime = attr->cat_atime;
1029 if (ctime < attr->cat_ctime)
1030 ctime = attr->cat_ctime;
1032 if (mtime < attr->cat_mtime)
1033 mtime = attr->cat_mtime;
1035 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1036 PFID(&lli->lli_fid), attr->cat_size);
1038 i_size_write(inode, attr->cat_size);
1039 inode->i_blocks = attr->cat_blocks;
1041 LTIME_S(inode->i_atime) = atime;
1042 LTIME_S(inode->i_mtime) = mtime;
1043 LTIME_S(inode->i_ctime) = ctime;
1046 ll_inode_size_unlock(inode);
/* Decide whether atime updates should be suppressed for @file, mirroring the
 * kernel's file_accessed()/touch_atime() checks (O_NOATIME, S_NOATIME,
 * mount flags, nodiratime on directories).
 * NOTE(review): the "return true/false" lines were elided by extraction. */
1051 static bool file_is_noatime(const struct file *file)
1053 const struct vfsmount *mnt = file->f_path.mnt;
1054 const struct inode *inode = file_inode((struct file *)file);
1056 /* Adapted from file_accessed() and touch_atime().*/
1057 if (file->f_flags & O_NOATIME)
1060 if (inode->i_flags & S_NOATIME)
1063 if (IS_NOATIME(inode))
1066 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1069 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1072 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: nonblocking/append/sync flags, lock policy and noatime handling. */
1078 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1080 struct inode *inode = file_inode((struct file *)file);
1082 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
/* Write-specific flags (the enclosing `if (write)` appears elided). */
1084 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1085 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1086 file->f_flags & O_DIRECT ||
1089 io->ci_obj = ll_i2info(inode)->lli_clob;
/* Default lock policy; downgraded/upgraded below per file flags. */
1090 io->ci_lockreq = CILR_MAYBE;
1091 if (ll_file_nolock(file)) {
1092 io->ci_lockreq = CILR_NEVER;
1093 io->ci_no_srvlock = 1;
1094 } else if (file->f_flags & O_APPEND) {
1095 io->ci_lockreq = CILR_MANDATORY;
1098 io->ci_noatime = file_is_noatime(file);
/* Common read/write engine: sets up a cl_io, takes the per-file range lock
 * where needed (writes, and O_DIRECT reads — see LU-6227), runs the IO loop,
 * restarts short IOs, and updates stats and fd_write_failed.
 * NOTE(review): extraction elided lines (return type, IO_NORMAL/IO_SPLICE
 * case labels, restart goto, several braces); code kept byte-identical,
 * comments only added. */
1102 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1103 struct file *file, enum cl_io_type iot,
1104 loff_t *ppos, size_t count)
1106 struct vvp_io *vio = vvp_env_io(env);
1107 struct inode *inode = file_inode(file);
1108 struct ll_inode_info *lli = ll_i2info(inode);
1109 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1113 struct range_lock range;
1117 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1118 file_dentry(file)->d_name.name, iot, *ppos, count);
1121 io = vvp_env_thread_io(env);
1122 ll_io_init(io, file, iot == CIT_WRITE);
1124 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1125 bool range_locked = false;
/* O_APPEND writes must lock to EOF since the final offset is unknown. */
1127 if (file->f_flags & O_APPEND)
1128 range_lock_init(&range, 0, LUSTRE_EOF);
1130 range_lock_init(&range, *ppos, *ppos + count - 1);
1132 vio->vui_fd = LUSTRE_FPRIVATE(file);
1133 vio->vui_io_subtype = args->via_io_subtype;
1135 switch (vio->vui_io_subtype) {
1137 vio->vui_iter = args->u.normal.via_iter;
1138 vio->vui_iocb = args->u.normal.via_iocb;
1139 /* Direct IO reads must also take range lock,
1140 * or multiple reads will try to work on the same pages
1141 * See LU-6227 for details. */
1142 if (((iot == CIT_WRITE) ||
1143 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1144 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1145 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1147 rc = range_lock(&lli->lli_write_tree, &range);
1151 range_locked = true;
/* splice subtype: pipe target and flags come from the args. */
1155 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1156 vio->u.splice.vui_flags = args->u.splice.via_flags;
1159 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
/* Register the cl context so page-level code can find this IO. */
1163 ll_cl_add(file, env, io, LCC_RW);
1164 rc = cl_io_loop(env, io);
1165 ll_cl_remove(file, env);
1168 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1170 range_unlock(&lli->lli_write_tree, &range);
1173 /* cl_io_rw_init() handled IO */
1177 if (io->ci_nob > 0) {
/* Accumulate partial progress and advance the position for a restart. */
1178 result += io->ci_nob;
1179 count -= io->ci_nob;
1180 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1182 /* prepare IO restart */
1183 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1184 args->u.normal.via_iter = vio->vui_iter;
1188 cl_io_fini(env, io);
1190 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1192 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1193 file_dentry(file)->d_name.name,
1194 iot == CIT_READ ? "read" : "write",
1195 *ppos, count, result);
1199 if (iot == CIT_READ) {
1201 ll_stats_ops_tally(ll_i2sbi(inode),
1202 LPROC_LL_READ_BYTES, result);
1203 } else if (iot == CIT_WRITE) {
1205 ll_stats_ops_tally(ll_i2sbi(inode),
1206 LPROC_LL_WRITE_BYTES, result);
1207 fd->fd_write_failed = false;
1208 } else if (result == 0 && rc == 0) {
/* Track write failure state so fsync can report deferred errors;
 * -ERESTARTSYS (signal) is not treated as a failure. */
1211 fd->fd_write_failed = true;
1213 fd->fd_write_failed = false;
1214 } else if (rc != -ERESTARTSYS) {
1215 fd->fd_write_failed = true;
1219 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1221 return result > 0 ? result : rc;
1225 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1226 * especially for small I/O.
1228 * To serve a read request, CLIO has to create and initialize a cl_io and
1229 * then request DLM lock. This has turned out to have siginificant overhead
1230 * and affects the performance of small I/O dramatically.
1232 * It's not necessary to create a cl_io for each I/O. Under the help of read
1233 * ahead, most of the pages being read are already in memory cache and we can
1234 * read those pages directly because if the pages exist, the corresponding DLM
1235 * lock must exist so that page content must be valid.
1237 * In fast read implementation, the llite speculatively finds and reads pages
1238 * in memory cache. There are three scenarios for fast read:
1239 * - If the page exists and is uptodate, kernel VM will provide the data and
1240 * CLIO won't be intervened;
1241 * - If the page was brought into memory by read ahead, it will be exported
1242 * and read ahead parameters will be updated;
1243 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1244 * it will go back and invoke normal read, i.e., a cl_io will be created
1245 * and DLM lock will be requested.
1247 * POSIX compliance: posix standard states that read is intended to be atomic.
1248 * Lustre read implementation is in line with Linux kernel read implementation
1249 * and neither of them complies with POSIX standard in this matter. Fast read
1250 * doesn't make the situation worse on single node but it may interleave write
1251 * results from multiple nodes due to short read handling in ll_file_aio_read().
1253 * \param env - lu_env
1254 * \param iocb - kiocb from kernel
1255 * \param iter - user space buffers where the data will be copied
1257 * \retval - number of bytes that have been read, or an error code if an error occurred.
/*
 * Fast-read path: bypass CLIO and read directly from the page cache via
 * generic_file_read_iter().  Only attempted when the superblock has
 * fast_read enabled and the file is not opened O_DIRECT; otherwise the
 * caller falls back to the normal CLIO read path.
 */
1260 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1261 struct iov_iter *iter)
1265 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1268 /* NB: we can't do direct IO for fast read because it will need a lock
1269 * to make IO engine happy. */
1270 if (iocb->ki_filp->f_flags & O_DIRECT)
/* Bracket the generic read with ll_cl_add()/ll_cl_remove() so that
 * ll_readpage() can find this thread's env/io context. */
1273 ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
1274 result = generic_file_read_iter(iocb, iter);
1275 ll_cl_remove(iocb->ki_filp, env);
1277 /* If the first page is not in cache, generic_file_read_iter() returns
1278 * -ENODATA.
1279 * See corresponding code in ll_readpage(). */
1280 if (result == -ENODATA)
/* account successfully fast-read bytes in the per-sb read stats */
1284 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1285 LPROC_LL_READ_BYTES, result);
1291 * Read from a file (through the page cache).
/*
 * read_iter entry point: read from a file through the page cache.
 * Tries the speculative fast-read path first; if data remains in the
 * iterator, falls back to the CLIO path via ll_file_io_generic(CIT_READ).
 */
1293 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1296 struct vvp_io_args *args;
1301 env = cl_env_get(&refcheck);
1303 return PTR_ERR(env);
/* fast read first; on error or when the iter is fully consumed, done */
1305 result = ll_do_fast_read(env, iocb, to);
1306 if (result < 0 || iov_iter_count(to) == 0)
1309 args = ll_env_args(env, IO_NORMAL);
1310 args->u.normal.via_iter = to;
1311 args->u.normal.via_iocb = iocb;
/* read whatever the fast path did not deliver via the generic IO engine */
1313 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1314 &iocb->ki_pos, iov_iter_count(to));
1317 else if (result == 0)
1321 cl_env_put(env, &refcheck);
1326 * Write to a file (through the page cache).
/*
 * write_iter entry point: write to a file through the page cache by
 * packaging the iocb/iter into vvp_io_args and running
 * ll_file_io_generic(CIT_WRITE).
 */
1328 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1330 struct vvp_io_args *args;
1335 env = cl_env_get(&refcheck);
1337 return PTR_ERR(env);
1339 args = ll_env_args(env, IO_NORMAL);
1340 args->u.normal.via_iter = from;
1341 args->u.normal.via_iocb = iocb;
1343 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1344 &iocb->ki_pos, iov_iter_count(from));
1345 cl_env_put(env, &refcheck);
1349 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1351 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count in *count.
 * Mirrors the kernel's __generic_file_aio_write_nolock() checks: reject
 * negative/overflowing lengths, and stop at the first inaccessible
 * segment by shrinking *nr_segs.
 */
1353 static int ll_file_get_iov_count(const struct iovec *iov,
1354 unsigned long *nr_segs, size_t *count)
1359 for (seg = 0; seg < *nr_segs; seg++) {
1360 const struct iovec *iv = &iov[seg];
1363 * If any segment has a negative length, or the cumulative
1364 * length ever wraps negative then return -EINVAL.
1367 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1369 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1374 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio_read entry for kernels without read_iter file operations: wrap the
 * iovec array in an iov_iter and forward to ll_file_read_iter().
 */
1381 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1382 unsigned long nr_segs, loff_t pos)
1389 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions */
1393 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1394 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1395 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1396 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1397 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1399 result = ll_file_read_iter(iocb, &to);
/*
 * Synchronous read(2) entry for kernels without read_iter: build a
 * one-segment iovec and a sync kiocb, then delegate to ll_file_aio_read().
 * The file position is propagated back through *ppos.
 */
1404 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1407 struct iovec iov = { .iov_base = buf, .iov_len = count };
1408 struct kiocb *kiocb;
1412 OBD_ALLOC_PTR(kiocb);
1416 init_sync_kiocb(kiocb, file);
1417 kiocb->ki_pos = *ppos;
/* the field holding the remaining byte count is version-dependent */
1418 #ifdef HAVE_KIOCB_KI_LEFT
1419 kiocb->ki_left = count;
1420 #elif defined(HAVE_KI_NBYTES)
1421 kiocb->ki_nbytes = count;
1424 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1425 *ppos = kiocb->ki_pos;
1427 OBD_FREE_PTR(kiocb);
1432 * Write to a file (through the page cache).
/*
 * aio_write entry for kernels without write_iter file operations: wrap
 * the iovec array in an iov_iter and forward to ll_file_write_iter().
 */
1435 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1436 unsigned long nr_segs, loff_t pos)
1438 struct iov_iter from;
1443 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions */
1447 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1448 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1449 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1450 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1451 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1453 result = ll_file_write_iter(iocb, &from);
/*
 * Synchronous write(2) entry for kernels without write_iter: build a
 * one-segment iovec and a sync kiocb (taken from the lu_env thread info
 * rather than allocated), then delegate to ll_file_aio_write().
 */
1458 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1459 size_t count, loff_t *ppos)
1462 struct iovec iov = { .iov_base = (void __user *)buf,
1464 struct kiocb *kiocb;
1469 env = cl_env_get(&refcheck);
1471 RETURN(PTR_ERR(env));
/* reuse the per-env kiocb instead of allocating one */
1473 kiocb = &ll_env_info(env)->lti_kiocb;
1474 init_sync_kiocb(kiocb, file);
1475 kiocb->ki_pos = *ppos;
/* the field holding the remaining byte count is version-dependent */
1476 #ifdef HAVE_KIOCB_KI_LEFT
1477 kiocb->ki_left = count;
1478 #elif defined(HAVE_KI_NBYTES)
1479 kiocb->ki_nbytes = count;
1482 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1483 *ppos = kiocb->ki_pos;
1485 cl_env_put(env, &refcheck);
1488 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1491 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry: send file content (through the page cache) into a
 * pipe, using the IO_SPLICE variant of ll_file_io_generic(CIT_READ).
 */
1493 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1494 struct pipe_inode_info *pipe, size_t count,
1498 struct vvp_io_args *args;
1503 env = cl_env_get(&refcheck);
1505 RETURN(PTR_ERR(env));
1507 args = ll_env_args(env, IO_SPLICE);
1508 args->u.splice.via_pipe = pipe;
1509 args->u.splice.via_flags = flags;
1511 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1512 cl_env_put(env, &refcheck);
/*
 * Set the LOV striping EA on a file by re-opening it by FID with the
 * given lov_user_md, under the inode size lock.  The open handle created
 * by the intent open is released before returning.
 */
1516 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1517 __u64 flags, struct lov_user_md *lum,
1520 struct lookup_intent oit = {
1522 .it_flags = flags | MDS_OPEN_BY_FID,
1527 ll_inode_size_lock(inode);
1528 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1530 GOTO(out_unlock, rc);
/* we only needed the open to carry the EA; drop the MDS open handle */
1532 ll_release_openhandle(file_dentry(file), &oit);
1535 ll_inode_size_unlock(inode);
1536 ll_intent_release(&oit);
/* clear the delayed-create flag now that the layout has been set */
1537 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV striping EA for @filename from the MDS via
 * md_getattr_name().  On success *lmmp points into the request reply
 * (the caller keeps *request alive and frees it) and *lmm_size is set.
 * The EA is converted from little-endian wire format to host endianness
 * before being handed back.
 */
1542 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1543 struct lov_mds_md **lmmp, int *lmm_size,
1544 struct ptlrpc_request **request)
1546 struct ll_sb_info *sbi = ll_i2sbi(inode);
1547 struct mdt_body *body;
1548 struct lov_mds_md *lmm = NULL;
1549 struct ptlrpc_request *req = NULL;
1550 struct md_op_data *op_data;
1553 rc = ll_get_default_mdsize(sbi, &lmmsize);
1557 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1558 strlen(filename), lmmsize,
1559 LUSTRE_OPC_ANY, NULL);
1560 if (IS_ERR(op_data))
1561 RETURN(PTR_ERR(op_data));
1563 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1564 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1565 ll_finish_md_op_data(op_data);
1567 CDEBUG(D_INFO, "md_getattr_name failed "
1568 "on %s: rc %d\n", filename, rc);
1572 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1573 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1575 lmmsize = body->mbo_eadatasize;
/* no striping EA present on this object */
1577 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1579 GOTO(out, rc = -ENODATA);
1582 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1583 LASSERT(lmm != NULL);
/* only plain V1/V3 layouts are understood here */
1585 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1586 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1587 GOTO(out, rc = -EPROTO);
1591 * This is coming from the MDS, so is probably in
1592 * little endian. We convert it to host endian before
1593 * passing it to userspace.
1595 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1598 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1599 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1602 /* if function called for directory - we should
1603 * avoid swabbing non-existent lsm objects */
1604 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1605 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1606 if (S_ISREG(body->mbo_mode))
1607 lustre_swab_lov_user_md_objects(
1608 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1610 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1611 lustre_swab_lov_user_md_v3(
1612 (struct lov_user_md_v3 *)lmm);
1613 if (S_ISREG(body->mbo_mode))
1614 lustre_swab_lov_user_md_objects(
1615 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1622 *lmm_size = lmmsize;
1627 static int ll_lov_setea(struct inode *inode, struct file *file,
1630 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1631 struct lov_user_md *lump;
1632 int lum_size = sizeof(struct lov_user_md) +
1633 sizeof(struct lov_user_ost_data);
1637 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1640 OBD_ALLOC_LARGE(lump, lum_size);
1644 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1645 GOTO(out_lump, rc = -EFAULT);
1647 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1650 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace lov_user_md
 * buffer via cl_object_getstripe().
 */
1654 static int ll_file_getstripe(struct inode *inode,
1655 struct lov_user_md __user *lum)
1662 env = cl_env_get(&refcheck);
1664 RETURN(PTR_ERR(env));
1666 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1667 cl_env_put(env, &refcheck);
1671 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1674 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1675 struct lov_user_md *klum;
1677 __u64 flags = FMODE_WRITE;
1680 rc = ll_copy_user_md(lum, &klum);
1685 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1689 put_user(0, &lum->lmm_stripe_count);
1691 ll_layout_refresh(inode, &gen);
1692 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1695 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with group
 * id @arg on the file.  Only one group lock per file descriptor is
 * allowed; fd_flags/fd_grouplock are updated under lli_lock, and a race
 * between two threads acquiring concurrently is resolved after the
 * cl_get_grouplock() call.
 */
1700 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1702 struct ll_inode_info *lli = ll_i2info(inode);
1703 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1704 struct ll_grouplock grouplock;
1709 CWARN("group id for group lock must not be 0\n");
1713 if (ll_file_nolock(file))
1714 RETURN(-EOPNOTSUPP);
1716 spin_lock(&lli->lli_lock);
1717 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1718 CWARN("group lock already existed with gid %lu\n",
1719 fd->fd_grouplock.lg_gid);
1720 spin_unlock(&lli->lli_lock);
1723 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1724 spin_unlock(&lli->lli_lock);
/* may block unless O_NONBLOCK was given at open time */
1726 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1727 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have won the race */
1731 spin_lock(&lli->lli_lock);
1732 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1733 spin_unlock(&lli->lli_lock);
1734 CERROR("another thread just won the race\n");
1735 cl_put_grouplock(&grouplock);
1739 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1740 fd->fd_grouplock = grouplock;
1741 spin_unlock(&lli->lli_lock);
1743 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held on this file
 * descriptor.  Fails if no group lock is held or if @arg does not match
 * the held group id.  State is detached under lli_lock before the lock
 * itself is dropped outside the spinlock.
 */
1747 static int ll_put_grouplock(struct inode *inode, struct file *file,
1750 struct ll_inode_info *lli = ll_i2info(inode);
1751 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1752 struct ll_grouplock grouplock;
1755 spin_lock(&lli->lli_lock);
1756 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1757 spin_unlock(&lli->lli_lock);
1758 CWARN("no group lock held\n");
1762 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1764 if (fd->fd_grouplock.lg_gid != arg) {
1765 CWARN("group lock %lu doesn't match current id %lu\n",
1766 arg, fd->fd_grouplock.lg_gid);
1767 spin_unlock(&lli->lli_lock);
/* take a local copy and clear fd state before dropping the spinlock */
1771 grouplock = fd->fd_grouplock;
1772 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1773 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1774 spin_unlock(&lli->lli_lock);
1776 cl_put_grouplock(&grouplock);
1777 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1782 * Close inode open handle
1784 * \param dentry [in] dentry which contains the inode
1785 * \param it [in,out] intent which contains open info and result
1788 * \retval <0 failure
/*
 * Close the MDS open handle carried by an open intent.  No-op for the
 * filesystem root or when the intent holds no open disposition; otherwise
 * an obd_client_handle is filled from the intent and closed.
 */
1790 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1792 struct inode *inode = dentry->d_inode;
1793 struct obd_client_handle *och;
1799 /* Root ? Do nothing. */
1800 if (dentry->d_inode->i_sb->s_root == dentry)
1803 /* No open handle to close? Move away */
1804 if (!it_disposition(it, DISP_OPEN_OPEN))
1807 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1809 OBD_ALLOC(och, sizeof(*och));
1811 GOTO(out, rc = -ENOMEM);
1813 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1815 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1817 /* this one is in place of ll_file_open */
1818 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1819 ptlrpc_req_finished(it->it_request);
1820 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1826 * Get size for inode for which FIEMAP mapping is requested.
1827 * Make the FIEMAP get_info call and returns the result.
1828 * \param fiemap kernel buffer to hold extents
1829 * \param num_bytes kernel buffer size
/*
 * Perform a FIEMAP extent-mapping request for @inode.  Rejects
 * unsupported flags (reporting the supported set back in fm_flags),
 * honours FIEMAP_FLAG_SYNC by flushing dirty pages first, then forwards
 * the request to the object layer via cl_object_fiemap().
 */
1831 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1837 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1840 /* Checks for fiemap flags */
1841 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1842 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1846 /* Check for FIEMAP_FLAG_SYNC */
1847 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1848 rc = filemap_fdatawrite(inode->i_mapping);
1853 env = cl_env_get(&refcheck);
1855 RETURN(PTR_ERR(env));
/* make sure we have an up-to-date size before deciding on objects */
1857 if (i_size_read(inode) == 0) {
1858 rc = ll_glimpse_size(inode);
1863 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1864 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1865 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1867 /* If filesize is 0, then there would be no objects for mapping */
1868 if (fmkey.lfik_oa.o_size == 0) {
1869 fiemap->fm_mapped_extents = 0;
1873 fmkey.lfik_fiemap = *fiemap;
1875 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1876 &fmkey, fiemap, &num_bytes);
1878 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.  The
 * user supplies the path buffer length; a getinfo_fid2path plus that many
 * bytes is allocated, filled by obd_iocontrol(), and copied back.
 * Requires CFS_CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 */
1882 int ll_fid2path(struct inode *inode, void __user *arg)
1884 struct obd_export *exp = ll_i2mdexp(inode);
1885 const struct getinfo_fid2path __user *gfin = arg;
1887 struct getinfo_fid2path *gfout;
1893 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1894 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1897 /* Only need to get the buflen */
1898 if (get_user(pathlen, &gfin->gf_pathlen))
1901 if (pathlen > PATH_MAX)
1904 outsize = sizeof(*gfout) + pathlen;
1905 OBD_ALLOC(gfout, outsize);
1909 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1910 GOTO(gf_free, rc = -EFAULT);
1911 /* append root FID after gfout to let MDT know the root FID so that it
1912 * can lookup the correct path, this is mainly for fileset.
1913 * old server without fileset mount support will ignore this. */
1914 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1916 /* Call mdc_iocontrol */
1917 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1921 if (copy_to_user(arg, gfout, outsize))
1925 OBD_FREE(gfout, outsize);
1930 * Read the data_version for inode.
1932 * This value is computed using stripe object version on OST.
1933 * Version is computed using server side locking.
1935 * @param flags if do sync on the OST side;
1937 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1938 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Compute the data_version of @inode by running a CIT_DATA_VERSION IO
 * against its cl_object.  @flags selects the OST-side flush behaviour
 * (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH); see the comment block above.
 */
1940 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1942 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1950 /* If no file object initialized, we consider its version is 0. */
1956 env = cl_env_get(&refcheck);
1958 RETURN(PTR_ERR(env));
1960 io = vvp_env_thread_io(env);
1962 io->u.ci_data_version.dv_data_version = 0;
1963 io->u.ci_data_version.dv_flags = flags;
1966 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1967 result = cl_io_loop(env, io);
1969 result = io->ci_result;
1971 *data_version = io->u.ci_data_version.dv_data_version;
1973 cl_io_fini(env, io);
/* the IO engine may ask for a restart, e.g. after a layout change */
1975 if (unlikely(io->ci_need_restart))
1978 cl_env_put(env, &refcheck);
1984 * Trigger a HSM release request for the provided inode.
/*
 * Trigger an HSM release for @inode: take a write lease, flush and grab
 * the latest data_version and [am]time, then close the lease handle with
 * MDS_HSM_RELEASE so the MDT can drop the file's OST objects.
 */
1986 int ll_hsm_release(struct inode *inode)
1989 struct obd_client_handle *och = NULL;
1990 __u64 data_version = 0;
1995 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1996 ll_get_fsname(inode->i_sb, NULL, 0),
1997 PFID(&ll_i2info(inode)->lli_fid));
1999 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2001 GOTO(out, rc = PTR_ERR(och));
2003 /* Grab latest data_version and [am]time values */
2004 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2008 env = cl_env_get(&refcheck);
2010 GOTO(out, rc = PTR_ERR(env));
2012 ll_merge_attr(env, inode);
2013 cl_env_put(env, &refcheck);
2015 /* Release the file.
2016 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2017 * we still need it to pack l_remote_handle to MDT. */
2018 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2024 if (och != NULL && !IS_ERR(och)) /* close the file */
2025 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes plus (elsewhere in
 * the struct) the data versions and check flags, kept together so they
 * can be swapped as a unit when the FID order requires sequentializing. */
2030 struct ll_swap_stack {
2033 struct inode *inode1;
2034 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS helper: atomically swap the layouts of the two
 * files on the MDT.  Inodes are ordered by FID to avoid deadlock, an
 * optional group lock flushes dirty cache, and optional data-version
 * checks abort with -EAGAIN if either file changed since the caller
 * sampled its version.
 */
2039 static int ll_swap_layouts(struct file *file1, struct file *file2,
2040 struct lustre_swap_layouts *lsl)
2042 struct mdc_swap_layouts msl;
2043 struct md_op_data *op_data;
2046 struct ll_swap_stack *llss = NULL;
2049 OBD_ALLOC_PTR(llss);
2053 llss->inode1 = file_inode(file1);
2054 llss->inode2 = file_inode(file2);
2056 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2060 /* we use 2 bool because it is easier to swap than 2 bits */
2061 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2062 llss->check_dv1 = true;
2064 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2065 llss->check_dv2 = true;
2067 /* we cannot use lsl->sl_dvX directly because we may swap them */
2068 llss->dv1 = lsl->sl_dv1;
2069 llss->dv2 = lsl->sl_dv2;
2071 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2072 if (rc == 0) /* same file, done! */
2075 if (rc < 0) { /* sequentialize it */
2076 swap(llss->inode1, llss->inode2);
2078 swap(llss->dv1, llss->dv2);
2079 swap(llss->check_dv1, llss->check_dv2);
2083 if (gid != 0) { /* application asks to flush dirty cache */
2084 rc = ll_get_grouplock(llss->inode1, file1, gid);
2088 rc = ll_get_grouplock(llss->inode2, file2, gid);
2090 ll_put_grouplock(llss->inode1, file1, gid);
2095 /* ultimate check, before swapping the layouts we check if
2096 * dataversion has changed (if requested) */
2097 if (llss->check_dv1) {
2098 rc = ll_data_version(llss->inode1, &dv, 0);
2101 if (dv != llss->dv1)
2102 GOTO(putgl, rc = -EAGAIN);
2105 if (llss->check_dv2) {
2106 rc = ll_data_version(llss->inode2, &dv, 0);
2109 if (dv != llss->dv2)
2110 GOTO(putgl, rc = -EAGAIN);
2113 /* struct md_op_data is used to send the swap args to the mdt
2114 * only flags is missing, so we use struct mdc_swap_layouts
2115 * through the md_op_data->op_data */
2116 /* flags from user space have to be converted before they are send to
2117 * server, no flag is sent today, they are only used on the client */
2120 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2121 0, LUSTRE_OPC_ANY, &msl);
2122 if (IS_ERR(op_data))
2123 GOTO(free, rc = PTR_ERR(op_data));
2125 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2126 sizeof(*op_data), op_data, NULL);
2127 ll_finish_md_op_data(op_data);
2134 ll_put_grouplock(llss->inode2, file2, gid);
2135 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode via the MDC.  Validates the masks
 * (out-of-range bits rejected; non-HSM_USER_MASK bits require
 * CFS_CAP_SYS_ADMIN) and the archive id before forwarding.
 */
2145 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2147 struct md_op_data *op_data;
2151 /* Detect out-of range masks */
2152 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2155 /* Non-root users are forbidden to set or clear flags which are
2156 * NOT defined in HSM_USER_MASK. */
2157 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2158 !cfs_capable(CFS_CAP_SYS_ADMIN))
2161 /* Detect out-of range archive id */
2162 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2163 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2166 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2167 LUSTRE_OPC_ANY, hss);
2168 if (IS_ERR(op_data))
2169 RETURN(PTR_ERR(op_data));
2171 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2172 sizeof(*op_data), op_data, NULL);
2174 ll_finish_md_op_data(op_data);
/*
 * Import an existing HSM-archived file: mark it ARCHIVED|EXISTS|RELEASED
 * with the given archive id, then restore the saved attributes (mode,
 * owner, size, [am]times) with ll_setattr_raw().  Regular files only.
 */
2179 static int ll_hsm_import(struct inode *inode, struct file *file,
2180 struct hsm_user_import *hui)
2182 struct hsm_state_set *hss = NULL;
2183 struct iattr *attr = NULL;
2187 if (!S_ISREG(inode->i_mode))
2193 GOTO(out, rc = -ENOMEM);
2195 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2196 hss->hss_archive_id = hui->hui_archive_id;
2197 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2198 rc = ll_hsm_state_set(inode, hss);
2202 OBD_ALLOC_PTR(attr);
2204 GOTO(out, rc = -ENOMEM);
/* force the imported mode to a regular-file mode */
2206 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2207 attr->ia_mode |= S_IFREG;
2208 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2209 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2210 attr->ia_size = hui->hui_size;
2211 attr->ia_mtime.tv_sec = hui->hui_mtime;
2212 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2213 attr->ia_atime.tv_sec = hui->hui_atime;
2214 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2216 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2217 ATTR_UID | ATTR_GID |
2218 ATTR_MTIME | ATTR_MTIME_SET |
2219 ATTR_ATIME | ATTR_ATIME_SET;
2223 rc = ll_setattr_raw(file_dentry(file), attr, true);
2227 inode_unlock(inode);
/* Map an fmode_t lease mode to the LL_LEASE_{RD,WR}LCK bit flags
 * returned to userspace by the lease ioctls. */
2239 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2241 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2242 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: set atime/mtime/ctime on a regular file from
 * the ll_futimes_3 payload.  Requires CAP_SYS_ADMIN since it also sets
 * ctime, which normal utimes cannot do.
 */
2245 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2247 struct inode *inode = file_inode(file);
2249 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2250 ATTR_MTIME | ATTR_MTIME_SET |
2251 ATTR_CTIME | ATTR_CTIME_SET,
2253 .tv_sec = lfu->lfu_atime_sec,
2254 .tv_nsec = lfu->lfu_atime_nsec,
2257 .tv_sec = lfu->lfu_mtime_sec,
2258 .tv_nsec = lfu->lfu_mtime_nsec,
2261 .tv_sec = lfu->lfu_ctime_sec,
2262 .tv_nsec = lfu->lfu_ctime_nsec,
2268 if (!capable(CAP_SYS_ADMIN))
2271 if (!S_ISREG(inode->i_mode))
2275 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2276 inode_unlock(inode);
2282 * Give file access advices
2284 * The ladvise interface is similar to Linux fadvise() system call, except it
2285 * forwards the advices directly from Lustre client to server. The server side
2286 * codes will apply appropriate read-ahead and caching techniques for the
2287 * corresponding files.
2289 * A typical workload for ladvise is e.g. a bunch of different clients are
2290 * doing small random reads of a file, so prefetching pages into OSS cache
2291 * with big linear reads before the random IO is a net benefit. Fetching
2292 * all that data into each client cache with fadvise() may not be, due to
2293 * much more data being sent to the client.
/*
 * Forward one lu_ladvise advice to the server by running a CIT_LADVISE
 * IO against the file's cl_object.  See the comment block above for the
 * rationale (server-side fadvise).
 */
2295 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2296 struct lu_ladvise *ladvise)
2300 struct cl_ladvise_io *lio;
2305 env = cl_env_get(&refcheck);
2307 RETURN(PTR_ERR(env));
2309 io = vvp_env_thread_io(env);
2310 io->ci_obj = ll_i2info(inode)->lli_clob;
2312 /* initialize parameters for ladvise */
2313 lio = &io->u.ci_ladvise;
2314 lio->li_start = ladvise->lla_start;
2315 lio->li_end = ladvise->lla_end;
2316 lio->li_fid = ll_inode2fid(inode);
2317 lio->li_advice = ladvise->lla_advice;
2318 lio->li_flags = flags;
2320 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2321 rc = cl_io_loop(env, io);
2325 cl_io_fini(env, io);
2326 cl_env_put(env, &refcheck);
/*
 * unlocked_ioctl entry point for regular files: dispatch every
 * llite-specific ioctl (striping, group locks, HSM, leases, fid2path,
 * data version, ladvise, ...).  Unknown commands fall through to the
 * dynamic ll_iocontrol_call() registry and finally to the OBD layer.
 */
2331 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2333 struct inode *inode = file_inode(file);
2334 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2338 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2339 PFID(ll_inode2fid(inode)), inode, cmd);
2340 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2342 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2343 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2347 case LL_IOC_GETFLAGS:
2348 /* Get the current value of the file flags */
2349 return put_user(fd->fd_flags, (int __user *)arg);
2350 case LL_IOC_SETFLAGS:
2351 case LL_IOC_CLRFLAGS:
2352 /* Set or clear specific file flags */
2353 /* XXX This probably needs checks to ensure the flags are
2354 * not abused, and to handle any flag side effects.
2356 if (get_user(flags, (int __user *) arg))
2359 if (cmd == LL_IOC_SETFLAGS) {
2360 if ((flags & LL_FILE_IGNORE_LOCK) &&
2361 !(file->f_flags & O_DIRECT)) {
2362 CERROR("%s: unable to disable locking on "
2363 "non-O_DIRECT file\n", current->comm);
2367 fd->fd_flags |= flags;
2369 fd->fd_flags &= ~flags;
2372 case LL_IOC_LOV_SETSTRIPE:
2373 RETURN(ll_lov_setstripe(inode, file, arg));
2374 case LL_IOC_LOV_SETEA:
2375 RETURN(ll_lov_setea(inode, file, arg));
2376 case LL_IOC_LOV_SWAP_LAYOUTS: {
2378 struct lustre_swap_layouts lsl;
2380 if (copy_from_user(&lsl, (char __user *)arg,
2381 sizeof(struct lustre_swap_layouts)))
/* both files must be writable for a swap */
2384 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2387 file2 = fget(lsl.sl_fd);
2391 /* O_WRONLY or O_RDWR */
2392 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2393 GOTO(out, rc = -EPERM);
2395 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2396 struct inode *inode2;
2397 struct ll_inode_info *lli;
2398 struct obd_client_handle *och = NULL;
2400 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2401 GOTO(out, rc = -EINVAL);
/* swap-and-close requires an existing lease on this fd */
2403 lli = ll_i2info(inode);
2404 mutex_lock(&lli->lli_och_mutex);
2405 if (fd->fd_lease_och != NULL) {
2406 och = fd->fd_lease_och;
2407 fd->fd_lease_och = NULL;
2409 mutex_unlock(&lli->lli_och_mutex);
2411 GOTO(out, rc = -ENOLCK);
2412 inode2 = file_inode(file2);
2413 rc = ll_swap_layouts_close(och, inode, inode2);
2415 rc = ll_swap_layouts(file, file2, &lsl);
2421 case LL_IOC_LOV_GETSTRIPE:
2422 RETURN(ll_file_getstripe(inode,
2423 (struct lov_user_md __user *)arg));
2424 case FSFILT_IOC_GETFLAGS:
2425 case FSFILT_IOC_SETFLAGS:
2426 RETURN(ll_iocontrol(inode, file, cmd, arg));
2427 case FSFILT_IOC_GETVERSION_OLD:
2428 case FSFILT_IOC_GETVERSION:
2429 RETURN(put_user(inode->i_generation, (int __user *)arg));
2430 case LL_IOC_GROUP_LOCK:
2431 RETURN(ll_get_grouplock(inode, file, arg));
2432 case LL_IOC_GROUP_UNLOCK:
2433 RETURN(ll_put_grouplock(inode, file, arg));
2434 case IOC_OBD_STATFS:
2435 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2437 /* We need to special case any other ioctls we want to handle,
2438 * to send them to the MDS/OST as appropriate and to properly
2439 * network encode the arg field.
2440 case FSFILT_IOC_SETVERSION_OLD:
2441 case FSFILT_IOC_SETVERSION:
2443 case LL_IOC_FLUSHCTX:
2444 RETURN(ll_flush_ctx(inode));
2445 case LL_IOC_PATH2FID: {
2446 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2447 sizeof(struct lu_fid)))
2452 case LL_IOC_GETPARENT:
2453 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2455 case OBD_IOC_FID2PATH:
2456 RETURN(ll_fid2path(inode, (void __user *)arg));
2457 case LL_IOC_DATA_VERSION: {
2458 struct ioc_data_version idv;
2461 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2464 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2465 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2468 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2474 case LL_IOC_GET_MDTIDX: {
2477 mdtidx = ll_get_mdt_idx(inode);
2481 if (put_user((int)mdtidx, (int __user *)arg))
2486 case OBD_IOC_GETDTNAME:
2487 case OBD_IOC_GETMDNAME:
2488 RETURN(ll_get_obd_name(inode, cmd, arg));
2489 case LL_IOC_HSM_STATE_GET: {
2490 struct md_op_data *op_data;
2491 struct hsm_user_state *hus;
2498 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2499 LUSTRE_OPC_ANY, hus);
2500 if (IS_ERR(op_data)) {
2502 RETURN(PTR_ERR(op_data));
2505 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2508 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2511 ll_finish_md_op_data(op_data);
2515 case LL_IOC_HSM_STATE_SET: {
2516 struct hsm_state_set *hss;
2523 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2528 rc = ll_hsm_state_set(inode, hss);
2533 case LL_IOC_HSM_ACTION: {
2534 struct md_op_data *op_data;
2535 struct hsm_current_action *hca;
2542 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2543 LUSTRE_OPC_ANY, hca);
2544 if (IS_ERR(op_data)) {
2546 RETURN(PTR_ERR(op_data));
2549 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2552 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2555 ll_finish_md_op_data(op_data);
2559 case LL_IOC_SET_LEASE: {
2560 struct ll_inode_info *lli = ll_i2info(inode);
2561 struct obd_client_handle *och = NULL;
/* requested lease mode must match how the file was opened */
2566 case LL_LEASE_WRLCK:
2567 if (!(file->f_mode & FMODE_WRITE))
2569 fmode = FMODE_WRITE;
2571 case LL_LEASE_RDLCK:
2572 if (!(file->f_mode & FMODE_READ))
2576 case LL_LEASE_UNLCK:
2577 mutex_lock(&lli->lli_och_mutex);
2578 if (fd->fd_lease_och != NULL) {
2579 och = fd->fd_lease_och;
2580 fd->fd_lease_och = NULL;
2582 mutex_unlock(&lli->lli_och_mutex);
2587 fmode = och->och_flags;
2588 rc = ll_lease_close(och, inode, &lease_broken);
2592 rc = ll_lease_och_release(inode, file);
2599 RETURN(ll_lease_type_from_fmode(fmode));
2604 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2606 /* apply for lease */
2607 och = ll_lease_open(inode, file, fmode, 0);
2609 RETURN(PTR_ERR(och));
2612 mutex_lock(&lli->lli_och_mutex);
2613 if (fd->fd_lease_och == NULL) {
2614 fd->fd_lease_och = och;
2617 mutex_unlock(&lli->lli_och_mutex);
2619 /* impossible now that only excl is supported for now */
2620 ll_lease_close(och, inode, &lease_broken);
2625 case LL_IOC_GET_LEASE: {
2626 struct ll_inode_info *lli = ll_i2info(inode);
2627 struct ldlm_lock *lock = NULL;
2630 mutex_lock(&lli->lli_och_mutex);
2631 if (fd->fd_lease_och != NULL) {
2632 struct obd_client_handle *och = fd->fd_lease_och;
/* report the lease mode only while the lock is not cancelled */
2634 lock = ldlm_handle2lock(&och->och_lease_handle);
2636 lock_res_and_lock(lock);
2637 if (!ldlm_is_cancel(lock))
2638 fmode = och->och_flags;
2640 unlock_res_and_lock(lock);
2641 LDLM_LOCK_PUT(lock);
2644 mutex_unlock(&lli->lli_och_mutex);
2646 RETURN(ll_lease_type_from_fmode(fmode));
2648 case LL_IOC_HSM_IMPORT: {
2649 struct hsm_user_import *hui;
2655 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2660 rc = ll_hsm_import(inode, file, hui);
2665 case LL_IOC_FUTIMES_3: {
2666 struct ll_futimes_3 lfu;
2668 if (copy_from_user(&lfu,
2669 (const struct ll_futimes_3 __user *)arg,
2673 RETURN(ll_file_futimes_3(file, &lfu));
2675 case LL_IOC_LADVISE: {
2676 struct ladvise_hdr *ladvise_hdr;
2679 int alloc_size = sizeof(*ladvise_hdr);
/* read the fixed header first to learn the advice count */
2682 OBD_ALLOC_PTR(ladvise_hdr);
2683 if (ladvise_hdr == NULL)
2686 if (copy_from_user(ladvise_hdr,
2687 (const struct ladvise_hdr __user *)arg,
2689 GOTO(out_ladvise, rc = -EFAULT);
2691 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2692 ladvise_hdr->lah_count < 1)
2693 GOTO(out_ladvise, rc = -EINVAL);
2695 num_advise = ladvise_hdr->lah_count;
2696 if (num_advise >= LAH_COUNT_MAX)
2697 GOTO(out_ladvise, rc = -EFBIG);
/* reallocate with room for the advice array and copy again */
2699 OBD_FREE_PTR(ladvise_hdr);
2700 alloc_size = offsetof(typeof(*ladvise_hdr),
2701 lah_advise[num_advise]);
2702 OBD_ALLOC(ladvise_hdr, alloc_size);
2703 if (ladvise_hdr == NULL)
2707 * TODO: submit multiple advices to one server in a single RPC
2709 if (copy_from_user(ladvise_hdr,
2710 (const struct ladvise_hdr __user *)arg,
2712 GOTO(out_ladvise, rc = -EFAULT);
2714 for (i = 0; i < num_advise; i++) {
2715 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2716 &ladvise_hdr->lah_advise[i]);
2722 OBD_FREE(ladvise_hdr, alloc_size);
/* not handled above: try the dynamic ioctl registry, then the OBD */
2729 ll_iocontrol_call(inode, file, cmd, arg, &err))
2732 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2733 (void __user *)arg));
2738 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate and commit a new file offset: reject negative offsets unless
 * FMODE_UNSIGNED_OFFSET is set, reject offsets beyond @maxsize, and only
 * write f_pos (clearing f_version) when the position actually changes.
 * Compatibility helper compiled only when the kernel lacks
 * generic_file_llseek_size().
 */
2739 static inline loff_t
2740 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2742 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2744 if (offset > maxsize)
/* avoid a redundant f_pos store that could race with concurrent seekers */
2747 if (offset != file->f_pos) {
2748 file->f_pos = offset;
2749 file->f_version = 0;
/*
 * Local fallback implementation of generic_file_llseek_size() for older
 * kernels.  Handles SEEK_CUR position queries without rewriting f_pos,
 * and treats the whole file as data (virtual hole at EOF) for
 * SEEK_DATA/SEEK_HOLE-style origins.
 *
 * \param maxsize  largest valid offset for this file
 * \param eof      current end-of-file to compare against
 */
2755 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2756 loff_t maxsize, loff_t eof)
2758 struct inode *inode = file_inode(file);
2766 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2767 * position-querying operation. Avoid rewriting the "same"
2768 * f_pos value back to the file because a concurrent read(),
2769 * write() or lseek() might have altered it
2774 * f_lock protects against read/modify/write race with other
2775 * SEEK_CURs. Note that parallel writes and reads behave
2779 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2780 inode_unlock(inode);
2784 * In the generic case the entire file is data, so as long as
2785 * offset isn't at the end of the file then the offset is data.
2792 * There is a virtual hole at the end of the file, so as long as
2793 * offset isn't i_size or larger, return i_size.
2801 return llseek_execute(file, offset, maxsize);
/*
 * ->llseek handler.  For SEEK_END/SEEK_HOLE/SEEK_DATA the current file
 * size must be known, so glimpse the size from the OSTs first, then
 * delegate offset validation/commit to ll_generic_file_llseek_size().
 */
2805 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2807 struct inode *inode = file_inode(file);
2808 loff_t retval, eof = 0;
/* compute the absolute target only for the debug message below */
2811 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2812 (origin == SEEK_CUR) ? file->f_pos : 0);
2813 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2814 PFID(ll_inode2fid(inode)), inode, retval, retval,
2816 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2818 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
/* refresh i_size from the servers before using it as EOF */
2819 retval = ll_glimpse_size(inode);
2822 eof = i_size_read(inode);
2825 retval = ll_generic_file_llseek_size(file, offset, origin,
2826 ll_file_maxbytes(inode), eof);
/*
 * ->flush handler, called on close(2).  Does not force writeback; it only
 * reports (and clears) async write errors recorded earlier, so the
 * application learns about failed background writes at close time.
 * Returns -EIO if an unreported async error is pending, 0 otherwise.
 */
2830 static int ll_flush(struct file *file, fl_owner_t id)
2832 struct inode *inode = file_inode(file);
2833 struct ll_inode_info *lli = ll_i2info(inode);
2834 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2837 LASSERT(!S_ISDIR(inode->i_mode));
2839 /* catch async errors that were recorded back when async writeback
2840 * failed for pages in this mapping. */
2841 rc = lli->lli_async_rc;
/* clear so the same error is not reported twice */
2842 lli->lli_async_rc = 0;
2843 if (lli->lli_clob != NULL) {
2844 err = lov_read_and_clear_async_rc(lli->lli_clob);
2849 /* The application has been told write failure already.
2850 * Do not report failure again. */
2851 if (fd->fd_write_failed)
2853 return rc ? -EIO : 0;
2857 * Called to make sure a portion of the file has been written out.
2858 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OSTs.
2860 * Return how many pages have been written.
/*
 * Build and run a CIT_FSYNC cl_io over [start, end] on the inode's
 * cl_object.  On success returns the number of pages written
 * (fio->fi_nr_written); on failure returns the io result.
 *
 * \param mode           one of CL_FSYNC_{NONE,LOCAL,DISCARD,ALL}
 * \param ignore_layout  skip layout validity checks when non-zero
 */
2862 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2863 enum cl_fsync_mode mode, int ignore_layout)
2867 struct cl_fsync_io *fio;
/* reject unknown sync modes before touching the cl environment */
2872 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2873 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2876 env = cl_env_get(&refcheck);
2878 RETURN(PTR_ERR(env));
2880 io = vvp_env_thread_io(env);
2881 io->ci_obj = ll_i2info(inode)->lli_clob;
2882 io->ci_ignore_layout = ignore_layout;
2884 /* initialize parameters for sync */
2885 fio = &io->u.ci_fsync;
2886 fio->fi_start = start;
2888 fio->fi_fid = ll_inode2fid(inode);
2889 fio->fi_mode = mode;
2890 fio->fi_nr_written = 0;
2892 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2893 result = cl_io_loop(env, io);
2895 result = io->ci_result;
2897 result = fio->fi_nr_written;
2898 cl_io_fini(env, io);
2899 cl_env_put(env, &refcheck);
2905 * When dentry is provided (the 'else' case), file_dentry() may be
2906 * null and dentry must be used directly rather than pulled from
2907 * file_dentry() as is done otherwise.
/*
 * ->fsync handler.  Three kernel-API variants are selected by configure:
 * 4-arg (start/end range), 2-arg, and the old 3-arg (dentry) form; the
 * older forms sync the whole file (end = LLONG_MAX).  Waits for dirty
 * page IO, reports/clears recorded async errors, syncs the MDT state via
 * md_fsync(), then flushes OST data with cl_sync_file_range(CL_FSYNC_ALL)
 * and updates fd_write_failed accordingly.
 */
2910 #ifdef HAVE_FILE_FSYNC_4ARGS
2911 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2913 struct dentry *dentry = file_dentry(file);
2914 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2915 int ll_fsync(struct file *file, int datasync)
2917 struct dentry *dentry = file_dentry(file);
2919 loff_t end = LLONG_MAX;
2921 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2924 loff_t end = LLONG_MAX;
2926 struct inode *inode = dentry->d_inode;
2927 struct ll_inode_info *lli = ll_i2info(inode);
2928 struct ptlrpc_request *req;
2932 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2933 PFID(ll_inode2fid(inode)), inode);
2934 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2936 #ifdef HAVE_FILE_FSYNC_4ARGS
2937 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2940 /* fsync's caller has already called _fdata{sync,write}, we want
2941 * that IO to finish before calling the osc and mdc sync methods */
2942 rc = filemap_fdatawait(inode->i_mapping);
2945 /* catch async errors that were recorded back when async writeback
2946 * failed for pages in this mapping. */
2947 if (!S_ISDIR(inode->i_mode)) {
2948 err = lli->lli_async_rc;
2949 lli->lli_async_rc = 0;
2952 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDT */
2957 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2961 ptlrpc_req_finished(req);
2963 if (S_ISREG(inode->i_mode)) {
2964 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2966 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2967 if (rc == 0 && err < 0)
/* remember outcome so ll_flush() does not double-report */
2970 fd->fd_write_failed = true;
2972 fd->fd_write_failed = false;
2975 #ifdef HAVE_FILE_FSYNC_4ARGS
2976 inode_unlock(inode);
/*
 * ->lock / ->flock handler: translate a VFS file_lock (POSIX fcntl lock
 * or BSD flock) into an LDLM_FLOCK enqueue on the MDT, then mirror the
 * result into the local kernel lock tables.  F_UNLCK is sent as an
 * LCK_NL enqueue rather than a separate cancel message (see comment
 * below).  If the local bookkeeping fails after a successful server
 * enqueue, the server lock is rolled back with a compensating LCK_NL
 * enqueue.
 */
2982 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2984 struct inode *inode = file_inode(file);
2985 struct ll_sb_info *sbi = ll_i2sbi(inode);
2986 struct ldlm_enqueue_info einfo = {
2987 .ei_type = LDLM_FLOCK,
2988 .ei_cb_cp = ldlm_flock_completion_ast,
2989 .ei_cbdata = file_lock,
2991 struct md_op_data *op_data;
2992 struct lustre_handle lockh = { 0 };
2993 union ldlm_policy_data flock = { { 0 } };
2994 int fl_type = file_lock->fl_type;
3000 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3001 PFID(ll_inode2fid(inode)), file_lock);
3003 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3005 if (file_lock->fl_flags & FL_FLOCK) {
3006 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3007 /* flocks are whole-file locks */
3008 flock.l_flock.end = OFFSET_MAX;
3009 /* For flocks owner is determined by the local file descriptor */
3010 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3011 } else if (file_lock->fl_flags & FL_POSIX) {
3012 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3013 flock.l_flock.start = file_lock->fl_start;
3014 flock.l_flock.end = file_lock->fl_end;
3018 flock.l_flock.pid = file_lock->fl_pid;
3020 /* Somewhat ugly workaround for svc lockd.
3021 * lockd installs custom fl_lmops->lm_compare_owner that checks
3022 * for the fl_owner to be the same (which it always is on local node
3023 * I guess between lockd processes) and then compares pid.
3024 * As such we assign pid to the owner field to make it all work,
3025 * conflict with normal locks is unlikely since pid space and
3026 * pointer space for current->files are not intersecting */
3027 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3028 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock type to an LDLM mode */
3032 einfo.ei_mode = LCK_PR;
3035 /* An unlock request may or may not have any relation to
3036 * existing locks so we may not be able to pass a lock handle
3037 * via a normal ldlm_lock_cancel() request. The request may even
3038 * unlock a byte range in the middle of an existing lock. In
3039 * order to process an unlock request we need all of the same
3040 * information that is given with a normal read or write record
3041 * lock request. To avoid creating another ldlm unlock (cancel)
3042 * message we'll treat a LCK_NL flock request as an unlock. */
3043 einfo.ei_mode = LCK_NL;
3046 einfo.ei_mode = LCK_PW;
3049 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* map fcntl command to enqueue flags */
3064 flags = LDLM_FL_BLOCK_NOWAIT;
3070 flags = LDLM_FL_TEST_LOCK;
3073 CERROR("unknown fcntl lock command: %d\n", cmd);
3077 /* Save the old mode so that if the mode in the lock changes we
3078 * can decrement the appropriate reader or writer refcount. */
3079 file_lock->fl_type = einfo.ei_mode;
3081 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3082 LUSTRE_OPC_ANY, NULL);
3083 if (IS_ERR(op_data))
3084 RETURN(PTR_ERR(op_data));
3086 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3087 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3088 flock.l_flock.pid, flags, einfo.ei_mode,
3089 flock.l_flock.start, flock.l_flock.end);
3091 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3094 /* Restore the file lock type if not TEST lock. */
3095 if (!(flags & LDLM_FL_TEST_LOCK))
3096 file_lock->fl_type = fl_type;
/* mirror the server's decision into the kernel's local lock tables */
3098 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3099 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3100 !(flags & LDLM_FL_TEST_LOCK))
3101 rc2 = locks_lock_file_wait(file, file_lock);
3103 if ((file_lock->fl_flags & FL_FLOCK) &&
3104 (rc == 0 || file_lock->fl_type == F_UNLCK))
3105 rc2 = flock_lock_file_wait(file, file_lock);
3106 if ((file_lock->fl_flags & FL_POSIX) &&
3107 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3108 !(flags & LDLM_FL_TEST_LOCK))
3109 rc2 = posix_lock_file_wait(file, file_lock);
3110 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local bookkeeping failed: undo the server-side lock via LCK_NL */
3112 if (rc2 && file_lock->fl_type != F_UNLCK) {
3113 einfo.ei_mode = LCK_NL;
3114 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3119 ll_finish_md_op_data(op_data);
/*
 * Look up a child's FID (and optionally instantiate its inode) by name
 * under @parent, via an MDS getattr-by-name RPC.
 *
 * \param fid    [OUT] FID of the named entry
 * \param inode  [OUT] if non-NULL, a new/existing inode for the entry
 */
3124 int ll_get_fid_by_name(struct inode *parent, const char *name,
3125 int namelen, struct lu_fid *fid,
3126 struct inode **inode)
3128 struct md_op_data *op_data = NULL;
3129 struct mdt_body *body;
3130 struct ptlrpc_request *req;
3134 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3135 LUSTRE_OPC_ANY, NULL);
3136 if (IS_ERR(op_data))
3137 RETURN(PTR_ERR(op_data));
/* only FID and type are needed from the server */
3139 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3140 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3141 ll_finish_md_op_data(op_data);
3145 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3147 GOTO(out_req, rc = -EFAULT);
3149 *fid = body->mbo_fid1;
3152 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3154 ptlrpc_req_finished(req);
/*
 * Migrate the entry @name under @parent to MDT @mdtidx (lfs migrate of
 * metadata).  Resolves the child inode (dcache first, then by-name RPC),
 * sanity-checks it, and for regular files takes a write lease and the
 * current data version before issuing the MDS_RENAME_MIGRATE rename.
 * Migrating the filesystem root is refused.
 */
3158 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3159 const char *name, int namelen)
3161 struct dentry *dchild = NULL;
3162 struct inode *child_inode = NULL;
3163 struct md_op_data *op_data;
3164 struct ptlrpc_request *request = NULL;
3165 struct obd_client_handle *och = NULL;
3167 struct mdt_body *body;
3169 __u64 data_version = 0;
3172 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3173 name, PFID(ll_inode2fid(parent)), mdtidx);
3175 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3176 0, LUSTRE_OPC_ANY, NULL);
3177 if (IS_ERR(op_data))
3178 RETURN(PTR_ERR(op_data));
3180 /* Get child FID first */
3181 qstr.hash = full_name_hash(name, namelen);
/* try the dcache before falling back to a getattr-by-name RPC */
3184 dchild = d_lookup(file_dentry(file), &qstr);
3185 if (dchild != NULL) {
3186 if (dchild->d_inode != NULL)
3187 child_inode = igrab(dchild->d_inode);
3191 if (child_inode == NULL) {
3192 rc = ll_get_fid_by_name(parent, name, namelen,
3193 &op_data->op_fid3, &child_inode);
3198 if (child_inode == NULL)
3199 GOTO(out_free, rc = -EINVAL);
3202 * lfs migrate command needs to be blocked on the client
3203 * by checking the migrate FID against the FID of the
3206 if (child_inode == parent->i_sb->s_root->d_inode)
3207 GOTO(out_iput, rc = -EINVAL);
3209 inode_lock(child_inode);
3210 op_data->op_fid3 = *ll_inode2fid(child_inode);
3211 if (!fid_is_sane(&op_data->op_fid3)) {
3212 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3213 ll_get_fsname(parent->i_sb, NULL, 0), name,
3214 PFID(&op_data->op_fid3));
3215 GOTO(out_unlock, rc = -EINVAL);
3218 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3220 GOTO(out_unlock, rc);
/* nothing to do if the file already lives on the target MDT */
3223 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3224 PFID(&op_data->op_fid3), mdtidx);
3225 GOTO(out_unlock, rc = 0);
3228 if (S_ISREG(child_inode->i_mode)) {
/* take a write lease so data version stays stable during migrate */
3229 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3233 GOTO(out_unlock, rc);
3236 rc = ll_data_version(child_inode, &data_version,
3239 GOTO(out_close, rc);
3241 op_data->op_handle = och->och_fh;
3242 op_data->op_data = och->och_mod;
3243 op_data->op_data_version = data_version;
3244 op_data->op_lease_handle = och->och_lease_handle;
3245 op_data->op_bias |= MDS_RENAME_MIGRATE;
3248 op_data->op_mds = mdtidx;
3249 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is implemented as a same-name rename to another MDT */
3250 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3251 namelen, name, namelen, &request);
3253 ll_update_times(request, parent);
3255 if (request != NULL) {
3256 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3258 ptlrpc_req_finished(request);
3259 GOTO(out_close, rc = -EPROTO);
3262 /* If the server does release layout lock, then we cleanup
3263 * the client och here, otherwise release it in out_close: */
3265 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3266 obd_mod_put(och->och_mod);
3267 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3269 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3273 ptlrpc_req_finished(request);
3276 /* Try again if the file layout has changed. */
3277 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) {
3282 if (och != NULL) /* close the file */
3283 ll_lease_close(och, child_inode, NULL);
3285 clear_nlink(child_inode);
3287 inode_unlock(child_inode);
3291 ll_finish_md_op_data(op_data);
3296 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3304 * test if some locks matching bits and l_req_mode are acquired
3305 * - bits can be in different locks
3306 * - if found clear the common lock bits in *bits
3307 * - the bits not found, are kept in *bits
3309 * \param bits [IN] searched lock bits [IN]
3310 * \param l_req_mode [IN] searched lock mode
3311 * \retval boolean, true iff all bits are found
/*
 * Test whether MD (inodebits) locks covering *bits are already held on
 * @inode, possibly spread over several locks.  Matched bits are cleared
 * from *bits; unmatched bits remain set.  LCK_MINMODE means "any mode".
 * Returns non-zero iff all requested bits were found.
 */
3313 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3315 struct lustre_handle lockh;
3316 union ldlm_policy_data policy;
3317 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3318 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3327 fid = &ll_i2info(inode)->lli_fid;
3328 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3329 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a new reference on the lock */
3331 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3332 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3333 policy.l_inodebits.bits = *bits & (1 << i);
3334 if (policy.l_inodebits.bits == 0)
3337 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3338 &policy, mode, &lockh)) {
3339 struct ldlm_lock *lock;
3341 lock = ldlm_handle2lock(&lockh);
3344 ~(lock->l_policy_data.l_inodebits.bits);
3345 LDLM_LOCK_PUT(lock);
3347 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and take a reference on) an MD inodebits lock with the given
 * @bits and @mode on @inode.  On success the matched lock handle is
 * stored in @lockh; the return value is the matched mode (0 if none).
 */
3354 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3355 struct lustre_handle *lockh, __u64 flags,
3356 enum ldlm_mode mode)
3358 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3363 fid = &ll_i2info(inode)->lli_fid;
3364 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3366 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3367 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC.  -ENOENT on an
 * already-unlinked inode is normally not an error (except for striped
 * directories with a bad stripe, which must be revalidated again);
 * other errors are logged, with EACCES/EIDRM demoted to D_INFO.
 */
3372 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3374 /* Already unlinked. Just update nlink and return success */
3375 if (rc == -ENOENT) {
3377 /* If it is striped directory, and there is bad stripe
3378 * Let's revalidate the dentry again, instead of returning
3380 if (S_ISDIR(inode->i_mode) &&
3381 ll_i2info(inode)->lli_lsm_md != NULL)
3384 /* This path cannot be hit for regular files unless in
3385 * case of obscure races, so no need to validate
3387 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3389 } else if (rc != 0) {
3390 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3391 "%s: revalidate FID "DFID" error: rc = %d\n",
3392 ll_get_fsname(inode->i_sb, NULL, 0),
3393 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate a dentry's inode attributes against the MDS for the lock
 * bits in @ibits.  Two paths:
 *  - OBD_CONNECT_ATTRFID servers: an intent getattr/lookup by FID, which
 *    also refreshes dcache state (invalidating unlinked dentries);
 *  - otherwise: a plain md_getattr(), but only if no matching MD lock is
 *    already held locally.
 */
3399 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3401 struct inode *inode = dentry->d_inode;
3402 struct ptlrpc_request *req = NULL;
3403 struct obd_export *exp;
3407 LASSERT(inode != NULL);
3409 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3410 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3412 exp = ll_i2mdexp(inode);
3414 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3415 * But under CMD case, it caused some lock issues, should be fixed
3416 * with new CMD ibits lock. See bug 12718 */
3417 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3418 struct lookup_intent oit = { .it_op = IT_GETATTR };
3419 struct md_op_data *op_data;
3421 if (ibits == MDS_INODELOCK_LOOKUP)
3422 oit.it_op = IT_LOOKUP;
3424 /* Call getattr by fid, so do not provide name at all. */
3425 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3426 dentry->d_inode, NULL, 0, 0,
3427 LUSTRE_OPC_ANY, NULL);
3428 if (IS_ERR(op_data))
3429 RETURN(PTR_ERR(op_data));
3431 rc = md_intent_lock(exp, op_data, &oit, &req,
3432 &ll_md_blocking_ast, 0);
3433 ll_finish_md_op_data(op_data);
3435 rc = ll_inode_revalidate_fini(inode, rc);
3439 rc = ll_revalidate_it_finish(req, &oit, dentry);
3441 ll_intent_release(&oit);
3445 /* Unlinked? Unhash dentry, so it is not picked up later by
3446 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3447 here to preserve get_cwd functionality on 2.6.
3449 if (!dentry->d_inode->i_nlink) {
3450 ll_lock_dcache(inode);
3451 d_lustre_invalidate(dentry, 0);
3452 ll_unlock_dcache(inode);
3455 ll_lookup_finish_locks(&oit, dentry);
3456 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3457 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3458 u64 valid = OBD_MD_FLGETATTR;
3459 struct md_op_data *op_data;
/* regular files also need striping (EA) information */
3462 if (S_ISREG(inode->i_mode)) {
3463 rc = ll_get_default_mdsize(sbi, &ealen);
3466 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3469 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3470 0, ealen, LUSTRE_OPC_ANY,
3472 if (IS_ERR(op_data))
3473 RETURN(PTR_ERR(op_data));
3475 op_data->op_valid = valid;
3476 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3477 ll_finish_md_op_data(op_data);
3479 rc = ll_inode_revalidate_fini(inode, rc);
3483 rc = ll_prep_inode(&inode, req, NULL, NULL);
3486 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes (via
 * md_merge_attr) into the master inode: nlink, blocks, size and the
 * cached a/m/ctime in ll_inode_info.
 */
3490 static int ll_merge_md_attr(struct inode *inode)
3492 struct cl_attr attr = { 0 };
3495 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3496 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3497 &attr, ll_md_blocking_ast);
3501 set_nlink(inode, attr.cat_nlink);
3502 inode->i_blocks = attr.cat_blocks;
3503 i_size_write(inode, attr.cat_size);
3505 ll_i2info(inode)->lli_atime = attr.cat_atime;
3506 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3507 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MD attributes via __ll_inode_revalidate(),
 * merge stripe attributes for striped directories, copy the cached
 * timestamps into the inode, and for regular files glimpse the size
 * from the OSTs (unless an HSM restore is in progress, in which case
 * the MDT-provided size is already authoritative).
 */
3513 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3515 struct inode *inode = dentry->d_inode;
3519 rc = __ll_inode_revalidate(dentry, ibits);
3523 /* if object isn't regular file, don't validate size */
3524 if (!S_ISREG(inode->i_mode)) {
3525 if (S_ISDIR(inode->i_mode) &&
3526 ll_i2info(inode)->lli_lsm_md != NULL) {
3527 rc = ll_merge_md_attr(inode);
3532 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3533 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3534 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3536 /* In case of restore, the MDT has the right size and has
3537 * already send it back without granting the layout lock,
3538 * inode is up-to-date so glimpse is useless.
3539 * Also to glimpse we need the layout, in case of a running
3540 * restore the MDT holds the layout lock so the glimpse will
3541 * block up to the end of restore (getattr will block)
3543 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3544 rc = ll_glimpse_size(inode);
/*
 * ->getattr handler: revalidate UPDATE|LOOKUP bits, then fill *stat
 * from the (now fresh) inode.  With a 32-bit client API the inode
 * number is rebuilt from the FID.
 */
3549 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3551 struct inode *inode = de->d_inode;
3552 struct ll_sb_info *sbi = ll_i2sbi(inode);
3553 struct ll_inode_info *lli = ll_i2info(inode);
3556 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3557 MDS_INODELOCK_LOOKUP);
3558 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* fault-injection point for getattr delay testing */
3563 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3565 stat->dev = inode->i_sb->s_dev;
3566 if (ll_need_32bit_api(sbi))
3567 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3569 stat->ino = inode->i_ino;
3570 stat->mode = inode->i_mode;
3571 stat->uid = inode->i_uid;
3572 stat->gid = inode->i_gid;
3573 stat->rdev = inode->i_rdev;
3574 stat->atime = inode->i_atime;
3575 stat->mtime = inode->i_mtime;
3576 stat->ctime = inode->i_ctime;
3577 stat->blksize = 1 << inode->i_blkbits;
3579 stat->nlink = inode->i_nlink;
3580 stat->size = i_size_read(inode);
3581 stat->blocks = inode->i_blocks;
/*
 * ->fiemap handler: marshal the kernel's fiemap_extent_info into a
 * struct fiemap buffer, run ll_do_fiemap(), and copy flags and mapped
 * extents back to user space.  Only the first user extent is copied in
 * (it may carry input state such as FIEMAP_EXTENT_LAST continuation).
 */
3586 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3587 __u64 start, __u64 len)
3591 struct fiemap *fiemap;
3592 unsigned int extent_count = fieinfo->fi_extents_max;
3594 num_bytes = sizeof(*fiemap) + (extent_count *
3595 sizeof(struct fiemap_extent));
3596 OBD_ALLOC_LARGE(fiemap, num_bytes);
3601 fiemap->fm_flags = fieinfo->fi_flags;
3602 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3603 fiemap->fm_start = start;
3604 fiemap->fm_length = len;
3605 if (extent_count > 0 &&
3606 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3607 sizeof(struct fiemap_extent)) != 0)
3608 GOTO(out, rc = -EFAULT);
3610 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3612 fieinfo->fi_flags = fiemap->fm_flags;
3613 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3614 if (extent_count > 0 &&
3615 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3616 fiemap->fm_mapped_extents *
3617 sizeof(struct fiemap_extent)) != 0)
3618 GOTO(out, rc = -EFAULT);
3620 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl handler: return a referenced copy of the cached POSIX ACL
 * under lli_lock.  The caller (VFS permission checking) releases the
 * reference.
 */
3624 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3626 struct ll_inode_info *lli = ll_i2info(inode);
3627 struct posix_acl *acl = NULL;
3630 spin_lock(&lli->lli_lock);
3631 /* VFS' acl_permission_check->check_acl will release the refcount */
3632 acl = posix_acl_dup(lli->lli_posix_acl);
3633 spin_unlock(&lli->lli_lock);
/*
 * ACL callback for kernels whose generic_permission() takes an acl
 * check function (pre 2-arg API).  Under RCU walk (IPERM_FLAG_RCU) we
 * cannot block, so bail out; otherwise fetch the cached ACL and run the
 * standard posix_acl_permission() check.  Without CONFIG_FS_POSIX_ACL
 * the function degenerates to a stub.
 */
3638 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3640 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3641 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3643 ll_check_acl(struct inode *inode, int mask)
3646 # ifdef CONFIG_FS_POSIX_ACL
3647 struct posix_acl *acl;
3651 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3652 if (flags & IPERM_FLAG_RCU)
3655 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3660 rc = posix_acl_permission(inode, acl, mask);
3661 posix_acl_release(acl);
3664 # else /* !CONFIG_FS_POSIX_ACL */
3666 # endif /* CONFIG_FS_POSIX_ACL */
3668 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission handler (three kernel-API variants).  Revalidates the
 * root inode on first access, then applies root squashing: if the
 * caller is root and the fs is configured with rsi_uid != 0 (and
 * LL_SBI_NOROOTSQUASH is not set), temporarily overrides credentials
 * with the squash uid/gid and drops filesystem capabilities before
 * delegating to generic permission checking.
 */
3670 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3671 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3673 # ifdef HAVE_INODE_PERMISION_2ARGS
3674 int ll_inode_permission(struct inode *inode, int mask)
3676 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3681 struct ll_sb_info *sbi;
3682 struct root_squash_info *squash;
3683 struct cred *cred = NULL;
3684 const struct cred *old_cred = NULL;
3686 bool squash_id = false;
/* cannot block under RCU-walk; ask VFS to retry in ref-walk mode */
3689 #ifdef MAY_NOT_BLOCK
3690 if (mask & MAY_NOT_BLOCK)
3692 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3693 if (flags & IPERM_FLAG_RCU)
3697 /* as root inode are NOT getting validated in lookup operation,
3698 * need to do it before permission check. */
3700 if (inode == inode->i_sb->s_root->d_inode) {
3701 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3702 MDS_INODELOCK_LOOKUP);
3707 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3708 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3710 /* squash fsuid/fsgid if needed */
3711 sbi = ll_i2sbi(inode);
3712 squash = &sbi->ll_squash;
3713 if (unlikely(squash->rsi_uid != 0 &&
3714 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3715 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3719 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3720 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3721 squash->rsi_uid, squash->rsi_gid);
3723 /* update current process's credentials
3724 * and FS capability */
3725 cred = prepare_creds();
3729 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3730 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop all filesystem-related capabilities from the squashed creds */
3731 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3732 if ((1 << cap) & CFS_CAP_FS_MASK)
3733 cap_lower(cred->cap_effective, cap);
3735 old_cred = override_creds(cred);
3738 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3739 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3740 /* restore current process's credentials and FS capability */
3742 revert_creds(old_cred);
3749 /* -o localflock - only provides locally consistent flock locks */
/* Default file operations ("-o localflock"): no ->flock/->lock entries,
 * so flock locks are only locally consistent. */
3750 struct file_operations ll_file_operations = {
3751 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3752 # ifdef HAVE_SYNC_READ_WRITE
3753 .read = new_sync_read,
3754 .write = new_sync_write,
3756 .read_iter = ll_file_read_iter,
3757 .write_iter = ll_file_write_iter,
3758 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3759 .read = ll_file_read,
3760 .aio_read = ll_file_aio_read,
3761 .write = ll_file_write,
3762 .aio_write = ll_file_aio_write,
3763 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3764 .unlocked_ioctl = ll_file_ioctl,
3765 .open = ll_file_open,
3766 .release = ll_file_release,
3767 .mmap = ll_file_mmap,
3768 .llseek = ll_file_seek,
3769 .splice_read = ll_file_splice_read,
/* File operations with cluster-wide locking ("-o flock"): identical to
 * the default table but routes ->flock and ->lock to ll_file_flock. */
3774 struct file_operations ll_file_operations_flock = {
3775 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3776 # ifdef HAVE_SYNC_READ_WRITE
3777 .read = new_sync_read,
3778 .write = new_sync_write,
3779 # endif /* HAVE_SYNC_READ_WRITE */
3780 .read_iter = ll_file_read_iter,
3781 .write_iter = ll_file_write_iter,
3782 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3783 .read = ll_file_read,
3784 .aio_read = ll_file_aio_read,
3785 .write = ll_file_write,
3786 .aio_write = ll_file_aio_write,
3787 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3788 .unlocked_ioctl = ll_file_ioctl,
3789 .open = ll_file_open,
3790 .release = ll_file_release,
3791 .mmap = ll_file_mmap,
3792 .llseek = ll_file_seek,
3793 .splice_read = ll_file_splice_read,
3796 .flock = ll_file_flock,
3797 .lock = ll_file_flock
3800 /* These are for -o noflock - to return ENOSYS on flock calls */
3801 struct file_operations ll_file_operations_noflock = {
3802 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3803 # ifdef HAVE_SYNC_READ_WRITE
3804 .read = new_sync_read,
3805 .write = new_sync_write,
3806 # endif /* HAVE_SYNC_READ_WRITE */
3807 .read_iter = ll_file_read_iter,
3808 .write_iter = ll_file_write_iter,
3809 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3810 .read = ll_file_read,
3811 .aio_read = ll_file_aio_read,
3812 .write = ll_file_write,
3813 .aio_write = ll_file_aio_write,
3814 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3815 .unlocked_ioctl = ll_file_ioctl,
3816 .open = ll_file_open,
3817 .release = ll_file_release,
3818 .mmap = ll_file_mmap,
3819 .llseek = ll_file_seek,
3820 .splice_read = ll_file_splice_read,
/* lock entries reject all lock requests (see ll_file_noflock) */
3823 .flock = ll_file_noflock,
3824 .lock = ll_file_noflock
/* Inode operations for regular files: attribute, xattr, fiemap and
 * (when the kernel supports ->get_acl) POSIX ACL handlers. */
3827 struct inode_operations ll_file_inode_operations = {
3828 .setattr = ll_setattr,
3829 .getattr = ll_getattr,
3830 .permission = ll_inode_permission,
3831 .setxattr = ll_setxattr,
3832 .getxattr = ll_getxattr,
3833 .listxattr = ll_listxattr,
3834 .removexattr = ll_removexattr,
3835 .fiemap = ll_fiemap,
3836 #ifdef HAVE_IOP_GET_ACL
3837 .get_acl = ll_get_acl,
3841 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers: an rwsem-protected
 * list of llioc_data entries. */
3842 static struct llioc_ctl_data {
3843 struct rw_semaphore ioc_sem;
3844 struct list_head ioc_head;
3846 __RWSEM_INITIALIZER(llioc.ioc_sem),
3847 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl commands it serves
 * (iocd_cmd is a flexible trailing array of iocd_count entries). */
3852 struct list_head iocd_list;
3853 unsigned int iocd_size;
3854 llioc_callback_t iocd_cb;
3855 unsigned int iocd_count;
3856 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler @cb for the @count command numbers
 * in @cmd.  Returns an opaque cookie for ll_iocontrol_unregister(), or
 * NULL on invalid arguments / allocation failure.
 */
3859 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3862 struct llioc_data *in_data = NULL;
3865 if (cb == NULL || cmd == NULL ||
3866 count > LLIOC_MAX_CMD || count < 0)
3869 size = sizeof(*in_data) + count * sizeof(unsigned int);
3870 OBD_ALLOC(in_data, size);
3871 if (in_data == NULL)
3874 memset(in_data, 0, sizeof(*in_data));
3875 in_data->iocd_size = size;
3876 in_data->iocd_cb = cb;
3877 in_data->iocd_count = count;
3878 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3880 down_write(&llioc.ioc_sem);
3881 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3882 up_write(&llioc.ioc_sem);
/*
 * Unregister a dynamic ioctl handler previously returned by
 * ll_iocontrol_register().  Warns if @magic is not found in the list.
 */
3887 void ll_iocontrol_unregister(void *magic)
3889 struct llioc_data *tmp;
3894 down_write(&llioc.ioc_sem);
3895 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3897 unsigned int size = tmp->iocd_size;
3899 list_del(&tmp->iocd_list);
/* drop the lock before freeing; the entry is already unlinked */
3900 up_write(&llioc.ioc_sem);
3902 OBD_FREE(tmp, size);
3906 up_write(&llioc.ioc_sem);
3908 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3911 EXPORT_SYMBOL(ll_iocontrol_register);
3912 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch @cmd to the registered dynamic ioctl handlers; iteration
 * stops when a handler returns LLIOC_STOP.  The handler's status is
 * passed back through *rcp (default -EINVAL when nothing matched).
 */
3914 static enum llioc_iter
3915 ll_iocontrol_call(struct inode *inode, struct file *file,
3916 unsigned int cmd, unsigned long arg, int *rcp)
3918 enum llioc_iter ret = LLIOC_CONT;
3919 struct llioc_data *data;
3920 int rc = -EINVAL, i;
3922 down_read(&llioc.ioc_sem);
3923 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3924 for (i = 0; i < data->iocd_count; i++) {
3925 if (cmd != data->iocd_cmd[i])
3928 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3932 if (ret == LLIOC_STOP)
3935 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack for @inode.
 * For OBJECT_CONF_SET the layout lock is made matchable only after the
 * layout has been applied (so no stale layout can be observed), and the
 * inode's cached layout generation is refreshed from the object.
 */
3942 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3944 struct ll_inode_info *lli = ll_i2info(inode);
3945 struct cl_object *obj = lli->lli_clob;
3954 env = cl_env_get(&refcheck);
3956 RETURN(PTR_ERR(env));
3958 rc = cl_conf_set(env, lli->lli_clob, conf);
3962 if (conf->coc_opc == OBJECT_CONF_SET) {
3963 struct ldlm_lock *lock = conf->coc_lock;
3964 struct cl_layout cl = {
3968 LASSERT(lock != NULL);
3969 LASSERT(ldlm_has_layout(lock));
3971 /* it can only be allowed to match after layout is
3972 * applied to inode otherwise false layout would be
3973 * seen. Applying layout shoud happen before dropping
3974 * the intent lock. */
3975 ldlm_lock_allow_match(lock);
3977 rc = cl_object_layout_get(env, obj, &cl);
3982 DFID": layout version change: %u -> %u\n",
3983 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3985 ll_layout_version_set(lli, cl.cl_layout_gen);
3989 cl_env_put(env, &refcheck);
3994 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3995 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3998 struct ll_sb_info *sbi = ll_i2sbi(inode);
3999 struct ptlrpc_request *req;
4000 struct mdt_body *body;
4007 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4008 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4009 lock->l_lvb_data, lock->l_lvb_len);
/* nothing to do if the layout LVB is already attached to the lock */
4011 if (lock->l_lvb_data != NULL)
4014 /* if layout lock was granted right away, the layout is returned
4015 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4016 * blocked and then granted via completion ast, we have to fetch
4017 * layout here. Please note that we can't use the LVB buffer in
4018 * completion AST because it doesn't have a large enough buffer */
4019 rc = ll_get_default_mdsize(sbi, &lmmsize);
4021 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4022 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4027 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4029 GOTO(out, rc = -EPROTO);
4031 lmmsize = body->mbo_eadatasize;
4032 if (lmmsize == 0) /* empty layout */
4035 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4037 GOTO(out, rc = -EFAULT);
/* copy the LOV EA into a private buffer and attach it as the lock LVB */
4039 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4040 if (lvbdata == NULL)
4041 GOTO(out, rc = -ENOMEM);
4043 memcpy(lvbdata, lmm, lmmsize);
4044 lock_res_and_lock(lock);
/* another thread may have attached an LVB meanwhile; keep theirs */
4045 if (unlikely(lock->l_lvb_data == NULL)) {
4046 lock->l_lvb_type = LVB_T_LAYOUT;
4047 lock->l_lvb_data = lvbdata;
4048 lock->l_lvb_len = lmmsize;
4051 unlock_res_and_lock(lock);
4054 OBD_FREE_LARGE(lvbdata, lmmsize);
4059 ptlrpc_req_finished(req);
4064 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Makes sure the lock's LVB contains the layout (fetching it from the
 * MDT via ll_layout_fetch() if necessary), applies it to the inode's
 * cl_object with an OBJECT_CONF_SET ll_layout_conf() call, then drops
 * the lock reference.  If applying fails with -EBUSY because IO is
 * still using the old layout, waits for that IO to drain with an
 * OBJECT_CONF_WAIT configuration.
 *
 * \param lockh  handle of the granted layout DLM lock; its reference
 *               is dropped before return
 * \param mode   mode the lock was matched/granted with
 * \param inode  inode to apply the layout to
 *
 * \retval 0 on success, negative errno on failure.
 */
4067 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4068 struct inode *inode)
4070 struct ll_inode_info *lli = ll_i2info(inode);
4071 struct ll_sb_info *sbi = ll_i2sbi(inode);
4072 struct ldlm_lock *lock;
4073 struct cl_object_conf conf;
4076 bool wait_layout = false;
4079 LASSERT(lustre_handle_is_used(lockh));
4081 lock = ldlm_handle2lock(lockh);
4082 LASSERT(lock != NULL);
4083 LASSERT(ldlm_has_layout(lock));
4085 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4086 PFID(&lli->lli_fid), inode);
4088 /* in case this is a caching lock and reinstate with new inode */
4089 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4091 lock_res_and_lock(lock);
4092 lvb_ready = ldlm_is_lvb_ready(lock);
4093 unlock_res_and_lock(lock);
4094 /* checking lvb_ready is racy but this is okay. The worst case is
4095 * that multi processes may configure the file on the same time. */
/* make sure the lock's LVB actually contains the layout */
4100 rc = ll_layout_fetch(inode, lock);
4104 /* for layout lock, lmm is stored in lock's lvb.
4105 * lvb_data is immutable if the lock is held so it's safe to access it
4108 * set layout to file. Unlikely this will fail as old layout was
4109 * surely eliminated */
4110 memset(&conf, 0, sizeof conf);
4111 conf.coc_opc = OBJECT_CONF_SET;
4112 conf.coc_inode = inode;
4113 conf.coc_lock = lock;
4114 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4115 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4116 rc = ll_layout_conf(inode, &conf);
4118 /* refresh layout failed, need to wait */
4119 wait_layout = rc == -EBUSY;
/* done with the lock itself; drop both references */
4123 LDLM_LOCK_PUT(lock);
4124 ldlm_lock_decref(lockh, mode);
4126 /* wait for IO to complete if it's still being used. */
4128 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4129 ll_get_fsname(inode->i_sb, NULL, 0),
4130 PFID(&lli->lli_fid), inode);
4132 memset(&conf, 0, sizeof conf);
4133 conf.coc_opc = OBJECT_CONF_WAIT;
4134 conf.coc_inode = inode;
4135 rc = ll_layout_conf(inode, &conf);
4139 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4140 ll_get_fsname(inode->i_sb, NULL, 0),
4141 PFID(&lli->lli_fid), rc);
/*
 * Refresh the layout of @inode; the caller holds lli_layout_mutex
 * (see ll_layout_refresh()).  First try to match an already-cached
 * layout DLM lock locally; if none exists, enqueue an IT_LAYOUT
 * intent to the MDT and apply the resulting lock with
 * ll_layout_lock_set().
 *
 * \retval 0 on success, negative errno on failure.
 */
4146 static int ll_layout_refresh_locked(struct inode *inode)
4148 struct ll_inode_info *lli = ll_i2info(inode);
4149 struct ll_sb_info *sbi = ll_i2sbi(inode);
4150 struct md_op_data *op_data;
4151 struct lookup_intent it;
4152 struct lustre_handle lockh;
4153 enum ldlm_mode mode;
4154 struct ldlm_enqueue_info einfo = {
4155 .ei_type = LDLM_IBITS,
4157 .ei_cb_bl = &ll_md_blocking_ast,
4158 .ei_cb_cp = &ldlm_completion_ast,
4164 /* mostly layout lock is caching on the local side, so try to match
4165 * it before grabbing layout lock mutex. */
4166 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4167 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4168 if (mode != 0) { /* hit cached lock */
4169 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: prepare an enqueue to the MDT */
4176 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4177 0, 0, LUSTRE_OPC_ANY, NULL);
4178 if (IS_ERR(op_data))
4179 RETURN(PTR_ERR(op_data));
4181 /* have to enqueue one */
4182 memset(&it, 0, sizeof(it));
4183 it.it_op = IT_LAYOUT;
4184 lockh.cookie = 0ULL;
4186 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4187 ll_get_fsname(inode->i_sb, NULL, 0),
4188 PFID(&lli->lli_fid), inode);
4190 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* the intent request itself is no longer needed */
4191 if (it.it_request != NULL)
4192 ptlrpc_req_finished(it.it_request);
4193 it.it_request = NULL;
4195 ll_finish_md_op_data(op_data);
/* take over the lock mode from the intent before dropping its
 * reference; ll_layout_lock_set() will release the lock itself */
4197 mode = it.it_lock_mode;
4198 it.it_lock_mode = 0;
4199 ll_intent_drop_lock(&it);
4202 /* set lock data in case this is a new lock */
4203 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4204 rc = ll_layout_lock_set(&lockh, mode, inode);
4213 * This function checks if there exists a LAYOUT lock on the client side,
4214 * or enqueues it if it doesn't have one in cache.
4216 * This function will not hold layout lock so it may be revoked any time after
4217 * this function returns. Any operations that depend on the layout should be redone
4220 * This function should be called before lov_io_init() to get an uptodate
4221 * layout version, the caller should save the version number and after IO
4222 * is finished, this function should be called again to verify that layout
4223 * is not changed during IO time.
4225 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4227 struct ll_inode_info *lli = ll_i2info(inode);
4228 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* report the current generation; done if layout locking is disabled
 * on this mount or a layout generation is already valid */
4232 *gen = ll_layout_version_get(lli);
4233 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4237 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4238 LASSERT(S_ISREG(inode->i_mode));
4240 /* take layout lock mutex to enqueue layout lock exclusively. */
4241 mutex_lock(&lli->lli_layout_mutex);
4243 rc = ll_layout_refresh_locked(inode);
/* hand the generation now in effect back to the caller */
4247 *gen = ll_layout_version_get(lli);
4249 mutex_unlock(&lli->lli_layout_mutex);
4255 * This function sends a restore request to the MDT
4257 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4259 struct hsm_user_request *hur;
4263 len = sizeof(struct hsm_user_request) +
4264 sizeof(struct hsm_user_item);
4265 OBD_ALLOC(hur, len);
4269 hur->hur_request.hr_action = HUA_RESTORE;
4270 hur->hur_request.hr_archive_id = 0;
4271 hur->hur_request.hr_flags = 0;
4272 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4273 sizeof(hur->hur_user_item[0].hui_fid));
4274 hur->hur_user_item[0].hui_extent.offset = offset;
4275 hur->hur_user_item[0].hui_extent.length = length;
4276 hur->hur_request.hr_itemcount = 1;
4277 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,