lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2016, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48 #include <lustre/ll_fiemap.h>
  49
  50 #include <lustre_ioctl.h>
  51 #include <lustre_swab.h>
  52
  53 #include "cl_object.h"
  54 #include "llite_internal.h"
  55 #include "vvp_internal.h"
  56
  57 static int
  58 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  59
  60 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  61                           bool *lease_broken);
  62
  63 static enum llioc_iter
  64 ll_iocontrol_call(struct inode *inode, struct file *file,
  65                   unsigned int cmd, unsigned long arg, int *rcp);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                      ATTR_MTIME | ATTR_MTIME_SET |
 104                                      ATTR_CTIME | ATTR_CTIME_SET;
 105         op_data->op_attr_blocks = inode->i_blocks;
 106         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 107         op_data->op_handle = och->och_fh;
 108
 109         if (och->och_flags & FMODE_WRITE &&
 110             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 111                 /* For HSM: if inode data has been modified, pack it so that
 112                  * MDT can set data dirty flag in the archive. */
 113                 op_data->op_bias |= MDS_DATA_MODIFIED;
 114
 115         EXIT;
 116 }
 117
 118 /**
 119  * Perform a close, possibly with a bias.
 120  * The meaning of "data" depends on the value of "bias".
 121  *
 122  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 123  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 124  * swap layouts with.
 125  */
 126 static int ll_close_inode_openhandle(struct inode *inode,
 127                                      struct obd_client_handle *och,
 128                                      enum mds_op_bias bias, void *data)
 129 {
 130         struct obd_export *md_exp = ll_i2mdexp(inode);
 131         const struct ll_inode_info *lli = ll_i2info(inode);
 132         struct md_op_data *op_data;
 133         struct ptlrpc_request *req = NULL;
 134         int rc;
 135         ENTRY;
 136
 137         if (class_exp2obd(md_exp) == NULL) {
 138                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 139                        ll_get_fsname(inode->i_sb, NULL, 0),
 140                        PFID(&lli->lli_fid));
 141                 GOTO(out, rc = 0);
 142         }
 143
 144         OBD_ALLOC_PTR(op_data);
 145         /* We leak openhandle and request here on error, but not much to be
 146          * done in OOM case since app won't retry close on error either. */
 147         if (op_data == NULL)
 148                 GOTO(out, rc = -ENOMEM);
 149
 150         ll_prepare_close(inode, op_data, och);
 151         switch (bias) {
 152         case MDS_CLOSE_LAYOUT_SWAP:
 153                 LASSERT(data != NULL);
 154                 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
 155                 op_data->op_data_version = 0;
 156                 op_data->op_lease_handle = och->och_lease_handle;
 157                 op_data->op_fid2 = *ll_inode2fid(data);
 158                 break;
 159
 160         case MDS_HSM_RELEASE:
 161                 LASSERT(data != NULL);
 162                 op_data->op_bias |= MDS_HSM_RELEASE;
 163                 op_data->op_data_version = *(__u64 *)data;
 164                 op_data->op_lease_handle = och->och_lease_handle;
 165                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 166                 break;
 167
 168         default:
 169                 LASSERT(data == NULL);
 170                 break;
 171         }
 172
 173         rc = md_close(md_exp, op_data, och->och_mod, &req);
 174         if (rc != 0 && rc != -EINTR)
 175                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 176                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 177
 178         if (rc == 0 &&
 179             op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
 180                 struct mdt_body *body;
 181
 182                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 183                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 184                         rc = -EBUSY;
 185         }
 186
 187         ll_finish_md_op_data(op_data);
 188         EXIT;
 189 out:
 190
 191         md_clear_open_replay_data(md_exp, och);
 192         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 193         OBD_FREE_PTR(och);
 194
 195         ptlrpc_req_finished(req);       /* This is close request */
 196         return rc;
 197 }
 198
 199 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 200 {
 201         struct ll_inode_info *lli = ll_i2info(inode);
 202         struct obd_client_handle **och_p;
 203         struct obd_client_handle *och;
 204         __u64 *och_usecount;
 205         int rc = 0;
 206         ENTRY;
 207
 208         if (fmode & FMODE_WRITE) {
 209                 och_p = &lli->lli_mds_write_och;
 210                 och_usecount = &lli->lli_open_fd_write_count;
 211         } else if (fmode & FMODE_EXEC) {
 212                 och_p = &lli->lli_mds_exec_och;
 213                 och_usecount = &lli->lli_open_fd_exec_count;
 214         } else {
 215                 LASSERT(fmode & FMODE_READ);
 216                 och_p = &lli->lli_mds_read_och;
 217                 och_usecount = &lli->lli_open_fd_read_count;
 218         }
 219
 220         mutex_lock(&lli->lli_och_mutex);
 221         if (*och_usecount > 0) {
 222                 /* There are still users of this handle, so skip
 223                  * freeing it. */
 224                 mutex_unlock(&lli->lli_och_mutex);
 225                 RETURN(0);
 226         }
 227
 228         och = *och_p;
 229         *och_p = NULL;
 230         mutex_unlock(&lli->lli_och_mutex);
 231
 232         if (och != NULL) {
 233                 /* There might be a race and this handle may already
 234                  * be closed. */
 235                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 236         }
 237
 238         RETURN(rc);
 239 }
 240
 241 static int ll_md_close(struct inode *inode, struct file *file)
 242 {
 243         union ldlm_policy_data policy = {
 244                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 245         };
 246         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 247         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 248         struct ll_inode_info *lli = ll_i2info(inode);
 249         struct lustre_handle lockh;
 250         enum ldlm_mode lockmode;
 251         int rc = 0;
 252         ENTRY;
 253
 254         /* clear group lock, if present */
 255         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 256                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 257
 258         if (fd->fd_lease_och != NULL) {
 259                 bool lease_broken;
 260
 261                 /* Usually the lease is not released when the
 262                  * application crashed, we need to release here. */
 263                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 264                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 265                         PFID(&lli->lli_fid), rc, lease_broken);
 266
 267                 fd->fd_lease_och = NULL;
 268         }
 269
 270         if (fd->fd_och != NULL) {
 271                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 272                 fd->fd_och = NULL;
 273                 GOTO(out, rc);
 274         }
 275
 276         /* Let's see if we have good enough OPEN lock on the file and if
 277            we can skip talking to MDS */
 278         mutex_lock(&lli->lli_och_mutex);
 279         if (fd->fd_omode & FMODE_WRITE) {
 280                 lockmode = LCK_CW;
 281                 LASSERT(lli->lli_open_fd_write_count);
 282                 lli->lli_open_fd_write_count--;
 283         } else if (fd->fd_omode & FMODE_EXEC) {
 284                 lockmode = LCK_PR;
 285                 LASSERT(lli->lli_open_fd_exec_count);
 286                 lli->lli_open_fd_exec_count--;
 287         } else {
 288                 lockmode = LCK_CR;
 289                 LASSERT(lli->lli_open_fd_read_count);
 290                 lli->lli_open_fd_read_count--;
 291         }
 292         mutex_unlock(&lli->lli_och_mutex);
 293
 294         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 295                            LDLM_IBITS, &policy, lockmode, &lockh))
 296                 rc = ll_md_real_close(inode, fd->fd_omode);
 297
 298 out:
 299         LUSTRE_FPRIVATE(file) = NULL;
 300         ll_file_data_put(fd);
 301
 302         RETURN(rc);
 303 }
 304
 305 /* While this returns an error code, fput() the caller does not, so we need
 306  * to make every effort to clean up all of our state here.  Also, applications
 307  * rarely check close errors and even if an error is returned they will not
 308  * re-try the close call.
 309  */
 310 int ll_file_release(struct inode *inode, struct file *file)
 311 {
 312         struct ll_file_data *fd;
 313         struct ll_sb_info *sbi = ll_i2sbi(inode);
 314         struct ll_inode_info *lli = ll_i2info(inode);
 315         int rc;
 316         ENTRY;
 317
 318         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 319                PFID(ll_inode2fid(inode)), inode);
 320
 321         if (inode->i_sb->s_root != file_dentry(file))
 322                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 323         fd = LUSTRE_FPRIVATE(file);
 324         LASSERT(fd != NULL);
 325
 326         /* The last ref on @file, maybe not the the owner pid of statahead,
 327          * because parent and child process can share the same file handle. */
 328         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 329                 ll_deauthorize_statahead(inode, fd);
 330
 331         if (inode->i_sb->s_root == file_dentry(file)) {
 332                 LUSTRE_FPRIVATE(file) = NULL;
 333                 ll_file_data_put(fd);
 334                 RETURN(0);
 335         }
 336
 337         if (!S_ISDIR(inode->i_mode)) {
 338                 if (lli->lli_clob != NULL)
 339                         lov_read_and_clear_async_rc(lli->lli_clob);
 340                 lli->lli_async_rc = 0;
 341         }
 342
 343         rc = ll_md_close(inode, file);
 344
 345         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 346                 libcfs_debug_dumplog();
 347
 348         RETURN(rc);
 349 }
 350
 351 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
 352                                 struct lookup_intent *itp)
 353 {
 354         struct dentry *de = file_dentry(file);
 355         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 356         struct dentry *parent = de->d_parent;
 357         const char *name = NULL;
 358         int len = 0;
 359         struct md_op_data *op_data;
 360         struct ptlrpc_request *req = NULL;
 361         int rc;
 362         ENTRY;
 363
 364         LASSERT(parent != NULL);
 365         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 366
 367         /* if server supports open-by-fid, or file name is invalid, don't pack
 368          * name in open request */
 369         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
 370             lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
 371                 name = de->d_name.name;
 372                 len = de->d_name.len;
 373         }
 374
 375         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 376                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 377         if (IS_ERR(op_data))
 378                 RETURN(PTR_ERR(op_data));
 379         op_data->op_data = lmm;
 380         op_data->op_data_size = lmmsize;
 381
 382         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 383                             &ll_md_blocking_ast, 0);
 384         ll_finish_md_op_data(op_data);
 385         if (rc == -ESTALE) {
 386                 /* reason for keep own exit path - don`t flood log
 387                  * with messages with -ESTALE errors.
 388                  */
 389                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 390                      it_open_error(DISP_OPEN_OPEN, itp))
 391                         GOTO(out, rc);
 392                 ll_release_openhandle(de, itp);
 393                 GOTO(out, rc);
 394         }
 395
 396         if (it_disposition(itp, DISP_LOOKUP_NEG))
 397                 GOTO(out, rc = -ENOENT);
 398
 399         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 400                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 401                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 402                 GOTO(out, rc);
 403         }
 404
 405         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 406         if (!rc && itp->it_lock_mode)
 407                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 408
 409 out:
 410         ptlrpc_req_finished(req);
 411         ll_intent_drop_lock(itp);
 412
 413         /* We did open by fid, but by the time we got to the server,
 414          * the object disappeared. If this is a create, we cannot really
 415          * tell the userspace that the file it was trying to create
 416          * does not exist. Instead let's return -ESTALE, and the VFS will
 417          * retry the create with LOOKUP_REVAL that we are going to catch
 418          * in ll_revalidate_dentry() and use lookup then.
 419          */
 420         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 421                 rc = -ESTALE;
 422
 423         RETURN(rc);
 424 }
 425
 426 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 427                        struct obd_client_handle *och)
 428 {
 429         struct mdt_body *body;
 430
 431         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 432         och->och_fh = body->mbo_handle;
 433         och->och_fid = body->mbo_fid1;
 434         och->och_lease_handle.cookie = it->it_lock_handle;
 435         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 436         och->och_flags = it->it_flags;
 437
 438         return md_set_open_replay_data(md_exp, och, it);
 439 }
 440
 441 static int ll_local_open(struct file *file, struct lookup_intent *it,
 442                          struct ll_file_data *fd, struct obd_client_handle *och)
 443 {
 444         struct inode *inode = file_inode(file);
 445         ENTRY;
 446
 447         LASSERT(!LUSTRE_FPRIVATE(file));
 448
 449         LASSERT(fd != NULL);
 450
 451         if (och) {
 452                 int rc;
 453
 454                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 455                 if (rc != 0)
 456                         RETURN(rc);
 457         }
 458
 459         LUSTRE_FPRIVATE(file) = fd;
 460         ll_readahead_init(inode, &fd->fd_ras);
 461         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 462
 463         /* ll_cl_context initialize */
 464         rwlock_init(&fd->fd_lock);
 465         INIT_LIST_HEAD(&fd->fd_lccs);
 466
 467         RETURN(0);
 468 }
 469
 470 /* Open a file, and (for the very first open) create objects on the OSTs at
 471  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 472  * creation or open until ll_lov_setstripe() ioctl is called.
 473  *
 474  * If we already have the stripe MD locally then we don't request it in
 475  * md_open(), by passing a lmm_size = 0.
 476  *
 477  * It is up to the application to ensure no other processes open this file
 478  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 479  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 480  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 481  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 482  */
 483 int ll_file_open(struct inode *inode, struct file *file)
 484 {
 485         struct ll_inode_info *lli = ll_i2info(inode);
 486         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 487                                           .it_flags = file->f_flags };
 488         struct obd_client_handle **och_p = NULL;
 489         __u64 *och_usecount = NULL;
 490         struct ll_file_data *fd;
 491         int rc = 0;
 492         ENTRY;
 493
 494         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 495                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 496
 497         it = file->private_data; /* XXX: compat macro */
 498         file->private_data = NULL; /* prevent ll_local_open assertion */
 499
 500         fd = ll_file_data_get();
 501         if (fd == NULL)
 502                 GOTO(out_openerr, rc = -ENOMEM);
 503
 504         fd->fd_file = file;
 505         if (S_ISDIR(inode->i_mode))
 506                 ll_authorize_statahead(inode, fd);
 507
 508         if (inode->i_sb->s_root == file_dentry(file)) {
 509                 LUSTRE_FPRIVATE(file) = fd;
 510                 RETURN(0);
 511         }
 512
 513         if (!it || !it->it_disposition) {
 514                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 515                  * because everything but O_ACCMODE mask was stripped from
 516                  * there */
 517                 if ((oit.it_flags + 1) & O_ACCMODE)
 518                         oit.it_flags++;
 519                 if (file->f_flags & O_TRUNC)
 520                         oit.it_flags |= FMODE_WRITE;
 521
 522                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 523                  * dentry_open after call to open_namei that checks permissions.
 524                  * Only nfsd_open call dentry_open directly without checking
 525                  * permissions and because of that this code below is safe. */
 526                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 527                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 528
 529                 /* We do not want O_EXCL here, presumably we opened the file
 530                  * already? XXX - NFS implications? */
 531                 oit.it_flags &= ~O_EXCL;
 532
 533                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 534                  * created if necessary, then "IT_CREAT" should be set to keep
 535                  * consistent with it */
 536                 if (oit.it_flags & O_CREAT)
 537                         oit.it_op |= IT_CREAT;
 538
 539                 it = &oit;
 540         }
 541
 542 restart:
 543         /* Let's see if we have file open on MDS already. */
 544         if (it->it_flags & FMODE_WRITE) {
 545                 och_p = &lli->lli_mds_write_och;
 546                 och_usecount = &lli->lli_open_fd_write_count;
 547         } else if (it->it_flags & FMODE_EXEC) {
 548                 och_p = &lli->lli_mds_exec_och;
 549                 och_usecount = &lli->lli_open_fd_exec_count;
 550          } else {
 551                 och_p = &lli->lli_mds_read_och;
 552                 och_usecount = &lli->lli_open_fd_read_count;
 553         }
 554
 555         mutex_lock(&lli->lli_och_mutex);
 556         if (*och_p) { /* Open handle is present */
 557                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 558                         /* Well, there's extra open request that we do not need,
 559                            let's close it somehow. This will decref request. */
 560                         rc = it_open_error(DISP_OPEN_OPEN, it);
 561                         if (rc) {
 562                                 mutex_unlock(&lli->lli_och_mutex);
 563                                 GOTO(out_openerr, rc);
 564                         }
 565
 566                         ll_release_openhandle(file_dentry(file), it);
 567                 }
 568                 (*och_usecount)++;
 569
 570                 rc = ll_local_open(file, it, fd, NULL);
 571                 if (rc) {
 572                         (*och_usecount)--;
 573                         mutex_unlock(&lli->lli_och_mutex);
 574                         GOTO(out_openerr, rc);
 575                 }
 576         } else {
 577                 LASSERT(*och_usecount == 0);
 578                 if (!it->it_disposition) {
 579                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 580                         /* We cannot just request lock handle now, new ELC code
 581                            means that one of other OPEN locks for this file
 582                            could be cancelled, and since blocking ast handler
 583                            would attempt to grab och_mutex as well, that would
 584                            result in a deadlock */
 585                         mutex_unlock(&lli->lli_och_mutex);
 586                         /*
 587                          * Normally called under two situations:
 588                          * 1. NFS export.
 589                          * 2. A race/condition on MDS resulting in no open
 590                          *    handle to be returned from LOOKUP|OPEN request,
 591                          *    for example if the target entry was a symlink.
 592                          *
 593                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 594                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 595                          *  bit so that it's not confusing later callers.
 596                          *
 597                          *  NB; when ldd is NULL, it must have come via normal
 598                          *  lookup path only, since ll_iget_for_nfs always calls
 599                          *  ll_d_init().
 600                          */
 601                         if (ldd && ldd->lld_nfs_dentry) {
 602                                 ldd->lld_nfs_dentry = 0;
 603                                 it->it_flags |= MDS_OPEN_LOCK;
 604                         }
 605
 606                          /*
 607                          * Always specify MDS_OPEN_BY_FID because we don't want
 608                          * to get file with different fid.
 609                          */
 610                         it->it_flags |= MDS_OPEN_BY_FID;
 611                         rc = ll_intent_file_open(file, NULL, 0, it);
 612                         if (rc)
 613                                 GOTO(out_openerr, rc);
 614
 615                         goto restart;
 616                 }
 617                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 618                 if (!*och_p)
 619                         GOTO(out_och_free, rc = -ENOMEM);
 620
 621                 (*och_usecount)++;
 622
 623                 /* md_intent_lock() didn't get a request ref if there was an
 624                  * open error, so don't do cleanup on the request here
 625                  * (bug 3430) */
 626                 /* XXX (green): Should not we bail out on any error here, not
 627                  * just open error? */
 628                 rc = it_open_error(DISP_OPEN_OPEN, it);
 629                 if (rc != 0)
 630                         GOTO(out_och_free, rc);
 631
 632                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 633                          "inode %p: disposition %x, status %d\n", inode,
 634                          it_disposition(it, ~0), it->it_status);
 635
 636                 rc = ll_local_open(file, it, fd, *och_p);
 637                 if (rc)
 638                         GOTO(out_och_free, rc);
 639         }
 640         mutex_unlock(&lli->lli_och_mutex);
 641         fd = NULL;
 642
 643         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 644            different kind of OPEN lock for this same inode gets cancelled
 645            by ldlm_cancel_lru */
 646         if (!S_ISREG(inode->i_mode))
 647                 GOTO(out_och_free, rc);
 648
 649         cl_lov_delay_create_clear(&file->f_flags);
 650         GOTO(out_och_free, rc);
 651
 652 out_och_free:
 653         if (rc) {
 654                 if (och_p && *och_p) {
 655                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 656                         *och_p = NULL; /* OBD_FREE writes some magic there */
 657                         (*och_usecount)--;
 658                 }
 659                 mutex_unlock(&lli->lli_och_mutex);
 660
 661 out_openerr:
 662                 if (lli->lli_opendir_key == fd)
 663                         ll_deauthorize_statahead(inode, fd);
 664                 if (fd != NULL)
 665                         ll_file_data_put(fd);
 666         } else {
 667                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 668         }
 669
 670         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 671                 ptlrpc_req_finished(it->it_request);
 672                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 673         }
 674
 675         return rc;
 676 }
 677
 678 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 679                         struct ldlm_lock_desc *desc, void *data, int flag)
 680 {
 681         int rc;
 682         struct lustre_handle lockh;
 683         ENTRY;
 684
 685         switch (flag) {
 686         case LDLM_CB_BLOCKING:
 687                 ldlm_lock2handle(lock, &lockh);
 688                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 689                 if (rc < 0) {
 690                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 691                         RETURN(rc);
 692                 }
 693                 break;
 694         case LDLM_CB_CANCELING:
 695                 /* do nothing */
 696                 break;
 697         }
 698         RETURN(0);
 699 }
 700
 701 /**
 702  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 703  * and save it as fd->fd_och so as to force client to reopen the file even
 704  * if it has an open lock in cache already.
 705  */
 706 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 707                                 struct lustre_handle *old_handle)
 708 {
 709         struct ll_inode_info *lli = ll_i2info(inode);
 710         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 711         struct obd_client_handle **och_p;
 712         __u64 *och_usecount;
 713         int rc = 0;
 714         ENTRY;
 715
 716         /* Get the openhandle of the file */
 717         mutex_lock(&lli->lli_och_mutex);
 718         if (fd->fd_lease_och != NULL)
 719                 GOTO(out_unlock, rc = -EBUSY);
 720
 721         if (fd->fd_och == NULL) {
 722                 if (file->f_mode & FMODE_WRITE) {
 723                         LASSERT(lli->lli_mds_write_och != NULL);
 724                         och_p = &lli->lli_mds_write_och;
 725                         och_usecount = &lli->lli_open_fd_write_count;
 726                 } else {
 727                         LASSERT(lli->lli_mds_read_och != NULL);
 728                         och_p = &lli->lli_mds_read_och;
 729                         och_usecount = &lli->lli_open_fd_read_count;
 730                 }
 731
 732                 if (*och_usecount > 1)
 733                         GOTO(out_unlock, rc = -EBUSY);
 734
 735                 fd->fd_och = *och_p;
 736                 *och_usecount = 0;
 737                 *och_p = NULL;
 738         }
 739
 740         *old_handle = fd->fd_och->och_fh;
 741
 742         EXIT;
 743 out_unlock:
 744         mutex_unlock(&lli->lli_och_mutex);
 745         return rc;
 746 }
 747
 748 /**
 749  * Release ownership on lli_mds_*_och when putting back a file lease.
 750  */
 751 static int ll_lease_och_release(struct inode *inode, struct file *file)
 752 {
 753         struct ll_inode_info *lli = ll_i2info(inode);
 754         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 755         struct obd_client_handle **och_p;
 756         struct obd_client_handle *old_och = NULL;
 757         __u64 *och_usecount;
 758         int rc = 0;
 759         ENTRY;
 760
 761         mutex_lock(&lli->lli_och_mutex);
 762         if (file->f_mode & FMODE_WRITE) {
 763                 och_p = &lli->lli_mds_write_och;
 764                 och_usecount = &lli->lli_open_fd_write_count;
 765         } else {
 766                 och_p = &lli->lli_mds_read_och;
 767                 och_usecount = &lli->lli_open_fd_read_count;
 768         }
 769
 770         /* The file may have been open by another process (broken lease) so
 771          * *och_p is not NULL. In this case we should simply increase usecount
 772          * and close fd_och.
 773          */
 774         if (*och_p != NULL) {
 775                 old_och = fd->fd_och;
 776                 (*och_usecount)++;
 777         } else {
 778                 *och_p = fd->fd_och;
 779                 *och_usecount = 1;
 780         }
 781         fd->fd_och = NULL;
 782         mutex_unlock(&lli->lli_och_mutex);
 783
 784         if (old_och != NULL)
 785                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 786
 787         RETURN(rc);
 788 }
 789
/**
 * Acquire a lease and open the file.
 *
 * \param[in] inode       inode to open under a lease
 * \param[in] file        optional open file; when not NULL its f_mode must
 *                        include \a fmode and must not include FMODE_EXEC,
 *                        and ll_lease_och_acquire() is called on it so that
 *                        its open handle can be sent to the MDT
 * \param[in] fmode       exactly FMODE_READ or FMODE_WRITE
 * \param[in] open_flags  extra flags OR'ed into the intent flags
 *
 * \retval open handle holding the lease on success
 * \retval ERR_PTR(-EINVAL) if \a fmode is neither FMODE_READ nor FMODE_WRITE
 * \retval ERR_PTR(-EPERM) if \a file is not compatible with \a fmode
 * \retval ERR_PTR(-ENOMEM) if the open handle cannot be allocated
 * \retval ERR_PTR(-ENOENT) if the intent reports DISP_LOOKUP_NEG
 * \retval ERR_PTR(-EOPNOTSUPP) if the reply carries no DISP_OPEN_LEASE
 * \retval ERR_PTR(-EPROTO) if a lease was granted without an open lock
 * \retval ERR_PTR(other negative errno) propagated from the helpers called
 */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
              __u64 open_flags)
{
        struct lookup_intent it = { .it_op = IT_OPEN };
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        struct lustre_handle old_handle = { 0 };
        struct obd_client_handle *och = NULL;
        int rc;
        int rc2;
        ENTRY;

        /* Exactly one of read or write may be requested. */
        if (fmode != FMODE_WRITE && fmode != FMODE_READ)
                RETURN(ERR_PTR(-EINVAL));

        if (file != NULL) {
                if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
                        RETURN(ERR_PTR(-EPERM));

                /* Fills old_handle with the handle of the existing open. */
                rc = ll_lease_och_acquire(inode, file, &old_handle);
                if (rc)
                        RETURN(ERR_PTR(rc));
        }

        OBD_ALLOC_PTR(och);
        if (och == NULL)
                RETURN(ERR_PTR(-ENOMEM));

        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
                                        LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                GOTO(out, rc = PTR_ERR(op_data));

        /* To tell the MDT this openhandle is from the same owner */
        op_data->op_handle = old_handle;

        it.it_flags = fmode | open_flags;
        it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
        rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
                            &ll_md_blocking_lease_ast,
        /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
         * it can be cancelled which may mislead applications that the lease is
         * broken;
         * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
         * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
         * doesn't deal with openhandle, so normal openhandle will be leaked. */
                            LDLM_FL_NO_LRU | LDLM_FL_EXCL);
        /* op_data and req are released whether or not the enqueue worked. */
        ll_finish_md_op_data(op_data);
        ptlrpc_req_finished(req);
        if (rc < 0)
                GOTO(out_release_it, rc);

        if (it_disposition(&it, DISP_LOOKUP_NEG))
                GOTO(out_release_it, rc = -ENOENT);

        rc = it_open_error(DISP_OPEN_OPEN, &it);
        if (rc)
                GOTO(out_release_it, rc);

        LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
        ll_och_fill(sbi->ll_md_exp, &it, och);

        /* From here on a failure must go through out_close, not out. */
        if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
                GOTO(out_close, rc = -EOPNOTSUPP);

        /* already get lease, handle lease lock */
        ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
        if (it.it_lock_mode == 0 ||
            it.it_lock_bits != MDS_INODELOCK_OPEN) {
                /* open lock must return for lease */
                CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
                        PFID(ll_inode2fid(inode)), it.it_lock_mode,
                        it.it_lock_bits);
                GOTO(out_close, rc = -EPROTO);
        }

        ll_intent_release(&it);
        RETURN(och);

out_close:
        /* Cancel open lock */
        if (it.it_lock_mode != 0) {
                ldlm_lock_decref_and_cancel(&och->och_lease_handle,
                                            it.it_lock_mode);
                it.it_lock_mode = 0;
                och->och_lease_handle.cookie = 0ULL;
        }
        /* A close failure is only logged; rc keeps the original error. */
        rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
        if (rc2 < 0)
                CERROR("%s: error closing file "DFID": %d\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
                       PFID(&ll_i2info(inode)->lli_fid), rc2);
        och = NULL; /* och has been freed in ll_close_inode_openhandle() */
out_release_it:
        ll_intent_release(&it);
out:
        /* och is non-NULL here only if it was never handed to the close. */
        if (och != NULL)
                OBD_FREE_PTR(och);
        RETURN(ERR_PTR(rc));
}
 895
 896 /**
 897  * Check whether a layout swap can be done between two inodes.
 898  *
 899  * \param[in] inode1  First inode to check
 900  * \param[in] inode2  Second inode to check
 901  *
 902  * \retval 0 on success, layout swap can be performed between both inodes
 903  * \retval negative error code if requirements are not met
 904  */
 905 static int ll_check_swap_layouts_validity(struct inode *inode1,
 906                                           struct inode *inode2)
 907 {
 908         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
 909                 return -EINVAL;
 910
 911         if (inode_permission(inode1, MAY_WRITE) ||
 912             inode_permission(inode2, MAY_WRITE))
 913                 return -EPERM;
 914
 915         if (inode1->i_sb != inode2->i_sb)
 916                 return -EXDEV;
 917
 918         return 0;
 919 }
 920
 921 static int ll_swap_layouts_close(struct obd_client_handle *och,
 922                                  struct inode *inode, struct inode *inode2)
 923 {
 924         const struct lu_fid     *fid1 = ll_inode2fid(inode);
 925         const struct lu_fid     *fid2;
 926         int                      rc;
 927         ENTRY;
 928
 929         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
 930                ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
 931
 932         rc = ll_check_swap_layouts_validity(inode, inode2);
 933         if (rc < 0)
 934                 GOTO(out_free_och, rc);
 935
 936         /* We now know that inode2 is a lustre inode */
 937         fid2 = ll_inode2fid(inode2);
 938
 939         rc = lu_fid_cmp(fid1, fid2);
 940         if (rc == 0)
 941                 GOTO(out_free_och, rc = -EINVAL);
 942
 943         /* Close the file and swap layouts between inode & inode2.
 944          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
 945          * because we still need it to pack l_remote_handle to MDT. */
 946         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
 947                                        inode2);
 948
 949         och = NULL; /* freed in ll_close_inode_openhandle() */
 950
 951 out_free_och:
 952         if (och != NULL)
 953                 OBD_FREE_PTR(och);
 954
 955         RETURN(rc);
 956 }
 957
 958 /**
 959  * Release lease and close the file.
 960  * It will check if the lease has ever broken.
 961  */
 962 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 963                           bool *lease_broken)
 964 {
 965         struct ldlm_lock *lock;
 966         bool cancelled = true;
 967         int rc;
 968         ENTRY;
 969
 970         lock = ldlm_handle2lock(&och->och_lease_handle);
 971         if (lock != NULL) {
 972                 lock_res_and_lock(lock);
 973                 cancelled = ldlm_is_cancel(lock);
 974                 unlock_res_and_lock(lock);
 975                 LDLM_LOCK_PUT(lock);
 976         }
 977
 978         CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
 979                PFID(&ll_i2info(inode)->lli_fid), cancelled);
 980
 981         if (!cancelled)
 982                 ldlm_cli_cancel(&och->och_lease_handle, 0);
 983
 984         if (lease_broken != NULL)
 985                 *lease_broken = cancelled;
 986
 987         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 988         RETURN(rc);
 989 }
 990
 991 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
 992 {
 993         struct ll_inode_info *lli = ll_i2info(inode);
 994         struct cl_object *obj = lli->lli_clob;
 995         struct cl_attr *attr = vvp_env_thread_attr(env);
 996         s64 atime;
 997         s64 mtime;
 998         s64 ctime;
 999         int rc = 0;
1000
1001         ENTRY;
1002
1003         ll_inode_size_lock(inode);
1004
1005         /* Merge timestamps the most recently obtained from MDS with
1006          * timestamps obtained from OSTs.
1007          *
1008          * Do not overwrite atime of inode because it may be refreshed
1009          * by file_accessed() function. If the read was served by cache
1010          * data, there is no RPC to be sent so that atime may not be
1011          * transferred to OSTs at all. MDT only updates atime at close time
1012          * if it's at least 'mdd.*.atime_diff' older.
1013          * All in all, the atime in Lustre does not strictly comply with
1014          * POSIX. Solving this problem needs to send an RPC to MDT for each
1015          * read, this will hurt performance. */
1016         if (LTIME_S(inode->i_atime) < lli->lli_atime)
1017                 LTIME_S(inode->i_atime) = lli->lli_atime;
1018         LTIME_S(inode->i_mtime) = lli->lli_mtime;
1019         LTIME_S(inode->i_ctime) = lli->lli_ctime;
1020
1021         atime = LTIME_S(inode->i_atime);
1022         mtime = LTIME_S(inode->i_mtime);
1023         ctime = LTIME_S(inode->i_ctime);
1024
1025         cl_object_attr_lock(obj);
1026         rc = cl_object_attr_get(env, obj, attr);
1027         cl_object_attr_unlock(obj);
1028
1029         if (rc != 0)
1030                 GOTO(out_size_unlock, rc);
1031
1032         if (atime < attr->cat_atime)
1033                 atime = attr->cat_atime;
1034
1035         if (ctime < attr->cat_ctime)
1036                 ctime = attr->cat_ctime;
1037
1038         if (mtime < attr->cat_mtime)
1039                 mtime = attr->cat_mtime;
1040
1041         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1042                PFID(&lli->lli_fid), attr->cat_size);
1043
1044         i_size_write(inode, attr->cat_size);
1045         inode->i_blocks = attr->cat_blocks;
1046
1047         LTIME_S(inode->i_atime) = atime;
1048         LTIME_S(inode->i_mtime) = mtime;
1049         LTIME_S(inode->i_ctime) = ctime;
1050
1051 out_size_unlock:
1052         ll_inode_size_unlock(inode);
1053
1054         RETURN(rc);
1055 }
1056
1057 static bool file_is_noatime(const struct file *file)
1058 {
1059         const struct vfsmount *mnt = file->f_path.mnt;
1060         const struct inode *inode = file_inode((struct file *)file);
1061
1062         /* Adapted from file_accessed() and touch_atime().*/
1063         if (file->f_flags & O_NOATIME)
1064                 return true;
1065
1066         if (inode->i_flags & S_NOATIME)
1067                 return true;
1068
1069         if (IS_NOATIME(inode))
1070                 return true;
1071
1072         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1073                 return true;
1074
1075         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1076                 return true;
1077
1078         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1079                 return true;
1080
1081         return false;
1082 }
1083
1084 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1085 {
1086         struct inode *inode = file_inode((struct file *)file);
1087
1088         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1089         if (write) {
1090                 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1091                 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1092                                       file->f_flags & O_DIRECT ||
1093                                       IS_SYNC(inode);
1094         }
1095         io->ci_obj     = ll_i2info(inode)->lli_clob;
1096         io->ci_lockreq = CILR_MAYBE;
1097         if (ll_file_nolock(file)) {
1098                 io->ci_lockreq = CILR_NEVER;
1099                 io->ci_no_srvlock = 1;
1100         } else if (file->f_flags & O_APPEND) {
1101                 io->ci_lockreq = CILR_MANDATORY;
1102         }
1103
1104         io->ci_noatime = file_is_noatime(file);
1105 }
1106
1107 static ssize_t
1108 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1109                    struct file *file, enum cl_io_type iot,
1110                    loff_t *ppos, size_t count)
1111 {
1112         struct vvp_io           *vio = vvp_env_io(env);
1113         struct inode            *inode = file_inode(file);
1114         struct ll_inode_info    *lli = ll_i2info(inode);
1115         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1116         struct cl_io            *io;
1117         ssize_t                 result = 0;
1118         int                     rc = 0;
1119         struct range_lock       range;
1120
1121         ENTRY;
1122
1123         CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: %llu, count: %zu\n",
1124                 file_dentry(file)->d_name.name, iot, *ppos, count);
1125
1126 restart:
1127         io = vvp_env_thread_io(env);
1128         ll_io_init(io, file, iot == CIT_WRITE);
1129
1130         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1131                 bool range_locked = false;
1132
1133                 if (file->f_flags & O_APPEND)
1134                         range_lock_init(&range, 0, LUSTRE_EOF);
1135                 else
1136                         range_lock_init(&range, *ppos, *ppos + count - 1);
1137
1138                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1139                 vio->vui_io_subtype = args->via_io_subtype;
1140
1141                 switch (vio->vui_io_subtype) {
1142                 case IO_NORMAL:
1143                         vio->vui_iter = args->u.normal.via_iter;
1144                         vio->vui_iocb = args->u.normal.via_iocb;
1145                         /* Direct IO reads must also take range lock,
1146                          * or multiple reads will try to work on the same pages
1147                          * See LU-6227 for details. */
1148                         if (((iot == CIT_WRITE) ||
1149                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1150                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1151                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1152                                        RL_PARA(&range));
1153                                 rc = range_lock(&lli->lli_write_tree, &range);
1154                                 if (rc < 0)
1155                                         GOTO(out, rc);
1156
1157                                 range_locked = true;
1158                         }
1159                         break;
1160                 case IO_SPLICE:
1161                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1162                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1163                         break;
1164                 default:
1165                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1166                         LBUG();
1167                 }
1168
1169                 ll_cl_add(file, env, io, LCC_RW);
1170                 rc = cl_io_loop(env, io);
1171                 ll_cl_remove(file, env);
1172
1173                 if (range_locked) {
1174                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1175                                RL_PARA(&range));
1176                         range_unlock(&lli->lli_write_tree, &range);
1177                 }
1178         } else {
1179                 /* cl_io_rw_init() handled IO */
1180                 rc = io->ci_result;
1181         }
1182
1183         if (io->ci_nob > 0) {
1184                 result += io->ci_nob;
1185                 count -= io->ci_nob;
1186                 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1187
1188                 /* prepare IO restart */
1189                 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1190                         args->u.normal.via_iter = vio->vui_iter;
1191         }
1192         GOTO(out, rc);
1193 out:
1194         cl_io_fini(env, io);
1195
1196         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1197                 CDEBUG(D_VFSTRACE,
1198                        "%s: restart %s from %lld, count:%zu, result: %zd\n",
1199                        file_dentry(file)->d_name.name,
1200                        iot == CIT_READ ? "read" : "write",
1201                        *ppos, count, result);
1202                 goto restart;
1203         }
1204
1205         if (iot == CIT_READ) {
1206                 if (result > 0)
1207                         ll_stats_ops_tally(ll_i2sbi(inode),
1208                                            LPROC_LL_READ_BYTES, result);
1209         } else if (iot == CIT_WRITE) {
1210                 if (result > 0) {
1211                         ll_stats_ops_tally(ll_i2sbi(inode),
1212                                            LPROC_LL_WRITE_BYTES, result);
1213                         fd->fd_write_failed = false;
1214                 } else if (result == 0 && rc == 0) {
1215                         rc = io->ci_result;
1216                         if (rc < 0)
1217                                 fd->fd_write_failed = true;
1218                         else
1219                                 fd->fd_write_failed = false;
1220                 } else if (rc != -ERESTARTSYS) {
1221                         fd->fd_write_failed = true;
1222                 }
1223         }
1224
1225         CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1226
1227         return result > 0 ? result : rc;
1228 }
1229
1230 /**
1231  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1232  * especially for small I/O.
1233  *
1234  * To serve a read request, CLIO has to create and initialize a cl_io and
1235  * then request DLM lock. This has turned out to have siginificant overhead
1236  * and affects the performance of small I/O dramatically.
1237  *
1238  * It's not necessary to create a cl_io for each I/O. Under the help of read
1239  * ahead, most of the pages being read are already in memory cache and we can
1240  * read those pages directly because if the pages exist, the corresponding DLM
1241  * lock must exist so that page content must be valid.
1242  *
1243  * In fast read implementation, the llite speculatively finds and reads pages
1244  * in memory cache. There are three scenarios for fast read:
1245  *   - If the page exists and is uptodate, kernel VM will provide the data and
1246  *     CLIO won't be intervened;
1247  *   - If the page was brought into memory by read ahead, it will be exported
1248  *     and read ahead parameters will be updated;
1249  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1250  *     it will go back and invoke normal read, i.e., a cl_io will be created
1251  *     and DLM lock will be requested.
1252  *
1253  * POSIX compliance: posix standard states that read is intended to be atomic.
1254  * Lustre read implementation is in line with Linux kernel read implementation
1255  * and neither of them complies with POSIX standard in this matter. Fast read
1256  * doesn't make the situation worse on single node but it may interleave write
1257  * results from multiple nodes due to short read handling in ll_file_aio_read().
1258  *
1259  * \param env - lu_env
1260  * \param iocb - kiocb from kernel
1261  * \param iter - user space buffers where the data will be copied
1262  *
1263  * \retval - number of bytes have been read, or error code if error occurred.
1264  */
1265 static ssize_t
1266 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1267                 struct iov_iter *iter)
1268 {
1269         ssize_t result;
1270
1271         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1272                 return 0;
1273
1274         /* NB: we can't do direct IO for fast read because it will need a lock
1275          * to make IO engine happy. */
1276         if (iocb->ki_filp->f_flags & O_DIRECT)
1277                 return 0;
1278
1279         ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
1280         result = generic_file_read_iter(iocb, iter);
1281         ll_cl_remove(iocb->ki_filp, env);
1282
1283         /* If the first page is not in cache, generic_file_aio_read() will be
1284          * returned with -ENODATA.
1285          * See corresponding code in ll_readpage(). */
1286         if (result == -ENODATA)
1287                 result = 0;
1288
1289         if (result > 0)
1290                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1291                                 LPROC_LL_READ_BYTES, result);
1292
1293         return result;
1294 }
1295
1296 /*
1297  * Read from a file (through the page cache).
1298  */
1299 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1300 {
1301         struct lu_env *env;
1302         struct vvp_io_args *args;
1303         ssize_t result;
1304         ssize_t rc2;
1305         __u16 refcheck;
1306
1307         env = cl_env_get(&refcheck);
1308         if (IS_ERR(env))
1309                 return PTR_ERR(env);
1310
1311         result = ll_do_fast_read(env, iocb, to);
1312         if (result < 0 || iov_iter_count(to) == 0)
1313                 GOTO(out, result);
1314
1315         args = ll_env_args(env, IO_NORMAL);
1316         args->u.normal.via_iter = to;
1317         args->u.normal.via_iocb = iocb;
1318
1319         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1320                                  &iocb->ki_pos, iov_iter_count(to));
1321         if (rc2 > 0)
1322                 result += rc2;
1323         else if (result == 0)
1324                 result = rc2;
1325
1326 out:
1327         cl_env_put(env, &refcheck);
1328         return result;
1329 }
1330
1331 /*
1332  * Write to a file (through the page cache).
1333  */
1334 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1335 {
1336         struct vvp_io_args *args;
1337         struct lu_env *env;
1338         ssize_t result;
1339         __u16 refcheck;
1340
1341         env = cl_env_get(&refcheck);
1342         if (IS_ERR(env))
1343                 return PTR_ERR(env);
1344
1345         args = ll_env_args(env, IO_NORMAL);
1346         args->u.normal.via_iter = from;
1347         args->u.normal.via_iocb = iocb;
1348
1349         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1350                                     &iocb->ki_pos, iov_iter_count(from));
1351         cl_env_put(env, &refcheck);
1352         return result;
1353 }
1354
1355 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1356 /*
1357  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1358  */
1359 static int ll_file_get_iov_count(const struct iovec *iov,
1360                                  unsigned long *nr_segs, size_t *count)
1361 {
1362         size_t cnt = 0;
1363         unsigned long seg;
1364
1365         for (seg = 0; seg < *nr_segs; seg++) {
1366                 const struct iovec *iv = &iov[seg];
1367
1368                 /*
1369                  * If any segment has a negative length, or the cumulative
1370                  * length ever wraps negative then return -EINVAL.
1371                  */
1372                 cnt += iv->iov_len;
1373                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1374                         return -EINVAL;
1375                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1376                         continue;
1377                 if (seg == 0)
1378                         return -EFAULT;
1379                 *nr_segs = seg;
1380                 cnt -= iv->iov_len;     /* This segment is no good */
1381                 break;
1382         }
1383         *count = cnt;
1384         return 0;
1385 }
1386
1387 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1388                                 unsigned long nr_segs, loff_t pos)
1389 {
1390         struct iov_iter to;
1391         size_t iov_count;
1392         ssize_t result;
1393         ENTRY;
1394
1395         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1396         if (result)
1397                 RETURN(result);
1398
1399 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1400         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1401 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1402         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1403 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1404
1405         result = ll_file_read_iter(iocb, &to);
1406
1407         RETURN(result);
1408 }
1409
1410 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1411                             loff_t *ppos)
1412 {
1413         struct lu_env *env;
1414         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1415         struct kiocb  *kiocb;
1416         ssize_t        result;
1417         __u16          refcheck;
1418         ENTRY;
1419
1420         env = cl_env_get(&refcheck);
1421         if (IS_ERR(env))
1422                 RETURN(PTR_ERR(env));
1423
1424         kiocb = &ll_env_info(env)->lti_kiocb;
1425         init_sync_kiocb(kiocb, file);
1426         kiocb->ki_pos = *ppos;
1427 #ifdef HAVE_KIOCB_KI_LEFT
1428         kiocb->ki_left = count;
1429 #elif defined(HAVE_KI_NBYTES)
1430         kiocb->ki_nbytes = count;
1431 #endif
1432
1433         result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1434         *ppos = kiocb->ki_pos;
1435
1436         cl_env_put(env, &refcheck);
1437         RETURN(result);
1438 }
1439
1440 /*
1441  * Write to a file (through the page cache).
1442  * AIO stuff
1443  */
1444 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1445                                  unsigned long nr_segs, loff_t pos)
1446 {
1447         struct iov_iter from;
1448         size_t iov_count;
1449         ssize_t result;
1450         ENTRY;
1451
1452         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1453         if (result)
1454                 RETURN(result);
1455
1456 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1457         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1458 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1459         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1460 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1461
1462         result = ll_file_write_iter(iocb, &from);
1463
1464         RETURN(result);
1465 }
1466
1467 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1468                              size_t count, loff_t *ppos)
1469 {
1470         struct lu_env *env;
1471         struct iovec   iov = { .iov_base = (void __user *)buf,
1472                                .iov_len = count };
1473         struct kiocb  *kiocb;
1474         ssize_t        result;
1475         __u16          refcheck;
1476         ENTRY;
1477
1478         env = cl_env_get(&refcheck);
1479         if (IS_ERR(env))
1480                 RETURN(PTR_ERR(env));
1481
1482         kiocb = &ll_env_info(env)->lti_kiocb;
1483         init_sync_kiocb(kiocb, file);
1484         kiocb->ki_pos = *ppos;
1485 #ifdef HAVE_KIOCB_KI_LEFT
1486         kiocb->ki_left = count;
1487 #elif defined(HAVE_KI_NBYTES)
1488         kiocb->ki_nbytes = count;
1489 #endif
1490
1491         result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1492         *ppos = kiocb->ki_pos;
1493
1494         cl_env_put(env, &refcheck);
1495         RETURN(result);
1496 }
1497 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1498
1499 /*
1500  * Send file content (through pagecache) somewhere with helper
1501  */
1502 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1503                                    struct pipe_inode_info *pipe, size_t count,
1504                                    unsigned int flags)
1505 {
1506         struct lu_env      *env;
1507         struct vvp_io_args *args;
1508         ssize_t             result;
1509         __u16               refcheck;
1510         ENTRY;
1511
1512         env = cl_env_get(&refcheck);
1513         if (IS_ERR(env))
1514                 RETURN(PTR_ERR(env));
1515
1516         args = ll_env_args(env, IO_SPLICE);
1517         args->u.splice.via_pipe = pipe;
1518         args->u.splice.via_flags = flags;
1519
1520         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1521         cl_env_put(env, &refcheck);
1522         RETURN(result);
1523 }
1524
1525 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1526                              __u64  flags, struct lov_user_md *lum,
1527                              int lum_size)
1528 {
1529         struct lookup_intent oit = {
1530                 .it_op = IT_OPEN,
1531                 .it_flags = flags | MDS_OPEN_BY_FID,
1532         };
1533         int rc;
1534         ENTRY;
1535
1536         ll_inode_size_lock(inode);
1537         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1538         if (rc < 0)
1539                 GOTO(out_unlock, rc);
1540
1541         ll_release_openhandle(file_dentry(file), &oit);
1542
1543 out_unlock:
1544         ll_inode_size_unlock(inode);
1545         ll_intent_release(&oit);
1546         cl_lov_delay_create_clear(&file->f_flags);
1547
1548         RETURN(rc);
1549 }
1550
1551 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1552                              struct lov_mds_md **lmmp, int *lmm_size,
1553                              struct ptlrpc_request **request)
1554 {
1555         struct ll_sb_info *sbi = ll_i2sbi(inode);
1556         struct mdt_body  *body;
1557         struct lov_mds_md *lmm = NULL;
1558         struct ptlrpc_request *req = NULL;
1559         struct md_op_data *op_data;
1560         int rc, lmmsize;
1561
1562         rc = ll_get_default_mdsize(sbi, &lmmsize);
1563         if (rc)
1564                 RETURN(rc);
1565
1566         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1567                                      strlen(filename), lmmsize,
1568                                      LUSTRE_OPC_ANY, NULL);
1569         if (IS_ERR(op_data))
1570                 RETURN(PTR_ERR(op_data));
1571
1572         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1573         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1574         ll_finish_md_op_data(op_data);
1575         if (rc < 0) {
1576                 CDEBUG(D_INFO, "md_getattr_name failed "
1577                        "on %s: rc %d\n", filename, rc);
1578                 GOTO(out, rc);
1579         }
1580
1581         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1582         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1583
1584         lmmsize = body->mbo_eadatasize;
1585
1586         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1587                         lmmsize == 0) {
1588                 GOTO(out, rc = -ENODATA);
1589         }
1590
1591         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1592         LASSERT(lmm != NULL);
1593
1594         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1595             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1596                 GOTO(out, rc = -EPROTO);
1597         }
1598
1599         /*
1600          * This is coming from the MDS, so is probably in
1601          * little endian.  We convert it to host endian before
1602          * passing it to userspace.
1603          */
1604         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1605                 int stripe_count;
1606
1607                 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1608                 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1609                         stripe_count = 0;
1610
1611                 /* if function called for directory - we should
1612                  * avoid swab not existent lsm objects */
1613                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1614                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1615                         if (S_ISREG(body->mbo_mode))
1616                                 lustre_swab_lov_user_md_objects(
1617                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1618                                     stripe_count);
1619                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1620                         lustre_swab_lov_user_md_v3(
1621                                 (struct lov_user_md_v3 *)lmm);
1622                         if (S_ISREG(body->mbo_mode))
1623                                 lustre_swab_lov_user_md_objects(
1624                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1625                                  stripe_count);
1626                 }
1627         }
1628
1629 out:
1630         *lmmp = lmm;
1631         *lmm_size = lmmsize;
1632         *request = req;
1633         return rc;
1634 }
1635
1636 static int ll_lov_setea(struct inode *inode, struct file *file,
1637                             unsigned long arg)
1638 {
1639         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1640         struct lov_user_md      *lump;
1641         int                      lum_size = sizeof(struct lov_user_md) +
1642                                             sizeof(struct lov_user_ost_data);
1643         int                      rc;
1644         ENTRY;
1645
1646         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1647                 RETURN(-EPERM);
1648
1649         OBD_ALLOC_LARGE(lump, lum_size);
1650         if (lump == NULL)
1651                 RETURN(-ENOMEM);
1652
1653         if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1654                 GOTO(out_lump, rc = -EFAULT);
1655
1656         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1657
1658 out_lump:
1659         OBD_FREE_LARGE(lump, lum_size);
1660         RETURN(rc);
1661 }
1662
1663 static int ll_file_getstripe(struct inode *inode,
1664                              struct lov_user_md __user *lum)
1665 {
1666         struct lu_env   *env;
1667         __u16           refcheck;
1668         int             rc;
1669         ENTRY;
1670
1671         env = cl_env_get(&refcheck);
1672         if (IS_ERR(env))
1673                 RETURN(PTR_ERR(env));
1674
1675         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1676         cl_env_put(env, &refcheck);
1677         RETURN(rc);
1678 }
1679
1680 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1681                             unsigned long arg)
1682 {
1683         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1684         struct lov_user_md        *klum;
1685         int                        lum_size, rc;
1686         __u64                      flags = FMODE_WRITE;
1687         ENTRY;
1688
1689         rc = ll_copy_user_md(lum, &klum);
1690         if (rc < 0)
1691                 RETURN(rc);
1692
1693         lum_size = rc;
1694         rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1695         if (rc == 0) {
1696                 __u32 gen;
1697
1698                 put_user(0, &lum->lmm_stripe_count);
1699
1700                 ll_layout_refresh(inode, &gen);
1701                 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1702         }
1703
1704         OBD_FREE(klum, lum_size);
1705         RETURN(rc);
1706 }
1707
1708 static int
1709 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1710 {
1711         struct ll_inode_info   *lli = ll_i2info(inode);
1712         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1713         struct ll_grouplock     grouplock;
1714         int                     rc;
1715         ENTRY;
1716
1717         if (arg == 0) {
1718                 CWARN("group id for group lock must not be 0\n");
1719                 RETURN(-EINVAL);
1720         }
1721
1722         if (ll_file_nolock(file))
1723                 RETURN(-EOPNOTSUPP);
1724
1725         spin_lock(&lli->lli_lock);
1726         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1727                 CWARN("group lock already existed with gid %lu\n",
1728                       fd->fd_grouplock.lg_gid);
1729                 spin_unlock(&lli->lli_lock);
1730                 RETURN(-EINVAL);
1731         }
1732         LASSERT(fd->fd_grouplock.lg_lock == NULL);
1733         spin_unlock(&lli->lli_lock);
1734
1735         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1736                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1737         if (rc)
1738                 RETURN(rc);
1739
1740         spin_lock(&lli->lli_lock);
1741         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1742                 spin_unlock(&lli->lli_lock);
1743                 CERROR("another thread just won the race\n");
1744                 cl_put_grouplock(&grouplock);
1745                 RETURN(-EINVAL);
1746         }
1747
1748         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1749         fd->fd_grouplock = grouplock;
1750         spin_unlock(&lli->lli_lock);
1751
1752         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1753         RETURN(0);
1754 }
1755
1756 static int ll_put_grouplock(struct inode *inode, struct file *file,
1757                             unsigned long arg)
1758 {
1759         struct ll_inode_info   *lli = ll_i2info(inode);
1760         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1761         struct ll_grouplock     grouplock;
1762         ENTRY;
1763
1764         spin_lock(&lli->lli_lock);
1765         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1766                 spin_unlock(&lli->lli_lock);
1767                 CWARN("no group lock held\n");
1768                 RETURN(-EINVAL);
1769         }
1770
1771         LASSERT(fd->fd_grouplock.lg_lock != NULL);
1772
1773         if (fd->fd_grouplock.lg_gid != arg) {
1774                 CWARN("group lock %lu doesn't match current id %lu\n",
1775                       arg, fd->fd_grouplock.lg_gid);
1776                 spin_unlock(&lli->lli_lock);
1777                 RETURN(-EINVAL);
1778         }
1779
1780         grouplock = fd->fd_grouplock;
1781         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1782         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1783         spin_unlock(&lli->lli_lock);
1784
1785         cl_put_grouplock(&grouplock);
1786         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1787         RETURN(0);
1788 }
1789
1790 /**
1791  * Close inode open handle
1792  *
1793  * \param dentry [in]     dentry which contains the inode
1794  * \param it     [in,out] intent which contains open info and result
1795  *
1796  * \retval 0     success
1797  * \retval <0    failure
1798  */
1799 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1800 {
1801         struct inode *inode = dentry->d_inode;
1802         struct obd_client_handle *och;
1803         int rc;
1804         ENTRY;
1805
1806         LASSERT(inode);
1807
1808         /* Root ? Do nothing. */
1809         if (dentry->d_inode->i_sb->s_root == dentry)
1810                 RETURN(0);
1811
1812         /* No open handle to close? Move away */
1813         if (!it_disposition(it, DISP_OPEN_OPEN))
1814                 RETURN(0);
1815
1816         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1817
1818         OBD_ALLOC(och, sizeof(*och));
1819         if (!och)
1820                 GOTO(out, rc = -ENOMEM);
1821
1822         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1823
1824         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1825 out:
1826         /* this one is in place of ll_file_open */
1827         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1828                 ptlrpc_req_finished(it->it_request);
1829                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1830         }
1831         RETURN(rc);
1832 }
1833
1834 /**
1835  * Get size for inode for which FIEMAP mapping is requested.
1836  * Make the FIEMAP get_info call and returns the result.
1837  * \param fiemap        kernel buffer to hold extens
1838  * \param num_bytes     kernel buffer size
1839  */
1840 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1841                         size_t num_bytes)
1842 {
1843         struct lu_env                   *env;
1844         __u16                           refcheck;
1845         int                             rc = 0;
1846         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
1847         ENTRY;
1848
1849         /* Checks for fiemap flags */
1850         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1851                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1852                 return -EBADR;
1853         }
1854
1855         /* Check for FIEMAP_FLAG_SYNC */
1856         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1857                 rc = filemap_fdatawrite(inode->i_mapping);
1858                 if (rc)
1859                         return rc;
1860         }
1861
1862         env = cl_env_get(&refcheck);
1863         if (IS_ERR(env))
1864                 RETURN(PTR_ERR(env));
1865
1866         if (i_size_read(inode) == 0) {
1867                 rc = ll_glimpse_size(inode);
1868                 if (rc)
1869                         GOTO(out, rc);
1870         }
1871
1872         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1873         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1874         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1875
1876         /* If filesize is 0, then there would be no objects for mapping */
1877         if (fmkey.lfik_oa.o_size == 0) {
1878                 fiemap->fm_mapped_extents = 0;
1879                 GOTO(out, rc = 0);
1880         }
1881
1882         fmkey.lfik_fiemap = *fiemap;
1883
1884         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1885                               &fmkey, fiemap, &num_bytes);
1886 out:
1887         cl_env_put(env, &refcheck);
1888         RETURN(rc);
1889 }
1890
1891 int ll_fid2path(struct inode *inode, void __user *arg)
1892 {
1893         struct obd_export       *exp = ll_i2mdexp(inode);
1894         const struct getinfo_fid2path __user *gfin = arg;
1895         __u32                    pathlen;
1896         struct getinfo_fid2path *gfout;
1897         size_t                   outsize;
1898         int                      rc;
1899
1900         ENTRY;
1901
1902         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1903             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1904                 RETURN(-EPERM);
1905
1906         /* Only need to get the buflen */
1907         if (get_user(pathlen, &gfin->gf_pathlen))
1908                 RETURN(-EFAULT);
1909
1910         if (pathlen > PATH_MAX)
1911                 RETURN(-EINVAL);
1912
1913         outsize = sizeof(*gfout) + pathlen;
1914         OBD_ALLOC(gfout, outsize);
1915         if (gfout == NULL)
1916                 RETURN(-ENOMEM);
1917
1918         if (copy_from_user(gfout, arg, sizeof(*gfout)))
1919                 GOTO(gf_free, rc = -EFAULT);
1920         /* append root FID after gfout to let MDT know the root FID so that it
1921          * can lookup the correct path, this is mainly for fileset.
1922          * old server without fileset mount support will ignore this. */
1923         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1924
1925         /* Call mdc_iocontrol */
1926         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1927         if (rc != 0)
1928                 GOTO(gf_free, rc);
1929
1930         if (copy_to_user(arg, gfout, outsize))
1931                 rc = -EFAULT;
1932
1933 gf_free:
1934         OBD_FREE(gfout, outsize);
1935         RETURN(rc);
1936 }
1937
1938 /*
1939  * Read the data_version for inode.
1940  *
1941  * This value is computed using stripe object version on OST.
1942  * Version is computed using server side locking.
1943  *
1944  * @param flags if do sync on the OST side;
1945  *              0: no sync
1946  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1947  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1948  */
1949 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1950 {
1951         struct cl_object *obj = ll_i2info(inode)->lli_clob;
1952         struct lu_env *env;
1953         struct cl_io *io;
1954         __u16  refcheck;
1955         int result;
1956
1957         ENTRY;
1958
1959         /* If no file object initialized, we consider its version is 0. */
1960         if (obj == NULL) {
1961                 *data_version = 0;
1962                 RETURN(0);
1963         }
1964
1965         env = cl_env_get(&refcheck);
1966         if (IS_ERR(env))
1967                 RETURN(PTR_ERR(env));
1968
1969         io = vvp_env_thread_io(env);
1970         io->ci_obj = obj;
1971         io->u.ci_data_version.dv_data_version = 0;
1972         io->u.ci_data_version.dv_flags = flags;
1973
1974 restart:
1975         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1976                 result = cl_io_loop(env, io);
1977         else
1978                 result = io->ci_result;
1979
1980         *data_version = io->u.ci_data_version.dv_data_version;
1981
1982         cl_io_fini(env, io);
1983
1984         if (unlikely(io->ci_need_restart))
1985                 goto restart;
1986
1987         cl_env_put(env, &refcheck);
1988
1989         RETURN(result);
1990 }
1991
1992 /*
1993  * Trigger a HSM release request for the provided inode.
1994  */
1995 int ll_hsm_release(struct inode *inode)
1996 {
1997         struct lu_env *env;
1998         struct obd_client_handle *och = NULL;
1999         __u64 data_version = 0;
2000         int rc;
2001         __u16 refcheck;
2002         ENTRY;
2003
2004         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2005                ll_get_fsname(inode->i_sb, NULL, 0),
2006                PFID(&ll_i2info(inode)->lli_fid));
2007
2008         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2009         if (IS_ERR(och))
2010                 GOTO(out, rc = PTR_ERR(och));
2011
2012         /* Grab latest data_version and [am]time values */
2013         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2014         if (rc != 0)
2015                 GOTO(out, rc);
2016
2017         env = cl_env_get(&refcheck);
2018         if (IS_ERR(env))
2019                 GOTO(out, rc = PTR_ERR(env));
2020
2021         ll_merge_attr(env, inode);
2022         cl_env_put(env, &refcheck);
2023
2024         /* Release the file.
2025          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2026          * we still need it to pack l_remote_handle to MDT. */
2027         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2028                                        &data_version);
2029         och = NULL;
2030
2031         EXIT;
2032 out:
2033         if (och != NULL && !IS_ERR(och)) /* close the file */
2034                 ll_lease_close(och, inode, NULL);
2035
2036         return rc;
2037 }
2038
2039 struct ll_swap_stack {
2040         __u64                    dv1;
2041         __u64                    dv2;
2042         struct inode            *inode1;
2043         struct inode            *inode2;
2044         bool                     check_dv1;
2045         bool                     check_dv2;
2046 };
2047
2048 static int ll_swap_layouts(struct file *file1, struct file *file2,
2049                            struct lustre_swap_layouts *lsl)
2050 {
2051         struct mdc_swap_layouts  msl;
2052         struct md_op_data       *op_data;
2053         __u32                    gid;
2054         __u64                    dv;
2055         struct ll_swap_stack    *llss = NULL;
2056         int                      rc;
2057
2058         OBD_ALLOC_PTR(llss);
2059         if (llss == NULL)
2060                 RETURN(-ENOMEM);
2061
2062         llss->inode1 = file_inode(file1);
2063         llss->inode2 = file_inode(file2);
2064
2065         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2066         if (rc < 0)
2067                 GOTO(free, rc);
2068
2069         /* we use 2 bool because it is easier to swap than 2 bits */
2070         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2071                 llss->check_dv1 = true;
2072
2073         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2074                 llss->check_dv2 = true;
2075
2076         /* we cannot use lsl->sl_dvX directly because we may swap them */
2077         llss->dv1 = lsl->sl_dv1;
2078         llss->dv2 = lsl->sl_dv2;
2079
2080         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2081         if (rc == 0) /* same file, done! */
2082                 GOTO(free, rc);
2083
2084         if (rc < 0) { /* sequentialize it */
2085                 swap(llss->inode1, llss->inode2);
2086                 swap(file1, file2);
2087                 swap(llss->dv1, llss->dv2);
2088                 swap(llss->check_dv1, llss->check_dv2);
2089         }
2090
2091         gid = lsl->sl_gid;
2092         if (gid != 0) { /* application asks to flush dirty cache */
2093                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2094                 if (rc < 0)
2095                         GOTO(free, rc);
2096
2097                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2098                 if (rc < 0) {
2099                         ll_put_grouplock(llss->inode1, file1, gid);
2100                         GOTO(free, rc);
2101                 }
2102         }
2103
2104         /* ultimate check, before swaping the layouts we check if
2105          * dataversion has changed (if requested) */
2106         if (llss->check_dv1) {
2107                 rc = ll_data_version(llss->inode1, &dv, 0);
2108                 if (rc)
2109                         GOTO(putgl, rc);
2110                 if (dv != llss->dv1)
2111                         GOTO(putgl, rc = -EAGAIN);
2112         }
2113
2114         if (llss->check_dv2) {
2115                 rc = ll_data_version(llss->inode2, &dv, 0);
2116                 if (rc)
2117                         GOTO(putgl, rc);
2118                 if (dv != llss->dv2)
2119                         GOTO(putgl, rc = -EAGAIN);
2120         }
2121
2122         /* struct md_op_data is used to send the swap args to the mdt
2123          * only flags is missing, so we use struct mdc_swap_layouts
2124          * through the md_op_data->op_data */
2125         /* flags from user space have to be converted before they are send to
2126          * server, no flag is sent today, they are only used on the client */
2127         msl.msl_flags = 0;
2128         rc = -ENOMEM;
2129         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2130                                      0, LUSTRE_OPC_ANY, &msl);
2131         if (IS_ERR(op_data))
2132                 GOTO(free, rc = PTR_ERR(op_data));
2133
2134         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2135                            sizeof(*op_data), op_data, NULL);
2136         ll_finish_md_op_data(op_data);
2137
2138         if (rc < 0)
2139                 GOTO(putgl, rc);
2140
2141 putgl:
2142         if (gid != 0) {
2143                 ll_put_grouplock(llss->inode2, file2, gid);
2144                 ll_put_grouplock(llss->inode1, file1, gid);
2145         }
2146
2147 free:
2148         if (llss != NULL)
2149                 OBD_FREE_PTR(llss);
2150
2151         RETURN(rc);
2152 }
2153
2154 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2155 {
2156         struct md_op_data       *op_data;
2157         int                      rc;
2158         ENTRY;
2159
2160         /* Detect out-of range masks */
2161         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2162                 RETURN(-EINVAL);
2163
2164         /* Non-root users are forbidden to set or clear flags which are
2165          * NOT defined in HSM_USER_MASK. */
2166         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2167             !cfs_capable(CFS_CAP_SYS_ADMIN))
2168                 RETURN(-EPERM);
2169
2170         /* Detect out-of range archive id */
2171         if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2172             (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2173                 RETURN(-EINVAL);
2174
2175         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2176                                      LUSTRE_OPC_ANY, hss);
2177         if (IS_ERR(op_data))
2178                 RETURN(PTR_ERR(op_data));
2179
2180         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2181                            sizeof(*op_data), op_data, NULL);
2182
2183         ll_finish_md_op_data(op_data);
2184
2185         RETURN(rc);
2186 }
2187
2188 static int ll_hsm_import(struct inode *inode, struct file *file,
2189                          struct hsm_user_import *hui)
2190 {
2191         struct hsm_state_set    *hss = NULL;
2192         struct iattr            *attr = NULL;
2193         int                      rc;
2194         ENTRY;
2195
2196         if (!S_ISREG(inode->i_mode))
2197                 RETURN(-EINVAL);
2198
2199         /* set HSM flags */
2200         OBD_ALLOC_PTR(hss);
2201         if (hss == NULL)
2202                 GOTO(out, rc = -ENOMEM);
2203
2204         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2205         hss->hss_archive_id = hui->hui_archive_id;
2206         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2207         rc = ll_hsm_state_set(inode, hss);
2208         if (rc != 0)
2209                 GOTO(out, rc);
2210
2211         OBD_ALLOC_PTR(attr);
2212         if (attr == NULL)
2213                 GOTO(out, rc = -ENOMEM);
2214
2215         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2216         attr->ia_mode |= S_IFREG;
2217         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2218         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2219         attr->ia_size = hui->hui_size;
2220         attr->ia_mtime.tv_sec = hui->hui_mtime;
2221         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2222         attr->ia_atime.tv_sec = hui->hui_atime;
2223         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2224
2225         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2226                          ATTR_UID | ATTR_GID |
2227                          ATTR_MTIME | ATTR_MTIME_SET |
2228                          ATTR_ATIME | ATTR_ATIME_SET;
2229
2230         inode_lock(inode);
2231
2232         rc = ll_setattr_raw(file_dentry(file), attr, true);
2233         if (rc == -ENODATA)
2234                 rc = 0;
2235
2236         inode_unlock(inode);
2237
2238 out:
2239         if (hss != NULL)
2240                 OBD_FREE_PTR(hss);
2241
2242         if (attr != NULL)
2243                 OBD_FREE_PTR(attr);
2244
2245         RETURN(rc);
2246 }
2247
2248 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2249 {
2250         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2251                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2252 }
2253
2254 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2255 {
2256         struct inode *inode = file_inode(file);
2257         struct iattr ia = {
2258                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2259                             ATTR_MTIME | ATTR_MTIME_SET |
2260                             ATTR_CTIME | ATTR_CTIME_SET,
2261                 .ia_atime = {
2262                         .tv_sec = lfu->lfu_atime_sec,
2263                         .tv_nsec = lfu->lfu_atime_nsec,
2264                 },
2265                 .ia_mtime = {
2266                         .tv_sec = lfu->lfu_mtime_sec,
2267                         .tv_nsec = lfu->lfu_mtime_nsec,
2268                 },
2269                 .ia_ctime = {
2270                         .tv_sec = lfu->lfu_ctime_sec,
2271                         .tv_nsec = lfu->lfu_ctime_nsec,
2272                 },
2273         };
2274         int rc;
2275         ENTRY;
2276
2277         if (!capable(CAP_SYS_ADMIN))
2278                 RETURN(-EPERM);
2279
2280         if (!S_ISREG(inode->i_mode))
2281                 RETURN(-EINVAL);
2282
2283         inode_lock(inode);
2284         rc = ll_setattr_raw(file_dentry(file), &ia, false);
2285         inode_unlock(inode);
2286
2287         RETURN(rc);
2288 }
2289
2290 /*
2291  * Give file access advices
2292  *
2293  * The ladvise interface is similar to Linux fadvise() system call, except it
2294  * forwards the advices directly from Lustre client to server. The server side
2295  * codes will apply appropriate read-ahead and caching techniques for the
2296  * corresponding files.
2297  *
2298  * A typical workload for ladvise is e.g. a bunch of different clients are
2299  * doing small random reads of a file, so prefetching pages into OSS cache
2300  * with big linear reads before the random IO is a net benefit. Fetching
2301  * all that data into each client cache with fadvise() may not be, due to
2302  * much more data being sent to the client.
2303  */
2304 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2305                       struct llapi_lu_ladvise *ladvise)
2306 {
2307         struct lu_env *env;
2308         struct cl_io *io;
2309         struct cl_ladvise_io *lio;
2310         int rc;
2311         __u16 refcheck;
2312         ENTRY;
2313
2314         env = cl_env_get(&refcheck);
2315         if (IS_ERR(env))
2316                 RETURN(PTR_ERR(env));
2317
2318         io = vvp_env_thread_io(env);
2319         io->ci_obj = ll_i2info(inode)->lli_clob;
2320
2321         /* initialize parameters for ladvise */
2322         lio = &io->u.ci_ladvise;
2323         lio->li_start = ladvise->lla_start;
2324         lio->li_end = ladvise->lla_end;
2325         lio->li_fid = ll_inode2fid(inode);
2326         lio->li_advice = ladvise->lla_advice;
2327         lio->li_flags = flags;
2328
2329         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2330                 rc = cl_io_loop(env, io);
2331         else
2332                 rc = io->ci_result;
2333
2334         cl_io_fini(env, io);
2335         cl_env_put(env, &refcheck);
2336         RETURN(rc);
2337 }
2338
2339 static long
2340 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2341 {
2342         struct inode            *inode = file_inode(file);
2343         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
2344         int                      flags, rc;
2345         ENTRY;
2346
2347         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2348                PFID(ll_inode2fid(inode)), inode, cmd);
2349         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2350
2351         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2352         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2353                 RETURN(-ENOTTY);
2354
2355         switch(cmd) {
2356         case LL_IOC_GETFLAGS:
2357                 /* Get the current value of the file flags */
2358                 return put_user(fd->fd_flags, (int __user *)arg);
2359         case LL_IOC_SETFLAGS:
2360         case LL_IOC_CLRFLAGS:
2361                 /* Set or clear specific file flags */
2362                 /* XXX This probably needs checks to ensure the flags are
2363                  *     not abused, and to handle any flag side effects.
2364                  */
2365                 if (get_user(flags, (int __user *) arg))
2366                         RETURN(-EFAULT);
2367
2368                 if (cmd == LL_IOC_SETFLAGS) {
2369                         if ((flags & LL_FILE_IGNORE_LOCK) &&
2370                             !(file->f_flags & O_DIRECT)) {
2371                                 CERROR("%s: unable to disable locking on "
2372                                        "non-O_DIRECT file\n", current->comm);
2373                                 RETURN(-EINVAL);
2374                         }
2375
2376                         fd->fd_flags |= flags;
2377                 } else {
2378                         fd->fd_flags &= ~flags;
2379                 }
2380                 RETURN(0);
2381         case LL_IOC_LOV_SETSTRIPE:
2382                 RETURN(ll_lov_setstripe(inode, file, arg));
2383         case LL_IOC_LOV_SETEA:
2384                 RETURN(ll_lov_setea(inode, file, arg));
2385         case LL_IOC_LOV_SWAP_LAYOUTS: {
2386                 struct file *file2;
2387                 struct lustre_swap_layouts lsl;
2388
2389                 if (copy_from_user(&lsl, (char __user *)arg,
2390                                        sizeof(struct lustre_swap_layouts)))
2391                         RETURN(-EFAULT);
2392
2393                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2394                         RETURN(-EPERM);
2395
2396                 file2 = fget(lsl.sl_fd);
2397                 if (file2 == NULL)
2398                         RETURN(-EBADF);
2399
2400                 /* O_WRONLY or O_RDWR */
2401                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2402                         GOTO(out, rc = -EPERM);
2403
2404                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2405                         struct inode                    *inode2;
2406                         struct ll_inode_info            *lli;
2407                         struct obd_client_handle        *och = NULL;
2408
2409                         if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2410                                 GOTO(out, rc = -EINVAL);
2411
2412                         lli = ll_i2info(inode);
2413                         mutex_lock(&lli->lli_och_mutex);
2414                         if (fd->fd_lease_och != NULL) {
2415                                 och = fd->fd_lease_och;
2416                                 fd->fd_lease_och = NULL;
2417                         }
2418                         mutex_unlock(&lli->lli_och_mutex);
2419                         if (och == NULL)
2420                                 GOTO(out, rc = -ENOLCK);
2421                         inode2 = file_inode(file2);
2422                         rc = ll_swap_layouts_close(och, inode, inode2);
2423                 } else {
2424                         rc = ll_swap_layouts(file, file2, &lsl);
2425                 }
2426 out:
2427                 fput(file2);
2428                 RETURN(rc);
2429         }
2430         case LL_IOC_LOV_GETSTRIPE:
2431                 RETURN(ll_file_getstripe(inode,
2432                                          (struct lov_user_md __user *)arg));
2433         case FSFILT_IOC_GETFLAGS:
2434         case FSFILT_IOC_SETFLAGS:
2435                 RETURN(ll_iocontrol(inode, file, cmd, arg));
2436         case FSFILT_IOC_GETVERSION_OLD:
2437         case FSFILT_IOC_GETVERSION:
2438                 RETURN(put_user(inode->i_generation, (int __user *)arg));
2439         case LL_IOC_GROUP_LOCK:
2440                 RETURN(ll_get_grouplock(inode, file, arg));
2441         case LL_IOC_GROUP_UNLOCK:
2442                 RETURN(ll_put_grouplock(inode, file, arg));
2443         case IOC_OBD_STATFS:
2444                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2445
2446         /* We need to special case any other ioctls we want to handle,
2447          * to send them to the MDS/OST as appropriate and to properly
2448          * network encode the arg field.
2449         case FSFILT_IOC_SETVERSION_OLD:
2450         case FSFILT_IOC_SETVERSION:
2451         */
2452         case LL_IOC_FLUSHCTX:
2453                 RETURN(ll_flush_ctx(inode));
2454         case LL_IOC_PATH2FID: {
2455                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2456                                  sizeof(struct lu_fid)))
2457                         RETURN(-EFAULT);
2458
2459                 RETURN(0);
2460         }
2461         case LL_IOC_GETPARENT:
2462                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2463
2464         case OBD_IOC_FID2PATH:
2465                 RETURN(ll_fid2path(inode, (void __user *)arg));
2466         case LL_IOC_DATA_VERSION: {
2467                 struct ioc_data_version idv;
2468                 int rc;
2469
2470                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2471                         RETURN(-EFAULT);
2472
2473                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2474                 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2475
2476                 if (rc == 0 &&
2477                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2478                         RETURN(-EFAULT);
2479
2480                 RETURN(rc);
2481         }
2482
2483         case LL_IOC_GET_MDTIDX: {
2484                 int mdtidx;
2485
2486                 mdtidx = ll_get_mdt_idx(inode);
2487                 if (mdtidx < 0)
2488                         RETURN(mdtidx);
2489
2490                 if (put_user((int)mdtidx, (int __user *)arg))
2491                         RETURN(-EFAULT);
2492
2493                 RETURN(0);
2494         }
2495         case OBD_IOC_GETDTNAME:
2496         case OBD_IOC_GETMDNAME:
2497                 RETURN(ll_get_obd_name(inode, cmd, arg));
2498         case LL_IOC_HSM_STATE_GET: {
2499                 struct md_op_data       *op_data;
2500                 struct hsm_user_state   *hus;
2501                 int                      rc;
2502
2503                 OBD_ALLOC_PTR(hus);
2504                 if (hus == NULL)
2505                         RETURN(-ENOMEM);
2506
2507                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2508                                              LUSTRE_OPC_ANY, hus);
2509                 if (IS_ERR(op_data)) {
2510                         OBD_FREE_PTR(hus);
2511                         RETURN(PTR_ERR(op_data));
2512                 }
2513
2514                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2515                                    op_data, NULL);
2516
2517                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2518                         rc = -EFAULT;
2519
2520                 ll_finish_md_op_data(op_data);
2521                 OBD_FREE_PTR(hus);
2522                 RETURN(rc);
2523         }
2524         case LL_IOC_HSM_STATE_SET: {
2525                 struct hsm_state_set    *hss;
2526                 int                      rc;
2527
2528                 OBD_ALLOC_PTR(hss);
2529                 if (hss == NULL)
2530                         RETURN(-ENOMEM);
2531
2532                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2533                         OBD_FREE_PTR(hss);
2534                         RETURN(-EFAULT);
2535                 }
2536
2537                 rc = ll_hsm_state_set(inode, hss);
2538
2539                 OBD_FREE_PTR(hss);
2540                 RETURN(rc);
2541         }
2542         case LL_IOC_HSM_ACTION: {
2543                 struct md_op_data               *op_data;
2544                 struct hsm_current_action       *hca;
2545                 int                              rc;
2546
2547                 OBD_ALLOC_PTR(hca);
2548                 if (hca == NULL)
2549                         RETURN(-ENOMEM);
2550
2551                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2552                                              LUSTRE_OPC_ANY, hca);
2553                 if (IS_ERR(op_data)) {
2554                         OBD_FREE_PTR(hca);
2555                         RETURN(PTR_ERR(op_data));
2556                 }
2557
2558                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2559                                    op_data, NULL);
2560
2561                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2562                         rc = -EFAULT;
2563
2564                 ll_finish_md_op_data(op_data);
2565                 OBD_FREE_PTR(hca);
2566                 RETURN(rc);
2567         }
2568         case LL_IOC_SET_LEASE: {
2569                 struct ll_inode_info *lli = ll_i2info(inode);
2570                 struct obd_client_handle *och = NULL;
2571                 bool lease_broken;
2572                 fmode_t fmode;
2573
2574                 switch (arg) {
2575                 case LL_LEASE_WRLCK:
2576                         if (!(file->f_mode & FMODE_WRITE))
2577                                 RETURN(-EPERM);
2578                         fmode = FMODE_WRITE;
2579                         break;
2580                 case LL_LEASE_RDLCK:
2581                         if (!(file->f_mode & FMODE_READ))
2582                                 RETURN(-EPERM);
2583                         fmode = FMODE_READ;
2584                         break;
2585                 case LL_LEASE_UNLCK:
2586                         mutex_lock(&lli->lli_och_mutex);
2587                         if (fd->fd_lease_och != NULL) {
2588                                 och = fd->fd_lease_och;
2589                                 fd->fd_lease_och = NULL;
2590                         }
2591                         mutex_unlock(&lli->lli_och_mutex);
2592
2593                         if (och == NULL)
2594                                 RETURN(-ENOLCK);
2595
2596                         fmode = och->och_flags;
2597                         rc = ll_lease_close(och, inode, &lease_broken);
2598                         if (rc < 0)
2599                                 RETURN(rc);
2600
2601                         rc = ll_lease_och_release(inode, file);
2602                         if (rc < 0)
2603                                 RETURN(rc);
2604
2605                         if (lease_broken)
2606                                 fmode = 0;
2607
2608                         RETURN(ll_lease_type_from_fmode(fmode));
2609                 default:
2610                         RETURN(-EINVAL);
2611                 }
2612
2613                 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2614
2615                 /* apply for lease */
2616                 och = ll_lease_open(inode, file, fmode, 0);
2617                 if (IS_ERR(och))
2618                         RETURN(PTR_ERR(och));
2619
2620                 rc = 0;
2621                 mutex_lock(&lli->lli_och_mutex);
2622                 if (fd->fd_lease_och == NULL) {
2623                         fd->fd_lease_och = och;
2624                         och = NULL;
2625                 }
2626                 mutex_unlock(&lli->lli_och_mutex);
2627                 if (och != NULL) {
2628                         /* impossible now that only excl is supported for now */
2629                         ll_lease_close(och, inode, &lease_broken);
2630                         rc = -EBUSY;
2631                 }
2632                 RETURN(rc);
2633         }
2634         case LL_IOC_GET_LEASE: {
2635                 struct ll_inode_info *lli = ll_i2info(inode);
2636                 struct ldlm_lock *lock = NULL;
2637                 fmode_t fmode = 0;
2638
2639                 mutex_lock(&lli->lli_och_mutex);
2640                 if (fd->fd_lease_och != NULL) {
2641                         struct obd_client_handle *och = fd->fd_lease_och;
2642
2643                         lock = ldlm_handle2lock(&och->och_lease_handle);
2644                         if (lock != NULL) {
2645                                 lock_res_and_lock(lock);
2646                                 if (!ldlm_is_cancel(lock))
2647                                         fmode = och->och_flags;
2648
2649                                 unlock_res_and_lock(lock);
2650                                 LDLM_LOCK_PUT(lock);
2651                         }
2652                 }
2653                 mutex_unlock(&lli->lli_och_mutex);
2654
2655                 RETURN(ll_lease_type_from_fmode(fmode));
2656         }
2657         case LL_IOC_HSM_IMPORT: {
2658                 struct hsm_user_import *hui;
2659
2660                 OBD_ALLOC_PTR(hui);
2661                 if (hui == NULL)
2662                         RETURN(-ENOMEM);
2663
2664                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2665                         OBD_FREE_PTR(hui);
2666                         RETURN(-EFAULT);
2667                 }
2668
2669                 rc = ll_hsm_import(inode, file, hui);
2670
2671                 OBD_FREE_PTR(hui);
2672                 RETURN(rc);
2673         }
2674         case LL_IOC_FUTIMES_3: {
2675                 struct ll_futimes_3 lfu;
2676
2677                 if (copy_from_user(&lfu,
2678                                    (const struct ll_futimes_3 __user *)arg,
2679                                    sizeof(lfu)))
2680                         RETURN(-EFAULT);
2681
2682                 RETURN(ll_file_futimes_3(file, &lfu));
2683         }
2684         case LL_IOC_LADVISE: {
2685                 struct llapi_ladvise_hdr *ladvise_hdr;
2686                 int i;
2687                 int num_advise;
2688                 int alloc_size = sizeof(*ladvise_hdr);
2689
2690                 rc = 0;
2691                 OBD_ALLOC_PTR(ladvise_hdr);
2692                 if (ladvise_hdr == NULL)
2693                         RETURN(-ENOMEM);
2694
2695                 if (copy_from_user(ladvise_hdr,
2696                                    (const struct llapi_ladvise_hdr __user *)arg,
2697                                    alloc_size))
2698                         GOTO(out_ladvise, rc = -EFAULT);
2699
2700                 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2701                     ladvise_hdr->lah_count < 1)
2702                         GOTO(out_ladvise, rc = -EINVAL);
2703
2704                 num_advise = ladvise_hdr->lah_count;
2705                 if (num_advise >= LAH_COUNT_MAX)
2706                         GOTO(out_ladvise, rc = -EFBIG);
2707
2708                 OBD_FREE_PTR(ladvise_hdr);
2709                 alloc_size = offsetof(typeof(*ladvise_hdr),
2710                                       lah_advise[num_advise]);
2711                 OBD_ALLOC(ladvise_hdr, alloc_size);
2712                 if (ladvise_hdr == NULL)
2713                         RETURN(-ENOMEM);
2714
2715                 /*
2716                  * TODO: submit multiple advices to one server in a single RPC
2717                  */
2718                 if (copy_from_user(ladvise_hdr,
2719                                    (const struct llapi_ladvise_hdr __user *)arg,
2720                                    alloc_size))
2721                         GOTO(out_ladvise, rc = -EFAULT);
2722
2723                 for (i = 0; i < num_advise; i++) {
2724                         rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2725                                         &ladvise_hdr->lah_advise[i]);
2726                         if (rc)
2727                                 break;
2728                 }
2729
2730 out_ladvise:
2731                 OBD_FREE(ladvise_hdr, alloc_size);
2732                 RETURN(rc);
2733         }
2734         default: {
2735                 int err;
2736
2737                 if (LLIOC_STOP ==
2738                      ll_iocontrol_call(inode, file, cmd, arg, &err))
2739                         RETURN(err);
2740
2741                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2742                                      (void __user *)arg));
2743         }
2744         }
2745 }
2746
2747 #ifndef HAVE_FILE_LLSEEK_SIZE
2748 static inline loff_t
2749 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2750 {
2751         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2752                 return -EINVAL;
2753         if (offset > maxsize)
2754                 return -EINVAL;
2755
2756         if (offset != file->f_pos) {
2757                 file->f_pos = offset;
2758                 file->f_version = 0;
2759         }
2760         return offset;
2761 }
2762
2763 static loff_t
2764 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2765                 loff_t maxsize, loff_t eof)
2766 {
2767         struct inode *inode = file_inode(file);
2768
2769         switch (origin) {
2770         case SEEK_END:
2771                 offset += eof;
2772                 break;
2773         case SEEK_CUR:
2774                 /*
2775                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
2776                  * position-querying operation.  Avoid rewriting the "same"
2777                  * f_pos value back to the file because a concurrent read(),
2778                  * write() or lseek() might have altered it
2779                  */
2780                 if (offset == 0)
2781                         return file->f_pos;
2782                 /*
2783                  * f_lock protects against read/modify/write race with other
2784                  * SEEK_CURs. Note that parallel writes and reads behave
2785                  * like SEEK_SET.
2786                  */
2787                 inode_lock(inode);
2788                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2789                 inode_unlock(inode);
2790                 return offset;
2791         case SEEK_DATA:
2792                 /*
2793                  * In the generic case the entire file is data, so as long as
2794                  * offset isn't at the end of the file then the offset is data.
2795                  */
2796                 if (offset >= eof)
2797                         return -ENXIO;
2798                 break;
2799         case SEEK_HOLE:
2800                 /*
2801                  * There is a virtual hole at the end of the file, so as long as
2802                  * offset isn't i_size or larger, return i_size.
2803                  */
2804                 if (offset >= eof)
2805                         return -ENXIO;
2806                 offset = eof;
2807                 break;
2808         }
2809
2810         return llseek_execute(file, offset, maxsize);
2811 }
2812 #endif
2813
2814 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2815 {
2816         struct inode *inode = file_inode(file);
2817         loff_t retval, eof = 0;
2818
2819         ENTRY;
2820         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2821                            (origin == SEEK_CUR) ? file->f_pos : 0);
2822         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2823                PFID(ll_inode2fid(inode)), inode, retval, retval,
2824                origin);
2825         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2826
2827         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2828                 retval = ll_glimpse_size(inode);
2829                 if (retval != 0)
2830                         RETURN(retval);
2831                 eof = i_size_read(inode);
2832         }
2833
2834         retval = ll_generic_file_llseek_size(file, offset, origin,
2835                                           ll_file_maxbytes(inode), eof);
2836         RETURN(retval);
2837 }
2838
2839 static int ll_flush(struct file *file, fl_owner_t id)
2840 {
2841         struct inode *inode = file_inode(file);
2842         struct ll_inode_info *lli = ll_i2info(inode);
2843         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2844         int rc, err;
2845
2846         LASSERT(!S_ISDIR(inode->i_mode));
2847
2848         /* catch async errors that were recorded back when async writeback
2849          * failed for pages in this mapping. */
2850         rc = lli->lli_async_rc;
2851         lli->lli_async_rc = 0;
2852         if (lli->lli_clob != NULL) {
2853                 err = lov_read_and_clear_async_rc(lli->lli_clob);
2854                 if (rc == 0)
2855                         rc = err;
2856         }
2857
2858         /* The application has been told write failure already.
2859          * Do not report failure again. */
2860         if (fd->fd_write_failed)
2861                 return 0;
2862         return rc ? -EIO : 0;
2863 }
2864
2865 /**
2866  * Called to make sure a portion of file has been written out.
2867  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2868  *
2869  * Return how many pages have been written.
2870  */
2871 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2872                        enum cl_fsync_mode mode, int ignore_layout)
2873 {
2874         struct lu_env *env;
2875         struct cl_io *io;
2876         struct cl_fsync_io *fio;
2877         int result;
2878         __u16 refcheck;
2879         ENTRY;
2880
2881         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2882             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2883                 RETURN(-EINVAL);
2884
2885         env = cl_env_get(&refcheck);
2886         if (IS_ERR(env))
2887                 RETURN(PTR_ERR(env));
2888
2889         io = vvp_env_thread_io(env);
2890         io->ci_obj = ll_i2info(inode)->lli_clob;
2891         io->ci_ignore_layout = ignore_layout;
2892
2893         /* initialize parameters for sync */
2894         fio = &io->u.ci_fsync;
2895         fio->fi_start = start;
2896         fio->fi_end = end;
2897         fio->fi_fid = ll_inode2fid(inode);
2898         fio->fi_mode = mode;
2899         fio->fi_nr_written = 0;
2900
2901         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2902                 result = cl_io_loop(env, io);
2903         else
2904                 result = io->ci_result;
2905         if (result == 0)
2906                 result = fio->fi_nr_written;
2907         cl_io_fini(env, io);
2908         cl_env_put(env, &refcheck);
2909
2910         RETURN(result);
2911 }
2912
2913 /*
      * fsync handler: wait for page-cache writeback on the inode, pick up any
      * recorded asynchronous write errors, call md_fsync() on the metadata
      * export (ll_md_exp) and, for regular files, sync the data range through
      * cl_sync_file_range(CL_FSYNC_ALL).
      *
      * The prototype is selected at build time:
      *  - HAVE_FILE_FSYNC_4ARGS: the caller supplies [start, end];
      *  - HAVE_FILE_FSYNC_2ARGS and the 3-argument fallback: the range is
      *    [0, LLONG_MAX].
      *
2914  * When dentry is provided (the 'else' case), file_dentry() may be
2915  * null and dentry must be used directly rather than pulled from
2916  * file_dentry() as is done otherwise.
      *
      * Returns the first non-zero status seen along the way (writeback wait,
      * recorded async errors, md_fsync(), a negative cl_sync_file_range()
      * result), or 0.
2917  */
2918
2919 #ifdef HAVE_FILE_FSYNC_4ARGS
2920 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2921 {
2922         struct dentry *dentry = file_dentry(file);
2923 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2924 int ll_fsync(struct file *file, int datasync)
2925 {
2926         struct dentry *dentry = file_dentry(file);
2927         loff_t start = 0;
2928         loff_t end = LLONG_MAX;
2929 #else
2930 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2931 {
2932         loff_t start = 0;
2933         loff_t end = LLONG_MAX;
2934 #endif
2935         struct inode *inode = dentry->d_inode;
2936         struct ll_inode_info *lli = ll_i2info(inode);
2937         struct ptlrpc_request *req;
2938         int rc, err;
2939         ENTRY;
2940
2941         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2942                PFID(ll_inode2fid(inode)), inode);
2943         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2944
2945 #ifdef HAVE_FILE_FSYNC_4ARGS
             /* write back and wait for the range here, then hold the inode
              * lock until just before returning */
2946         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2947         inode_lock(inode);
2948 #else
2949         /* fsync's caller has already called _fdata{sync,write}, we want
2950          * that IO to finish before calling the osc and mdc sync methods */
2951         rc = filemap_fdatawait(inode->i_mapping);
2952 #endif
2953
2954         /* catch async errors that were recorded back when async writeback
2955          * failed for pages in this mapping. */
2956         if (!S_ISDIR(inode->i_mode)) {
2957                 err = lli->lli_async_rc;
2958                 lli->lli_async_rc = 0;
2959                 if (rc == 0)
2960                         rc = err;
2961                 if (lli->lli_clob != NULL) {
2962                         err = lov_read_and_clear_async_rc(lli->lli_clob);
2963                         if (rc == 0)
2964                                 rc = err;
2965                 }
2966         }
2967
             /* md_fsync() is attempted even if rc is already set; its status
              * is only kept when no earlier error exists, and req is released
              * only when md_fsync() reported success */
2968         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2969         if (!rc)
2970                 rc = err;
2971         if (!err)
2972                 ptlrpc_req_finished(req);
2973
2974         if (S_ISREG(inode->i_mode)) {
2975                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2976
                     /* a non-negative return is a page count, not an error,
                      * so only negative values are propagated */
2977                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2978                 if (rc == 0 && err < 0)
2979                         rc = err;
                     /* remember the outcome on the file descriptor;
                      * ll_flush() consults fd_write_failed to avoid reporting
                      * the same failure twice */
2980                 if (rc < 0)
2981                         fd->fd_write_failed = true;
2982                 else
2983                         fd->fd_write_failed = false;
2984         }
2985
2986 #ifdef HAVE_FILE_FSYNC_4ARGS
2987         inode_unlock(inode);
2988 #endif
2989         RETURN(rc);
2990 }
2991
/**
 * Handle a flock()/fcntl() lock request by enqueueing an LDLM_FLOCK lock
 * through md_enqueue() on the metadata export, then recording the lock with
 * the local VFS lock helpers.
 *
 * - FL_FLOCK requests cover [0, OFFSET_MAX] and are owned by fl_file;
 *   FL_POSIX requests use fl_start/fl_end and are owned by fl_owner.
 * - fl_type maps to the LDLM mode: F_RDLCK -> LCK_PR, F_WRLCK -> LCK_PW,
 *   F_UNLCK -> LCK_NL.
 * - cmd maps to the enqueue flags: F_SETLKW -> 0, F_SETLK ->
 *   LDLM_FL_BLOCK_NOWAIT, F_GETLK -> LDLM_FL_TEST_LOCK.
 *
 * Returns -EINVAL for a lock that is neither FL_FLOCK nor FL_POSIX or for an
 * unknown cmd, -ENOTSUPP for an unknown lock type, the error from
 * ll_prep_md_op_data(), the local VFS locking error when that step fails for
 * a non-unlock request, and otherwise the md_enqueue() status.
 */
2992 static int
2993 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2994 {
2995         struct inode *inode = file_inode(file);
2996         struct ll_sb_info *sbi = ll_i2sbi(inode);
2997         struct ldlm_enqueue_info einfo = {
2998                 .ei_type        = LDLM_FLOCK,
2999                 .ei_cb_cp       = ldlm_flock_completion_ast,
3000                 .ei_cbdata      = file_lock,
3001         };
3002         struct md_op_data *op_data;
3003         struct lustre_handle lockh = { 0 };
3004         union ldlm_policy_data flock = { { 0 } };
             /* caller's original lock type; file_lock->fl_type is overwritten
              * below for the duration of the enqueue */
3005         int fl_type = file_lock->fl_type;
3006         __u64 flags = 0;
3007         int rc;
3008         int rc2 = 0;
3009         ENTRY;
3010
3011         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3012                PFID(ll_inode2fid(inode)), file_lock);
3013
3014         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3015
3016         if (file_lock->fl_flags & FL_FLOCK) {
3017                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3018                 /* flocks are whole-file locks */
3019                 flock.l_flock.end = OFFSET_MAX;
3020                 /* For flocks owner is determined by the local file descriptor */
3021                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3022         } else if (file_lock->fl_flags & FL_POSIX) {
3023                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3024                 flock.l_flock.start = file_lock->fl_start;
3025                 flock.l_flock.end = file_lock->fl_end;
3026         } else {
3027                 RETURN(-EINVAL);
3028         }
3029         flock.l_flock.pid = file_lock->fl_pid;
3030
3031         /* Somewhat ugly workaround for svc lockd.
3032          * lockd installs custom fl_lmops->lm_compare_owner that checks
3033          * for the fl_owner to be the same (which it always is on local node
3034          * I guess between lockd processes) and then compares pid.
3035          * As such we assign pid to the owner field to make it all work,
3036          * conflict with normal locks is unlikely since pid space and
3037          * pointer space for current->files are not intersecting */
3038         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3039                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3040
             /* translate the VFS lock type into an LDLM lock mode */
3041         switch (fl_type) {
3042         case F_RDLCK:
3043                 einfo.ei_mode = LCK_PR;
3044                 break;
3045         case F_UNLCK:
3046                 /* An unlock request may or may not have any relation to
3047                  * existing locks so we may not be able to pass a lock handle
3048                  * via a normal ldlm_lock_cancel() request. The request may even
3049                  * unlock a byte range in the middle of an existing lock. In
3050                  * order to process an unlock request we need all of the same
3051                  * information that is given with a normal read or write record
3052                  * lock request. To avoid creating another ldlm unlock (cancel)
3053                  * message we'll treat a LCK_NL flock request as an unlock. */
3054                 einfo.ei_mode = LCK_NL;
3055                 break;
3056         case F_WRLCK:
3057                 einfo.ei_mode = LCK_PW;
3058                 break;
3059         default:
3060                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3061                 RETURN (-ENOTSUPP);
3062         }
3063
             /* translate the fcntl command into LDLM enqueue flags */
3064         switch (cmd) {
3065         case F_SETLKW:
3066 #ifdef F_SETLKW64
3067         case F_SETLKW64:
3068 #endif
3069                 flags = 0;
3070                 break;
3071         case F_SETLK:
3072 #ifdef F_SETLK64
3073         case F_SETLK64:
3074 #endif
3075                 flags = LDLM_FL_BLOCK_NOWAIT;
3076                 break;
3077         case F_GETLK:
3078 #ifdef F_GETLK64
3079         case F_GETLK64:
3080 #endif
3081                 flags = LDLM_FL_TEST_LOCK;
3082                 break;
3083         default:
3084                 CERROR("unknown fcntl lock command: %d\n", cmd);
3085                 RETURN (-EINVAL);
3086         }
3087
3088         /* Save the old mode so that if the mode in the lock changes we
3089          * can decrement the appropriate reader or writer refcount. */
3090         file_lock->fl_type = einfo.ei_mode;
3091
3092         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3093                                      LUSTRE_OPC_ANY, NULL);
3094         if (IS_ERR(op_data))
3095                 RETURN(PTR_ERR(op_data));
3096
3097         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3098                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3099                flock.l_flock.pid, flags, einfo.ei_mode,
3100                flock.l_flock.start, flock.l_flock.end);
3101
3102         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3103                         flags);
3104
3105         /* Restore the file lock type if not TEST lock. */
3106         if (!(flags & LDLM_FL_TEST_LOCK))
3107                 file_lock->fl_type = fl_type;
3108
             /* Record the lock with the local VFS as well.  This is done when
              * the enqueue succeeded, and for unlock requests regardless of
              * the enqueue status. */
3109 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3110         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3111             !(flags & LDLM_FL_TEST_LOCK))
3112                 rc2  = locks_lock_file_wait(file, file_lock);
3113 #else
3114         if ((file_lock->fl_flags & FL_FLOCK) &&
3115             (rc == 0 || file_lock->fl_type == F_UNLCK))
3116                 rc2  = flock_lock_file_wait(file, file_lock);
3117         if ((file_lock->fl_flags & FL_POSIX) &&
3118             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3119             !(flags & LDLM_FL_TEST_LOCK))
3120                 rc2  = posix_lock_file_wait(file, file_lock);
3121 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3122
             /* The local VFS lock could not be recorded: enqueue again with
              * LCK_NL (treated as an unlock, see above) and report the local
              * error.  The status of this second enqueue is ignored. */
3123         if (rc2 && file_lock->fl_type != F_UNLCK) {
3124                 einfo.ei_mode = LCK_NL;
3125                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3126                            &lockh, flags);
3127                 rc = rc2;
3128         }
3129
3130         ll_finish_md_op_data(op_data);
3131
3132         RETURN(rc);
3133 }
3134
3135 int ll_get_fid_by_name(struct inode *parent, const char *name,
3136                        int namelen, struct lu_fid *fid,
3137                        struct inode **inode)
3138 {
3139         struct md_op_data       *op_data = NULL;
3140         struct mdt_body         *body;
3141         struct ptlrpc_request   *req;
3142         int                     rc;
3143         ENTRY;
3144
3145         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3146                                      LUSTRE_OPC_ANY, NULL);
3147         if (IS_ERR(op_data))
3148                 RETURN(PTR_ERR(op_data));
3149
3150         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3151         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3152         ll_finish_md_op_data(op_data);
3153         if (rc < 0)
3154                 RETURN(rc);
3155
3156         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3157         if (body == NULL)
3158                 GOTO(out_req, rc = -EFAULT);
3159         if (fid != NULL)
3160                 *fid = body->mbo_fid1;
3161
3162         if (inode != NULL)
3163                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3164 out_req:
3165         ptlrpc_req_finished(req);
3166         RETURN(rc);
3167 }
3168
3169 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3170                const char *name, int namelen)
3171 {
3172         struct dentry         *dchild = NULL;
3173         struct inode          *child_inode = NULL;
3174         struct md_op_data     *op_data;
3175         struct ptlrpc_request *request = NULL;
3176         struct obd_client_handle *och = NULL;
3177         struct qstr           qstr;
3178         struct mdt_body         *body;
3179         int                    rc;
3180         __u64                   data_version = 0;
3181         ENTRY;
3182
3183         CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3184                name, PFID(ll_inode2fid(parent)), mdtidx);
3185
3186         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3187                                      0, LUSTRE_OPC_ANY, NULL);
3188         if (IS_ERR(op_data))
3189                 RETURN(PTR_ERR(op_data));
3190
3191         /* Get child FID first */
3192         qstr.hash = full_name_hash(name, namelen);
3193         qstr.name = name;
3194         qstr.len = namelen;
3195         dchild = d_lookup(file_dentry(file), &qstr);
3196         if (dchild != NULL) {
3197                 if (dchild->d_inode != NULL)
3198                         child_inode = igrab(dchild->d_inode);
3199                 dput(dchild);
3200         }
3201
3202         if (child_inode == NULL) {
3203                 rc = ll_get_fid_by_name(parent, name, namelen,
3204                                         &op_data->op_fid3, &child_inode);
3205                 if (rc != 0)
3206                         GOTO(out_free, rc);
3207         }
3208
3209         if (child_inode == NULL)
3210                 GOTO(out_free, rc = -EINVAL);
3211
3212         /*
3213          * lfs migrate command needs to be blocked on the client
3214          * by checking the migrate FID against the FID of the
3215          * filesystem root.
3216          */
3217         if (child_inode == parent->i_sb->s_root->d_inode)
3218                 GOTO(out_iput, rc = -EINVAL);
3219
3220         inode_lock(child_inode);
3221         op_data->op_fid3 = *ll_inode2fid(child_inode);
3222         if (!fid_is_sane(&op_data->op_fid3)) {
3223                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3224                        ll_get_fsname(parent->i_sb, NULL, 0), name,
3225                        PFID(&op_data->op_fid3));
3226                 GOTO(out_unlock, rc = -EINVAL);
3227         }
3228
3229         rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3230         if (rc < 0)
3231                 GOTO(out_unlock, rc);
3232
3233         if (rc == mdtidx) {
3234                 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3235                        PFID(&op_data->op_fid3), mdtidx);
3236                 GOTO(out_unlock, rc = 0);
3237         }
3238 again:
3239         if (S_ISREG(child_inode->i_mode)) {
3240                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3241                 if (IS_ERR(och)) {
3242                         rc = PTR_ERR(och);
3243                         och = NULL;
3244                         GOTO(out_unlock, rc);
3245                 }
3246
3247                 rc = ll_data_version(child_inode, &data_version,
3248                                      LL_DV_WR_FLUSH);
3249                 if (rc != 0)
3250                         GOTO(out_close, rc);
3251
3252                 op_data->op_handle = och->och_fh;
3253                 op_data->op_data = och->och_mod;
3254                 op_data->op_data_version = data_version;
3255                 op_data->op_lease_handle = och->och_lease_handle;
3256                 op_data->op_bias |= MDS_RENAME_MIGRATE;
3257         }
3258
3259         op_data->op_mds = mdtidx;
3260         op_data->op_cli_flags = CLI_MIGRATE;
3261         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3262                        namelen, name, namelen, &request);
3263         if (rc == 0) {
3264                 LASSERT(request != NULL);
3265                 ll_update_times(request, parent);
3266
3267                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3268                 LASSERT(body != NULL);
3269
3270                 /* If the server does release layout lock, then we cleanup
3271                  * the client och here, otherwise release it in out_close: */
3272                 if (och != NULL &&
3273                     body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3274                         obd_mod_put(och->och_mod);
3275                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3276                                                   och);
3277                         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3278                         OBD_FREE_PTR(och);
3279                         och = NULL;
3280                 }
3281         }
3282
3283         if (request != NULL) {
3284                 ptlrpc_req_finished(request);
3285                 request = NULL;
3286         }
3287
3288         /* Try again if the file layout has changed. */
3289         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3290                 goto again;
3291
3292 out_close:
3293         if (och != NULL) /* close the file */
3294                 ll_lease_close(och, child_inode, NULL);
3295         if (rc == 0)
3296                 clear_nlink(child_inode);
3297 out_unlock:
3298         inode_unlock(child_inode);
3299 out_iput:
3300         iput(child_inode);
3301 out_free:
3302         ll_finish_md_op_data(op_data);
3303         RETURN(rc);
3304 }
3305
3306 static int
3307 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3308 {
3309         ENTRY;
3310
3311         RETURN(-ENOSYS);
3312 }
3313
3314 /**
3315  * test if some locks matching bits and l_req_mode are acquired
3316  * - bits can be in different locks
3317  * - if found clear the common lock bits in *bits
3318  * - the bits not found, are kept in *bits
3319  * \param inode [IN]
3320  * \param bits [IN] searched lock bits [IN]
3321  * \param l_req_mode [IN] searched lock mode
3322  * \retval boolean, true iff all bits are found
3323  */
3324 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3325 {
3326         struct lustre_handle lockh;
3327         union ldlm_policy_data policy;
3328         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3329                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3330         struct lu_fid *fid;
3331         __u64 flags;
3332         int i;
3333         ENTRY;
3334
3335         if (!inode)
3336                RETURN(0);
3337
3338         fid = &ll_i2info(inode)->lli_fid;
3339         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3340                ldlm_lockname[mode]);
3341
3342         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3343         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3344                 policy.l_inodebits.bits = *bits & (1 << i);
3345                 if (policy.l_inodebits.bits == 0)
3346                         continue;
3347
3348                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3349                                   &policy, mode, &lockh)) {
3350                         struct ldlm_lock *lock;
3351
3352                         lock = ldlm_handle2lock(&lockh);
3353                         if (lock) {
3354                                 *bits &=
3355                                       ~(lock->l_policy_data.l_inodebits.bits);
3356                                 LDLM_LOCK_PUT(lock);
3357                         } else {
3358                                 *bits &= ~policy.l_inodebits.bits;
3359                         }
3360                 }
3361         }
3362         RETURN(*bits == 0);
3363 }
3364
3365 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3366                                struct lustre_handle *lockh, __u64 flags,
3367                                enum ldlm_mode mode)
3368 {
3369         union ldlm_policy_data policy = { .l_inodebits = { bits } };
3370         struct lu_fid *fid;
3371         enum ldlm_mode rc;
3372         ENTRY;
3373
3374         fid = &ll_i2info(inode)->lli_fid;
3375         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3376
3377         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3378                            fid, LDLM_IBITS, &policy, mode, lockh);
3379
3380         RETURN(rc);
3381 }
3382
3383 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3384 {
3385         /* Already unlinked. Just update nlink and return success */
3386         if (rc == -ENOENT) {
3387                 clear_nlink(inode);
3388                 /* If it is striped directory, and there is bad stripe
3389                  * Let's revalidate the dentry again, instead of returning
3390                  * error */
3391                 if (S_ISDIR(inode->i_mode) &&
3392                     ll_i2info(inode)->lli_lsm_md != NULL)
3393                         return 0;
3394
3395                 /* This path cannot be hit for regular files unless in
3396                  * case of obscure races, so no need to to validate
3397                  * size. */
3398                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3399                         return 0;
3400         } else if (rc != 0) {
3401                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3402                              "%s: revalidate FID "DFID" error: rc = %d\n",
3403                              ll_get_fsname(inode->i_sb, NULL, 0),
3404                              PFID(ll_inode2fid(inode)), rc);
3405         }
3406
3407         return rc;
3408 }
3409
/**
 * Refresh the metadata of the inode behind \a dentry from the MDS.
 *
 * One of two strategies is used, chosen by the connect flags of the MD
 * export:
 * - with OBD_CONNECT_ATTRFID an intent lock is requested by FID (IT_GETATTR,
 *   or IT_LOOKUP when \a ibits is exactly MDS_INODELOCK_LOOKUP) and the
 *   reply is applied through ll_revalidate_it_finish();
 * - otherwise md_getattr() is issued, but only when ll_have_md_lock()
 *   reports that not all of \a ibits are found; the reply is applied with
 *   ll_prep_inode().
 *
 * \param dentry  dentry to revalidate; dentry->d_inode must not be NULL
 * \param ibits   inodebits to look for among the held locks
 *
 * \retval 0 on success, including the case where no RPC was needed
 * \retval non-zero error code otherwise; failures of md_intent_lock() and
 *         md_getattr() are first filtered by ll_inode_revalidate_fini()
 */
static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
{
        struct inode *inode = dentry->d_inode;
        struct ptlrpc_request *req = NULL;
        struct obd_export *exp;
        int rc = 0;
        ENTRY;

        LASSERT(inode != NULL);

        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
               PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);

        exp = ll_i2mdexp(inode);

        /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
         *      But under CMD case, it caused some lock issues, should be fixed
         *      with new CMD ibits lock. See bug 12718 */
        if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
                struct lookup_intent oit = { .it_op = IT_GETATTR };
                struct md_op_data *op_data;

                if (ibits == MDS_INODELOCK_LOOKUP)
                        oit.it_op = IT_LOOKUP;

                /* Call getattr by fid, so do not provide name at all. */
                op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
                                             dentry->d_inode, NULL, 0, 0,
                                             LUSTRE_OPC_ANY, NULL);
                if (IS_ERR(op_data))
                        RETURN(PTR_ERR(op_data));

                rc = md_intent_lock(exp, op_data, &oit, &req,
                                    &ll_md_blocking_ast, 0);
                /* op_data is only needed for the RPC itself */
                ll_finish_md_op_data(op_data);
                if (rc < 0) {
                        rc = ll_inode_revalidate_fini(inode, rc);
                        GOTO (out, rc);
                }

                rc = ll_revalidate_it_finish(req, &oit, dentry);
                if (rc != 0) {
                        /* the intent is released here on failure; on success
                         * it is handed to ll_lookup_finish_locks() below */
                        ll_intent_release(&oit);
                        GOTO(out, rc);
                }

                /* Unlinked? Unhash dentry, so it is not picked up later by
                   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
                   here to preserve get_cwd functionality on 2.6.
                   Bug 10503 */
                if (!dentry->d_inode->i_nlink) {
                        ll_lock_dcache(inode);
                        d_lustre_invalidate(dentry, 0);
                        ll_unlock_dcache(inode);
                }

                ll_lookup_finish_locks(&oit, dentry);
        } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
                struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
                u64 valid = OBD_MD_FLGETATTR;
                struct md_op_data *op_data;
                int ealen = 0;

                /* for regular files also request the layout EA, sized by the
                 * default MD size */
                if (S_ISREG(inode->i_mode)) {
                        rc = ll_get_default_mdsize(sbi, &ealen);
                        if (rc)
                                /* req is still NULL: nothing to release */
                                RETURN(rc);
                        valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
                }

                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
                                             0, ealen, LUSTRE_OPC_ANY,
                                             NULL);
                if (IS_ERR(op_data))
                        RETURN(PTR_ERR(op_data));

                op_data->op_valid = valid;
                rc = md_getattr(sbi->ll_md_exp, op_data, &req);
                ll_finish_md_op_data(op_data);
                if (rc) {
                        /* NOTE(review): returns without ptlrpc_req_finished();
                         * this assumes md_getattr() leaves req unset on
                         * failure - confirm. */
                        rc = ll_inode_revalidate_fini(inode, rc);
                        RETURN(rc);
                }

                rc = ll_prep_inode(&inode, req, NULL, NULL);
        }
out:
        /* req is still NULL when no RPC was sent */
        ptlrpc_req_finished(req);
        return rc;
}
3500
3501 static int ll_merge_md_attr(struct inode *inode)
3502 {
3503         struct cl_attr attr = { 0 };
3504         int rc;
3505
3506         LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3507         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3508                            &attr, ll_md_blocking_ast);
3509         if (rc != 0)
3510                 RETURN(rc);
3511
3512         set_nlink(inode, attr.cat_nlink);
3513         inode->i_blocks = attr.cat_blocks;
3514         i_size_write(inode, attr.cat_size);
3515
3516         ll_i2info(inode)->lli_atime = attr.cat_atime;
3517         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3518         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
3519
3520         RETURN(0);
3521 }
3522
3523 static int
3524 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3525 {
3526         struct inode    *inode = dentry->d_inode;
3527         int              rc;
3528         ENTRY;
3529
3530         rc = __ll_inode_revalidate(dentry, ibits);
3531         if (rc != 0)
3532                 RETURN(rc);
3533
3534         /* if object isn't regular file, don't validate size */
3535         if (!S_ISREG(inode->i_mode)) {
3536                 if (S_ISDIR(inode->i_mode) &&
3537                     ll_i2info(inode)->lli_lsm_md != NULL) {
3538                         rc = ll_merge_md_attr(inode);
3539                         if (rc != 0)
3540                                 RETURN(rc);
3541                 }
3542
3543                 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3544                 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3545                 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3546         } else {
3547                 /* In case of restore, the MDT has the right size and has
3548                  * already send it back without granting the layout lock,
3549                  * inode is up-to-date so glimpse is useless.
3550                  * Also to glimpse we need the layout, in case of a running
3551                  * restore the MDT holds the layout lock so the glimpse will
3552                  * block up to the end of restore (getattr will block)
3553                  */
3554                 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3555                         rc = ll_glimpse_size(inode);
3556         }
3557         RETURN(rc);
3558 }
3559
3560 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3561 {
3562         struct inode *inode = de->d_inode;
3563         struct ll_sb_info *sbi = ll_i2sbi(inode);
3564         struct ll_inode_info *lli = ll_i2info(inode);
3565         int res = 0;
3566
3567         res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3568                                       MDS_INODELOCK_LOOKUP);
3569         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3570
3571         if (res)
3572                 return res;
3573
3574         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3575
3576         stat->dev = inode->i_sb->s_dev;
3577         if (ll_need_32bit_api(sbi))
3578                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3579         else
3580                 stat->ino = inode->i_ino;
3581         stat->mode = inode->i_mode;
3582         stat->uid = inode->i_uid;
3583         stat->gid = inode->i_gid;
3584         stat->rdev = inode->i_rdev;
3585         stat->atime = inode->i_atime;
3586         stat->mtime = inode->i_mtime;
3587         stat->ctime = inode->i_ctime;
3588         stat->blksize = 1 << inode->i_blkbits;
3589
3590         stat->nlink = inode->i_nlink;
3591         stat->size = i_size_read(inode);
3592         stat->blocks = inode->i_blocks;
3593
3594         return 0;
3595 }
3596
3597 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3598                      __u64 start, __u64 len)
3599 {
3600         int             rc;
3601         size_t          num_bytes;
3602         struct fiemap   *fiemap;
3603         unsigned int    extent_count = fieinfo->fi_extents_max;
3604
3605         num_bytes = sizeof(*fiemap) + (extent_count *
3606                                        sizeof(struct fiemap_extent));
3607         OBD_ALLOC_LARGE(fiemap, num_bytes);
3608
3609         if (fiemap == NULL)
3610                 RETURN(-ENOMEM);
3611
3612         fiemap->fm_flags = fieinfo->fi_flags;
3613         fiemap->fm_extent_count = fieinfo->fi_extents_max;
3614         fiemap->fm_start = start;
3615         fiemap->fm_length = len;
3616         if (extent_count > 0 &&
3617             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3618                            sizeof(struct fiemap_extent)) != 0)
3619                 GOTO(out, rc = -EFAULT);
3620
3621         rc = ll_do_fiemap(inode, fiemap, num_bytes);
3622
3623         fieinfo->fi_flags = fiemap->fm_flags;
3624         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3625         if (extent_count > 0 &&
3626             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3627                          fiemap->fm_mapped_extents *
3628                          sizeof(struct fiemap_extent)) != 0)
3629                 GOTO(out, rc = -EFAULT);
3630 out:
3631         OBD_FREE_LARGE(fiemap, num_bytes);
3632         return rc;
3633 }
3634
3635 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3636 {
3637         struct ll_inode_info *lli = ll_i2info(inode);
3638         struct posix_acl *acl = NULL;
3639         ENTRY;
3640
3641         spin_lock(&lli->lli_lock);
3642         /* VFS' acl_permission_check->check_acl will release the refcount */
3643         acl = posix_acl_dup(lli->lli_posix_acl);
3644         spin_unlock(&lli->lli_lock);
3645
3646         RETURN(acl);
3647 }
3648
#ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ACL check callback handed to ll_generic_permission(); only built when
 * HAVE_GENERIC_PERMISSION_2ARGS is not defined.  The signature carries an
 * extra \a flags argument when HAVE_GENERIC_PERMISSION_4ARGS is defined.
 *
 * Returns:
 *   -ECHILD  \a flags has IPERM_FLAG_RCU set (4-argument variant only)
 *   -EAGAIN  ll_get_acl() returned no ACL, or CONFIG_FS_POSIX_ACL is
 *            not enabled
 *   otherwise the result of posix_acl_permission() for \a mask
 */
static int
# ifdef HAVE_GENERIC_PERMISSION_4ARGS
ll_check_acl(struct inode *inode, int mask, unsigned int flags)
# else
ll_check_acl(struct inode *inode, int mask)
# endif
{
# ifdef CONFIG_FS_POSIX_ACL
        struct posix_acl *acl;
        int rc;
        ENTRY;

#  ifdef HAVE_GENERIC_PERMISSION_4ARGS
        if (flags & IPERM_FLAG_RCU)
                return -ECHILD;
#  endif
        acl = ll_get_acl(inode, ACL_TYPE_ACCESS);

        if (!acl)
                RETURN(-EAGAIN);

        rc = posix_acl_permission(inode, acl, mask);
        /* drop the ACL obtained from ll_get_acl() */
        posix_acl_release(acl);

        RETURN(rc);
# else /* !CONFIG_FS_POSIX_ACL */
        return -EAGAIN;
# endif /* CONFIG_FS_POSIX_ACL */
}
#endif /* HAVE_GENERIC_PERMISSION_2ARGS */
3680
3681 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3682 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3683 #else
3684 # ifdef HAVE_INODE_PERMISION_2ARGS
3685 int ll_inode_permission(struct inode *inode, int mask)
3686 # else
3687 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3688 # endif
3689 #endif
3690 {
3691         int rc = 0;
3692         struct ll_sb_info *sbi;
3693         struct root_squash_info *squash;
3694         struct cred *cred = NULL;
3695         const struct cred *old_cred = NULL;
3696         cfs_cap_t cap;
3697         bool squash_id = false;
3698         ENTRY;
3699
3700 #ifdef MAY_NOT_BLOCK
3701         if (mask & MAY_NOT_BLOCK)
3702                 return -ECHILD;
3703 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3704         if (flags & IPERM_FLAG_RCU)
3705                 return -ECHILD;
3706 #endif
3707
3708        /* as root inode are NOT getting validated in lookup operation,
3709         * need to do it before permission check. */
3710
3711         if (inode == inode->i_sb->s_root->d_inode) {
3712                 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3713                                            MDS_INODELOCK_LOOKUP);
3714                 if (rc)
3715                         RETURN(rc);
3716         }
3717
3718         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3719                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3720
3721         /* squash fsuid/fsgid if needed */
3722         sbi = ll_i2sbi(inode);
3723         squash = &sbi->ll_squash;
3724         if (unlikely(squash->rsi_uid != 0 &&
3725                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3726                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3727                         squash_id = true;
3728         }
3729         if (squash_id) {
3730                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3731                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3732                        squash->rsi_uid, squash->rsi_gid);
3733
3734                 /* update current process's credentials
3735                  * and FS capability */
3736                 cred = prepare_creds();
3737                 if (cred == NULL)
3738                         RETURN(-ENOMEM);
3739
3740                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3741                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
3742                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3743                         if ((1 << cap) & CFS_CAP_FS_MASK)
3744                                 cap_lower(cred->cap_effective, cap);
3745                 }
3746                 old_cred = override_creds(cred);
3747         }
3748
3749         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3750         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3751         /* restore current process's credentials and FS capability */
3752         if (squash_id) {
3753                 revert_creds(old_cred);
3754                 put_cred(cred);
3755         }
3756
3757         RETURN(rc);
3758 }
3759
/* -o localflock - only provides locally consistent flock locks */
/*
 * File operations table without .flock/.lock handlers (compare
 * ll_file_operations_flock and ll_file_operations_noflock, which differ
 * from this table only in those two entries).
 *
 * The read/write entries depend on the kernel: with
 * HAVE_FILE_OPERATIONS_READ_WRITE_ITER the iterator based handlers are
 * installed (plus the new_sync_* wrappers when HAVE_SYNC_READ_WRITE is
 * defined); otherwise the plain and aio read/write handlers are used.
 */
struct file_operations ll_file_operations = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
        .read           = new_sync_read,
        .write          = new_sync_write,
# endif
        .read_iter      = ll_file_read_iter,
        .write_iter     = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .read           = ll_file_read,
        .aio_read       = ll_file_aio_read,
        .write          = ll_file_write,
        .aio_write      = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .fsync          = ll_fsync,
        .flush          = ll_flush
};
3784
/*
 * File operations table with lock support: same entries as
 * ll_file_operations, plus .flock and .lock, which are both handled by
 * ll_file_flock.
 * NOTE(review): presumably selected for mounts with flock enabled - confirm
 * at the place where the table is chosen.
 */
struct file_operations ll_file_operations_flock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
        .read           = new_sync_read,
        .write          = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
        .read_iter      = ll_file_read_iter,
        .write_iter     = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .read           = ll_file_read,
        .aio_read       = ll_file_aio_read,
        .write          = ll_file_write,
        .aio_write      = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .fsync          = ll_fsync,
        .flush          = ll_flush,
        .flock          = ll_file_flock,
        .lock           = ll_file_flock
};
3810
/* These are for -o noflock - to return ENOSYS on flock calls */
/*
 * Same entries as ll_file_operations, plus .flock and .lock, which are
 * both handled by ll_file_noflock (it returns -ENOSYS unconditionally).
 */
struct file_operations ll_file_operations_noflock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
        .read           = new_sync_read,
        .write          = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
        .read_iter      = ll_file_read_iter,
        .write_iter     = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .read           = ll_file_read,
        .aio_read       = ll_file_aio_read,
        .write          = ll_file_write,
        .aio_write      = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .fsync          = ll_fsync,
        .flush          = ll_flush,
        .flock          = ll_file_noflock,
        .lock           = ll_file_noflock
};
3837
/*
 * Inode operations table wiring attribute, permission, extended attribute
 * and fiemap handling to the ll_* implementations.  The .get_acl entry is
 * only present when HAVE_IOP_GET_ACL is defined.
 */
struct inode_operations ll_file_inode_operations = {
        .setattr        = ll_setattr,
        .getattr        = ll_getattr,
        .permission     = ll_inode_permission,
        .setxattr       = ll_setxattr,
        .getxattr       = ll_getxattr,
        .listxattr      = ll_listxattr,
        .removexattr    = ll_removexattr,
        .fiemap         = ll_fiemap,
#ifdef HAVE_IOP_GET_ACL
        .get_acl        = ll_get_acl,
#endif
};
3851
/* dynamic ioctl number support routines */
/*
 * Registry of dynamically registered ioctl handlers: ioc_head is the list
 * of struct llioc_data entries and ioc_sem protects it (taken for write by
 * register/unregister, for read by ll_iocontrol_call()).
 */
static struct llioc_ctl_data {
        struct rw_semaphore     ioc_sem;
        struct list_head        ioc_head;
} llioc = {
        __RWSEM_INITIALIZER(llioc.ioc_sem),
        LIST_HEAD_INIT(llioc.ioc_head)
};
3860
3861
/*
 * One registered ioctl handler, created by ll_iocontrol_register() and
 * linked into llioc.ioc_head.
 */
struct llioc_data {
        /* linkage into llioc.ioc_head */
        struct list_head        iocd_list;
        /* total allocation size in bytes, including iocd_cmd[] */
        unsigned int            iocd_size;
        /* callback invoked for a matching command */
        llioc_callback_t        iocd_cb;
        /* number of entries in iocd_cmd[] */
        unsigned int            iocd_count;
        /* ioctl command numbers handled by iocd_cb */
        unsigned int            iocd_cmd[0];
};
3869
3870 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3871 {
3872         unsigned int size;
3873         struct llioc_data *in_data = NULL;
3874         ENTRY;
3875
3876         if (cb == NULL || cmd == NULL ||
3877             count > LLIOC_MAX_CMD || count < 0)
3878                 RETURN(NULL);
3879
3880         size = sizeof(*in_data) + count * sizeof(unsigned int);
3881         OBD_ALLOC(in_data, size);
3882         if (in_data == NULL)
3883                 RETURN(NULL);
3884
3885         memset(in_data, 0, sizeof(*in_data));
3886         in_data->iocd_size = size;
3887         in_data->iocd_cb = cb;
3888         in_data->iocd_count = count;
3889         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3890
3891         down_write(&llioc.ioc_sem);
3892         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3893         up_write(&llioc.ioc_sem);
3894
3895         RETURN(in_data);
3896 }
3897
3898 void ll_iocontrol_unregister(void *magic)
3899 {
3900         struct llioc_data *tmp;
3901
3902         if (magic == NULL)
3903                 return;
3904
3905         down_write(&llioc.ioc_sem);
3906         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3907                 if (tmp == magic) {
3908                         unsigned int size = tmp->iocd_size;
3909
3910                         list_del(&tmp->iocd_list);
3911                         up_write(&llioc.ioc_sem);
3912
3913                         OBD_FREE(tmp, size);
3914                         return;
3915                 }
3916         }
3917         up_write(&llioc.ioc_sem);
3918
3919         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3920 }
3921
3922 EXPORT_SYMBOL(ll_iocontrol_register);
3923 EXPORT_SYMBOL(ll_iocontrol_unregister);
3924
3925 static enum llioc_iter
3926 ll_iocontrol_call(struct inode *inode, struct file *file,
3927                   unsigned int cmd, unsigned long arg, int *rcp)
3928 {
3929         enum llioc_iter ret = LLIOC_CONT;
3930         struct llioc_data *data;
3931         int rc = -EINVAL, i;
3932
3933         down_read(&llioc.ioc_sem);
3934         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3935                 for (i = 0; i < data->iocd_count; i++) {
3936                         if (cmd != data->iocd_cmd[i])
3937                                 continue;
3938
3939                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3940                         break;
3941                 }
3942
3943                 if (ret == LLIOC_STOP)
3944                         break;
3945         }
3946         up_read(&llioc.ioc_sem);
3947
3948         if (rcp)
3949                 *rcp = rc;
3950         return ret;
3951 }
3952
3953 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3954 {
3955         struct ll_inode_info *lli = ll_i2info(inode);
3956         struct cl_object *obj = lli->lli_clob;
3957         struct lu_env *env;
3958         int rc;
3959         __u16 refcheck;
3960         ENTRY;
3961
3962         if (obj == NULL)
3963                 RETURN(0);
3964
3965         env = cl_env_get(&refcheck);
3966         if (IS_ERR(env))
3967                 RETURN(PTR_ERR(env));
3968
3969         rc = cl_conf_set(env, lli->lli_clob, conf);
3970         if (rc < 0)
3971                 GOTO(out, rc);
3972
3973         if (conf->coc_opc == OBJECT_CONF_SET) {
3974                 struct ldlm_lock *lock = conf->coc_lock;
3975                 struct cl_layout cl = {
3976                         .cl_layout_gen = 0,
3977                 };
3978
3979                 LASSERT(lock != NULL);
3980                 LASSERT(ldlm_has_layout(lock));
3981
3982                 /* it can only be allowed to match after layout is
3983                  * applied to inode otherwise false layout would be
3984                  * seen. Applying layout shoud happen before dropping
3985                  * the intent lock. */
3986                 ldlm_lock_allow_match(lock);
3987
3988                 rc = cl_object_layout_get(env, obj, &cl);
3989                 if (rc < 0)
3990                         GOTO(out, rc);
3991
3992                 CDEBUG(D_VFSTRACE,
3993                        DFID": layout version change: %u -> %u\n",
3994                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
3995                        cl.cl_layout_gen);
3996                 ll_layout_version_set(lli, cl.cl_layout_gen);
3997         }
3998
3999 out:
4000         cl_env_put(env, &refcheck);
4001
4002         RETURN(rc);
4003 }
4004
4005 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4006 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4007
4008 {
4009         struct ll_sb_info *sbi = ll_i2sbi(inode);
4010         struct ptlrpc_request *req;
4011         struct mdt_body *body;
4012         void *lvbdata;
4013         void *lmm;
4014         int lmmsize;
4015         int rc;
4016         ENTRY;
4017
4018         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4019                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4020                lock->l_lvb_data, lock->l_lvb_len);
4021
4022         if (lock->l_lvb_data != NULL)
4023                 RETURN(0);
4024
4025         /* if layout lock was granted right away, the layout is returned
4026          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4027          * blocked and then granted via completion ast, we have to fetch
4028          * layout here. Please note that we can't use the LVB buffer in
4029          * completion AST because it doesn't have a large enough buffer */
4030         rc = ll_get_default_mdsize(sbi, &lmmsize);
4031         if (rc == 0)
4032                 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4033                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4034                                 lmmsize, 0, &req);
4035         if (rc < 0)
4036                 RETURN(rc);
4037
4038         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4039         if (body == NULL)
4040                 GOTO(out, rc = -EPROTO);
4041
4042         lmmsize = body->mbo_eadatasize;
4043         if (lmmsize == 0) /* empty layout */
4044                 GOTO(out, rc = 0);
4045
4046         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4047         if (lmm == NULL)
4048                 GOTO(out, rc = -EFAULT);
4049
4050         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4051         if (lvbdata == NULL)
4052                 GOTO(out, rc = -ENOMEM);
4053
4054         memcpy(lvbdata, lmm, lmmsize);
4055         lock_res_and_lock(lock);
4056         if (unlikely(lock->l_lvb_data == NULL)) {
4057                 lock->l_lvb_type = LVB_T_LAYOUT;
4058                 lock->l_lvb_data = lvbdata;
4059                 lock->l_lvb_len = lmmsize;
4060                 lvbdata = NULL;
4061         }
4062         unlock_res_and_lock(lock);
4063
4064         if (lvbdata)
4065                 OBD_FREE_LARGE(lvbdata, lmmsize);
4066
4067         EXIT;
4068
4069 out:
4070         ptlrpc_req_finished(req);
4071         return rc;
4072 }
4073
4074 /**
4075  * Apply the layout to the inode. Layout lock is held and will be released
4076  * in this function.
4077  */
4078 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4079                               struct inode *inode)
4080 {
4081         struct ll_inode_info *lli = ll_i2info(inode);
4082         struct ll_sb_info    *sbi = ll_i2sbi(inode);
4083         struct ldlm_lock *lock;
4084         struct cl_object_conf conf;
4085         int rc = 0;
4086         bool lvb_ready;
4087         bool wait_layout = false;
4088         ENTRY;
4089
4090         LASSERT(lustre_handle_is_used(lockh));
4091
4092         lock = ldlm_handle2lock(lockh);
4093         LASSERT(lock != NULL);
4094         LASSERT(ldlm_has_layout(lock));
4095
4096         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4097                    PFID(&lli->lli_fid), inode);
4098
4099         /* in case this is a caching lock and reinstate with new inode */
4100         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4101
4102         lock_res_and_lock(lock);
4103         lvb_ready = ldlm_is_lvb_ready(lock);
4104         unlock_res_and_lock(lock);
4105         /* checking lvb_ready is racy but this is okay. The worst case is
4106          * that multi processes may configure the file on the same time. */
4107
4108         if (lvb_ready)
4109                 GOTO(out, rc = 0);
4110
4111         rc = ll_layout_fetch(inode, lock);
4112         if (rc < 0)
4113                 GOTO(out, rc);
4114
4115         /* for layout lock, lmm is stored in lock's lvb.
4116          * lvb_data is immutable if the lock is held so it's safe to access it
4117          * without res lock.
4118          *
4119          * set layout to file. Unlikely this will fail as old layout was
4120          * surely eliminated */
4121         memset(&conf, 0, sizeof conf);
4122         conf.coc_opc = OBJECT_CONF_SET;
4123         conf.coc_inode = inode;
4124         conf.coc_lock = lock;
4125         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4126         conf.u.coc_layout.lb_len = lock->l_lvb_len;
4127         rc = ll_layout_conf(inode, &conf);
4128
4129         /* refresh layout failed, need to wait */
4130         wait_layout = rc == -EBUSY;
4131         EXIT;
4132
4133 out:
4134         LDLM_LOCK_PUT(lock);
4135         ldlm_lock_decref(lockh, mode);
4136
4137         /* wait for IO to complete if it's still being used. */
4138         if (wait_layout) {
4139                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4140                        ll_get_fsname(inode->i_sb, NULL, 0),
4141                        PFID(&lli->lli_fid), inode);
4142
4143                 memset(&conf, 0, sizeof conf);
4144                 conf.coc_opc = OBJECT_CONF_WAIT;
4145                 conf.coc_inode = inode;
4146                 rc = ll_layout_conf(inode, &conf);
4147                 if (rc == 0)
4148                         rc = -EAGAIN;
4149
4150                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4151                        ll_get_fsname(inode->i_sb, NULL, 0),
4152                        PFID(&lli->lli_fid), rc);
4153         }
4154         RETURN(rc);
4155 }
4156
4157 static int ll_layout_refresh_locked(struct inode *inode)
4158 {
4159         struct ll_inode_info  *lli = ll_i2info(inode);
4160         struct ll_sb_info     *sbi = ll_i2sbi(inode);
4161         struct md_op_data     *op_data;
4162         struct lookup_intent    it;
4163         struct lustre_handle    lockh;
4164         enum ldlm_mode          mode;
4165         struct ptlrpc_request *req;
4166         int rc;
4167         ENTRY;
4168
4169 again:
4170         /* mostly layout lock is caching on the local side, so try to match
4171          * it before grabbing layout lock mutex. */
4172         mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4173                                LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4174         if (mode != 0) { /* hit cached lock */
4175                 rc = ll_layout_lock_set(&lockh, mode, inode);
4176                 if (rc == -EAGAIN)
4177                         goto again;
4178
4179                 RETURN(rc);
4180         }
4181
4182         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4183                                      0, 0, LUSTRE_OPC_ANY, NULL);
4184         if (IS_ERR(op_data))
4185                 RETURN(PTR_ERR(op_data));
4186
4187         /* have to enqueue one */
4188         memset(&it, 0, sizeof(it));
4189         it.it_op = IT_LAYOUT;
4190
4191         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4192                           ll_get_fsname(inode->i_sb, NULL, 0),
4193                           PFID(&lli->lli_fid), inode);
4194
4195         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4196                             &ll_md_blocking_ast, 0);
4197         if (it.it_request != NULL)
4198                 ptlrpc_req_finished(it.it_request);
4199         it.it_request = NULL;
4200
4201         ll_finish_md_op_data(op_data);
4202
4203         mode = it.it_lock_mode;
4204         it.it_lock_mode = 0;
4205         ll_intent_drop_lock(&it);
4206
4207         if (rc == 0) {
4208                 /* set lock data in case this is a new lock */
4209                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4210                 lockh.cookie = it.it_lock_handle;
4211                 rc = ll_layout_lock_set(&lockh, mode, inode);
4212                 if (rc == -EAGAIN)
4213                         goto again;
4214         }
4215
4216         RETURN(rc);
4217 }
4218
4219 /**
4220  * This function checks if there exists a LAYOUT lock on the client side,
4221  * or enqueues it if it doesn't have one in cache.
4222  *
4223  * This function will not hold layout lock so it may be revoked any time after
4224  * this function returns. Any operations depend on layout should be redone
4225  * in that case.
4226  *
4227  * This function should be called before lov_io_init() to get an uptodate
4228  * layout version, the caller should save the version number and after IO
4229  * is finished, this function should be called again to verify that layout
4230  * is not changed during IO time.
4231  */
4232 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4233 {
4234         struct ll_inode_info    *lli = ll_i2info(inode);
4235         struct ll_sb_info       *sbi = ll_i2sbi(inode);
4236         int rc;
4237         ENTRY;
4238
4239         *gen = ll_layout_version_get(lli);
4240         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4241                 RETURN(0);
4242
4243         /* sanity checks */
4244         LASSERT(fid_is_sane(ll_inode2fid(inode)));
4245         LASSERT(S_ISREG(inode->i_mode));
4246
4247         /* take layout lock mutex to enqueue layout lock exclusively. */
4248         mutex_lock(&lli->lli_layout_mutex);
4249
4250         rc = ll_layout_refresh_locked(inode);
4251         if (rc < 0)
4252                 GOTO(out, rc);
4253
4254         *gen = ll_layout_version_get(lli);
4255 out:
4256         mutex_unlock(&lli->lli_layout_mutex);
4257
4258         RETURN(rc);
4259 }
4260
4261 /**
4262  *  This function send a restore request to the MDT
4263  */
4264 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4265 {
4266         struct hsm_user_request *hur;
4267         int                      len, rc;
4268         ENTRY;
4269
4270         len = sizeof(struct hsm_user_request) +
4271               sizeof(struct hsm_user_item);
4272         OBD_ALLOC(hur, len);
4273         if (hur == NULL)
4274                 RETURN(-ENOMEM);
4275
4276         hur->hur_request.hr_action = HUA_RESTORE;
4277         hur->hur_request.hr_archive_id = 0;
4278         hur->hur_request.hr_flags = 0;
4279         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4280                sizeof(hur->hur_user_item[0].hui_fid));
4281         hur->hur_user_item[0].hui_extent.offset = offset;
4282         hur->hur_user_item[0].hui_extent.length = length;
4283         hur->hur_request.hr_itemcount = 1;
4284         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
4285                            len, hur, NULL);
4286         OBD_FREE(hur, len);
4287         RETURN(rc);
4288 }