lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2016, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48 #include <lustre/ll_fiemap.h>
  49
  50 #include <lustre_ioctl.h>
  51 #include <lustre_swab.h>
  52
  53 #include "cl_object.h"
  54 #include "llite_internal.h"
  55 #include "vvp_internal.h"
  56
  57 static int
  58 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  59
  60 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  61                           bool *lease_broken);
  62
  63 static enum llioc_iter
  64 ll_iocontrol_call(struct inode *inode, struct file *file,
  65                   unsigned int cmd, unsigned long arg, int *rcp);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                      ATTR_MTIME | ATTR_MTIME_SET |
 104                                      ATTR_CTIME | ATTR_CTIME_SET;
 105         op_data->op_attr_blocks = inode->i_blocks;
 106         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 107         op_data->op_handle = och->och_fh;
 108
 109         if (och->och_flags & FMODE_WRITE &&
 110             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 111                 /* For HSM: if inode data has been modified, pack it so that
 112                  * MDT can set data dirty flag in the archive. */
 113                 op_data->op_bias |= MDS_DATA_MODIFIED;
 114
 115         EXIT;
 116 }
 117
 118 /**
 119  * Perform a close, possibly with a bias.
 120  * The meaning of "data" depends on the value of "bias".
 121  *
 122  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 123  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 124  * swap layouts with.
 125  */
 126 static int ll_close_inode_openhandle(struct inode *inode,
 127                                      struct obd_client_handle *och,
 128                                      enum mds_op_bias bias, void *data)
 129 {
 130         struct obd_export *md_exp = ll_i2mdexp(inode);
 131         const struct ll_inode_info *lli = ll_i2info(inode);
 132         struct md_op_data *op_data;
 133         struct ptlrpc_request *req = NULL;
 134         int rc;
 135         ENTRY;
 136
 137         if (class_exp2obd(md_exp) == NULL) {
 138                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 139                        ll_get_fsname(inode->i_sb, NULL, 0),
 140                        PFID(&lli->lli_fid));
 141                 GOTO(out, rc = 0);
 142         }
 143
 144         OBD_ALLOC_PTR(op_data);
 145         /* We leak openhandle and request here on error, but not much to be
 146          * done in OOM case since app won't retry close on error either. */
 147         if (op_data == NULL)
 148                 GOTO(out, rc = -ENOMEM);
 149
 150         ll_prepare_close(inode, op_data, och);
 151         switch (bias) {
 152         case MDS_CLOSE_LAYOUT_SWAP:
 153                 LASSERT(data != NULL);
 154                 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
 155                 op_data->op_data_version = 0;
 156                 op_data->op_lease_handle = och->och_lease_handle;
 157                 op_data->op_fid2 = *ll_inode2fid(data);
 158                 break;
 159
 160         case MDS_HSM_RELEASE:
 161                 LASSERT(data != NULL);
 162                 op_data->op_bias |= MDS_HSM_RELEASE;
 163                 op_data->op_data_version = *(__u64 *)data;
 164                 op_data->op_lease_handle = och->och_lease_handle;
 165                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 166                 break;
 167
 168         default:
 169                 LASSERT(data == NULL);
 170                 break;
 171         }
 172
 173         rc = md_close(md_exp, op_data, och->och_mod, &req);
 174         if (rc != 0 && rc != -EINTR)
 175                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 176                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 177
 178         if (rc == 0 &&
 179             op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
 180                 struct mdt_body *body;
 181
 182                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 183                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 184                         rc = -EBUSY;
 185         }
 186
 187         ll_finish_md_op_data(op_data);
 188         EXIT;
 189 out:
 190
 191         md_clear_open_replay_data(md_exp, och);
 192         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 193         OBD_FREE_PTR(och);
 194
 195         ptlrpc_req_finished(req);       /* This is close request */
 196         return rc;
 197 }
 198
 199 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 200 {
 201         struct ll_inode_info *lli = ll_i2info(inode);
 202         struct obd_client_handle **och_p;
 203         struct obd_client_handle *och;
 204         __u64 *och_usecount;
 205         int rc = 0;
 206         ENTRY;
 207
 208         if (fmode & FMODE_WRITE) {
 209                 och_p = &lli->lli_mds_write_och;
 210                 och_usecount = &lli->lli_open_fd_write_count;
 211         } else if (fmode & FMODE_EXEC) {
 212                 och_p = &lli->lli_mds_exec_och;
 213                 och_usecount = &lli->lli_open_fd_exec_count;
 214         } else {
 215                 LASSERT(fmode & FMODE_READ);
 216                 och_p = &lli->lli_mds_read_och;
 217                 och_usecount = &lli->lli_open_fd_read_count;
 218         }
 219
 220         mutex_lock(&lli->lli_och_mutex);
 221         if (*och_usecount > 0) {
 222                 /* There are still users of this handle, so skip
 223                  * freeing it. */
 224                 mutex_unlock(&lli->lli_och_mutex);
 225                 RETURN(0);
 226         }
 227
 228         och = *och_p;
 229         *och_p = NULL;
 230         mutex_unlock(&lli->lli_och_mutex);
 231
 232         if (och != NULL) {
 233                 /* There might be a race and this handle may already
 234                  * be closed. */
 235                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 236         }
 237
 238         RETURN(rc);
 239 }
 240
 241 static int ll_md_close(struct inode *inode, struct file *file)
 242 {
 243         union ldlm_policy_data policy = {
 244                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 245         };
 246         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 247         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 248         struct ll_inode_info *lli = ll_i2info(inode);
 249         struct lustre_handle lockh;
 250         enum ldlm_mode lockmode;
 251         int rc = 0;
 252         ENTRY;
 253
 254         /* clear group lock, if present */
 255         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 256                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 257
 258         if (fd->fd_lease_och != NULL) {
 259                 bool lease_broken;
 260
 261                 /* Usually the lease is not released when the
 262                  * application crashed, we need to release here. */
 263                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 264                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 265                         PFID(&lli->lli_fid), rc, lease_broken);
 266
 267                 fd->fd_lease_och = NULL;
 268         }
 269
 270         if (fd->fd_och != NULL) {
 271                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 272                 fd->fd_och = NULL;
 273                 GOTO(out, rc);
 274         }
 275
 276         /* Let's see if we have good enough OPEN lock on the file and if
 277            we can skip talking to MDS */
 278         mutex_lock(&lli->lli_och_mutex);
 279         if (fd->fd_omode & FMODE_WRITE) {
 280                 lockmode = LCK_CW;
 281                 LASSERT(lli->lli_open_fd_write_count);
 282                 lli->lli_open_fd_write_count--;
 283         } else if (fd->fd_omode & FMODE_EXEC) {
 284                 lockmode = LCK_PR;
 285                 LASSERT(lli->lli_open_fd_exec_count);
 286                 lli->lli_open_fd_exec_count--;
 287         } else {
 288                 lockmode = LCK_CR;
 289                 LASSERT(lli->lli_open_fd_read_count);
 290                 lli->lli_open_fd_read_count--;
 291         }
 292         mutex_unlock(&lli->lli_och_mutex);
 293
 294         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 295                            LDLM_IBITS, &policy, lockmode, &lockh))
 296                 rc = ll_md_real_close(inode, fd->fd_omode);
 297
 298 out:
 299         LUSTRE_FPRIVATE(file) = NULL;
 300         ll_file_data_put(fd);
 301
 302         RETURN(rc);
 303 }
 304
 305 /* While this returns an error code, fput() the caller does not, so we need
 306  * to make every effort to clean up all of our state here.  Also, applications
 307  * rarely check close errors and even if an error is returned they will not
 308  * re-try the close call.
 309  */
 310 int ll_file_release(struct inode *inode, struct file *file)
 311 {
 312         struct ll_file_data *fd;
 313         struct ll_sb_info *sbi = ll_i2sbi(inode);
 314         struct ll_inode_info *lli = ll_i2info(inode);
 315         int rc;
 316         ENTRY;
 317
 318         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 319                PFID(ll_inode2fid(inode)), inode);
 320
 321         if (inode->i_sb->s_root != file_dentry(file))
 322                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 323         fd = LUSTRE_FPRIVATE(file);
 324         LASSERT(fd != NULL);
 325
 326         /* The last ref on @file, maybe not the the owner pid of statahead,
 327          * because parent and child process can share the same file handle. */
 328         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 329                 ll_deauthorize_statahead(inode, fd);
 330
 331         if (inode->i_sb->s_root == file_dentry(file)) {
 332                 LUSTRE_FPRIVATE(file) = NULL;
 333                 ll_file_data_put(fd);
 334                 RETURN(0);
 335         }
 336
 337         if (!S_ISDIR(inode->i_mode)) {
 338                 if (lli->lli_clob != NULL)
 339                         lov_read_and_clear_async_rc(lli->lli_clob);
 340                 lli->lli_async_rc = 0;
 341         }
 342
 343         rc = ll_md_close(inode, file);
 344
 345         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 346                 libcfs_debug_dumplog();
 347
 348         RETURN(rc);
 349 }
 350
 351 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
 352                                 struct lookup_intent *itp)
 353 {
 354         struct dentry *de = file_dentry(file);
 355         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 356         struct dentry *parent = de->d_parent;
 357         const char *name = NULL;
 358         int len = 0;
 359         struct md_op_data *op_data;
 360         struct ptlrpc_request *req = NULL;
 361         int rc;
 362         ENTRY;
 363
 364         LASSERT(parent != NULL);
 365         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 366
 367         /* if server supports open-by-fid, or file name is invalid, don't pack
 368          * name in open request */
 369         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
 370             lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
 371                 name = de->d_name.name;
 372                 len = de->d_name.len;
 373         }
 374
 375         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 376                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 377         if (IS_ERR(op_data))
 378                 RETURN(PTR_ERR(op_data));
 379         op_data->op_data = lmm;
 380         op_data->op_data_size = lmmsize;
 381
 382         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 383                             &ll_md_blocking_ast, 0);
 384         ll_finish_md_op_data(op_data);
 385         if (rc == -ESTALE) {
 386                 /* reason for keep own exit path - don`t flood log
 387                  * with messages with -ESTALE errors.
 388                  */
 389                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 390                      it_open_error(DISP_OPEN_OPEN, itp))
 391                         GOTO(out, rc);
 392                 ll_release_openhandle(de, itp);
 393                 GOTO(out, rc);
 394         }
 395
 396         if (it_disposition(itp, DISP_LOOKUP_NEG))
 397                 GOTO(out, rc = -ENOENT);
 398
 399         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 400                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 401                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 402                 GOTO(out, rc);
 403         }
 404
 405         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 406         if (!rc && itp->it_lock_mode)
 407                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 408
 409 out:
 410         ptlrpc_req_finished(req);
 411         ll_intent_drop_lock(itp);
 412
 413         /* We did open by fid, but by the time we got to the server,
 414          * the object disappeared. If this is a create, we cannot really
 415          * tell the userspace that the file it was trying to create
 416          * does not exist. Instead let's return -ESTALE, and the VFS will
 417          * retry the create with LOOKUP_REVAL that we are going to catch
 418          * in ll_revalidate_dentry() and use lookup then.
 419          */
 420         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 421                 rc = -ESTALE;
 422
 423         RETURN(rc);
 424 }
 425
 426 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 427                        struct obd_client_handle *och)
 428 {
 429         struct mdt_body *body;
 430
 431         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 432         och->och_fh = body->mbo_handle;
 433         och->och_fid = body->mbo_fid1;
 434         och->och_lease_handle.cookie = it->it_lock_handle;
 435         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 436         och->och_flags = it->it_flags;
 437
 438         return md_set_open_replay_data(md_exp, och, it);
 439 }
 440
 441 static int ll_local_open(struct file *file, struct lookup_intent *it,
 442                          struct ll_file_data *fd, struct obd_client_handle *och)
 443 {
 444         struct inode *inode = file_inode(file);
 445         ENTRY;
 446
 447         LASSERT(!LUSTRE_FPRIVATE(file));
 448
 449         LASSERT(fd != NULL);
 450
 451         if (och) {
 452                 int rc;
 453
 454                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 455                 if (rc != 0)
 456                         RETURN(rc);
 457         }
 458
 459         LUSTRE_FPRIVATE(file) = fd;
 460         ll_readahead_init(inode, &fd->fd_ras);
 461         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 462
 463         /* ll_cl_context initialize */
 464         rwlock_init(&fd->fd_lock);
 465         INIT_LIST_HEAD(&fd->fd_lccs);
 466
 467         RETURN(0);
 468 }
 469
 470 /* Open a file, and (for the very first open) create objects on the OSTs at
 471  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 472  * creation or open until ll_lov_setstripe() ioctl is called.
 473  *
 474  * If we already have the stripe MD locally then we don't request it in
 475  * md_open(), by passing a lmm_size = 0.
 476  *
 477  * It is up to the application to ensure no other processes open this file
 478  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 479  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 480  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 481  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 482  */
 483 int ll_file_open(struct inode *inode, struct file *file)
 484 {
 485         struct ll_inode_info *lli = ll_i2info(inode);
 486         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 487                                           .it_flags = file->f_flags };
 488         struct obd_client_handle **och_p = NULL;
 489         __u64 *och_usecount = NULL;
 490         struct ll_file_data *fd;
 491         int rc = 0;
 492         ENTRY;
 493
 494         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 495                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 496
 497         it = file->private_data; /* XXX: compat macro */
 498         file->private_data = NULL; /* prevent ll_local_open assertion */
 499
 500         fd = ll_file_data_get();
 501         if (fd == NULL)
 502                 GOTO(out_openerr, rc = -ENOMEM);
 503
 504         fd->fd_file = file;
 505         if (S_ISDIR(inode->i_mode))
 506                 ll_authorize_statahead(inode, fd);
 507
 508         if (inode->i_sb->s_root == file_dentry(file)) {
 509                 LUSTRE_FPRIVATE(file) = fd;
 510                 RETURN(0);
 511         }
 512
 513         if (!it || !it->it_disposition) {
 514                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 515                  * because everything but O_ACCMODE mask was stripped from
 516                  * there */
 517                 if ((oit.it_flags + 1) & O_ACCMODE)
 518                         oit.it_flags++;
 519                 if (file->f_flags & O_TRUNC)
 520                         oit.it_flags |= FMODE_WRITE;
 521
 522                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 523                  * dentry_open after call to open_namei that checks permissions.
 524                  * Only nfsd_open call dentry_open directly without checking
 525                  * permissions and because of that this code below is safe. */
 526                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 527                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 528
 529                 /* We do not want O_EXCL here, presumably we opened the file
 530                  * already? XXX - NFS implications? */
 531                 oit.it_flags &= ~O_EXCL;
 532
 533                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 534                  * created if necessary, then "IT_CREAT" should be set to keep
 535                  * consistent with it */
 536                 if (oit.it_flags & O_CREAT)
 537                         oit.it_op |= IT_CREAT;
 538
 539                 it = &oit;
 540         }
 541
 542 restart:
 543         /* Let's see if we have file open on MDS already. */
 544         if (it->it_flags & FMODE_WRITE) {
 545                 och_p = &lli->lli_mds_write_och;
 546                 och_usecount = &lli->lli_open_fd_write_count;
 547         } else if (it->it_flags & FMODE_EXEC) {
 548                 och_p = &lli->lli_mds_exec_och;
 549                 och_usecount = &lli->lli_open_fd_exec_count;
 550          } else {
 551                 och_p = &lli->lli_mds_read_och;
 552                 och_usecount = &lli->lli_open_fd_read_count;
 553         }
 554
 555         mutex_lock(&lli->lli_och_mutex);
 556         if (*och_p) { /* Open handle is present */
 557                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 558                         /* Well, there's extra open request that we do not need,
 559                            let's close it somehow. This will decref request. */
 560                         rc = it_open_error(DISP_OPEN_OPEN, it);
 561                         if (rc) {
 562                                 mutex_unlock(&lli->lli_och_mutex);
 563                                 GOTO(out_openerr, rc);
 564                         }
 565
 566                         ll_release_openhandle(file_dentry(file), it);
 567                 }
 568                 (*och_usecount)++;
 569
 570                 rc = ll_local_open(file, it, fd, NULL);
 571                 if (rc) {
 572                         (*och_usecount)--;
 573                         mutex_unlock(&lli->lli_och_mutex);
 574                         GOTO(out_openerr, rc);
 575                 }
 576         } else {
 577                 LASSERT(*och_usecount == 0);
 578                 if (!it->it_disposition) {
 579                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 580                         /* We cannot just request lock handle now, new ELC code
 581                            means that one of other OPEN locks for this file
 582                            could be cancelled, and since blocking ast handler
 583                            would attempt to grab och_mutex as well, that would
 584                            result in a deadlock */
 585                         mutex_unlock(&lli->lli_och_mutex);
 586                         /*
 587                          * Normally called under two situations:
 588                          * 1. NFS export.
 589                          * 2. A race/condition on MDS resulting in no open
 590                          *    handle to be returned from LOOKUP|OPEN request,
 591                          *    for example if the target entry was a symlink.
 592                          *
 593                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 594                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 595                          *  bit so that it's not confusing later callers.
 596                          *
 597                          *  NB; when ldd is NULL, it must have come via normal
 598                          *  lookup path only, since ll_iget_for_nfs always calls
 599                          *  ll_d_init().
 600                          */
 601                         if (ldd && ldd->lld_nfs_dentry) {
 602                                 ldd->lld_nfs_dentry = 0;
 603                                 it->it_flags |= MDS_OPEN_LOCK;
 604                         }
 605
 606                          /*
 607                          * Always specify MDS_OPEN_BY_FID because we don't want
 608                          * to get file with different fid.
 609                          */
 610                         it->it_flags |= MDS_OPEN_BY_FID;
 611                         rc = ll_intent_file_open(file, NULL, 0, it);
 612                         if (rc)
 613                                 GOTO(out_openerr, rc);
 614
 615                         goto restart;
 616                 }
 617                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 618                 if (!*och_p)
 619                         GOTO(out_och_free, rc = -ENOMEM);
 620
 621                 (*och_usecount)++;
 622
 623                 /* md_intent_lock() didn't get a request ref if there was an
 624                  * open error, so don't do cleanup on the request here
 625                  * (bug 3430) */
 626                 /* XXX (green): Should not we bail out on any error here, not
 627                  * just open error? */
 628                 rc = it_open_error(DISP_OPEN_OPEN, it);
 629                 if (rc != 0)
 630                         GOTO(out_och_free, rc);
 631
 632                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 633                          "inode %p: disposition %x, status %d\n", inode,
 634                          it_disposition(it, ~0), it->it_status);
 635
 636                 rc = ll_local_open(file, it, fd, *och_p);
 637                 if (rc)
 638                         GOTO(out_och_free, rc);
 639         }
 640         mutex_unlock(&lli->lli_och_mutex);
 641         fd = NULL;
 642
 643         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 644            different kind of OPEN lock for this same inode gets cancelled
 645            by ldlm_cancel_lru */
 646         if (!S_ISREG(inode->i_mode))
 647                 GOTO(out_och_free, rc);
 648
 649         cl_lov_delay_create_clear(&file->f_flags);
 650         GOTO(out_och_free, rc);
 651
 652 out_och_free:
 653         if (rc) {
 654                 if (och_p && *och_p) {
 655                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 656                         *och_p = NULL; /* OBD_FREE writes some magic there */
 657                         (*och_usecount)--;
 658                 }
 659                 mutex_unlock(&lli->lli_och_mutex);
 660
 661 out_openerr:
 662                 if (lli->lli_opendir_key == fd)
 663                         ll_deauthorize_statahead(inode, fd);
 664                 if (fd != NULL)
 665                         ll_file_data_put(fd);
 666         } else {
 667                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 668         }
 669
 670         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 671                 ptlrpc_req_finished(it->it_request);
 672                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 673         }
 674
 675         return rc;
 676 }
 677
 678 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 679                         struct ldlm_lock_desc *desc, void *data, int flag)
 680 {
 681         int rc;
 682         struct lustre_handle lockh;
 683         ENTRY;
 684
 685         switch (flag) {
 686         case LDLM_CB_BLOCKING:
 687                 ldlm_lock2handle(lock, &lockh);
 688                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 689                 if (rc < 0) {
 690                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 691                         RETURN(rc);
 692                 }
 693                 break;
 694         case LDLM_CB_CANCELING:
 695                 /* do nothing */
 696                 break;
 697         }
 698         RETURN(0);
 699 }
 700
 701 /**
 702  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 703  * and save it as fd->fd_och so as to force client to reopen the file even
 704  * if it has an open lock in cache already.
 705  */
 706 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 707                                 struct lustre_handle *old_handle)
 708 {
 709         struct ll_inode_info *lli = ll_i2info(inode);
 710         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 711         struct obd_client_handle **och_p;
 712         __u64 *och_usecount;
 713         int rc = 0;
 714         ENTRY;
 715
 716         /* Get the openhandle of the file */
 717         mutex_lock(&lli->lli_och_mutex);
 718         if (fd->fd_lease_och != NULL)
 719                 GOTO(out_unlock, rc = -EBUSY);
 720
 721         if (fd->fd_och == NULL) {
 722                 if (file->f_mode & FMODE_WRITE) {
 723                         LASSERT(lli->lli_mds_write_och != NULL);
 724                         och_p = &lli->lli_mds_write_och;
 725                         och_usecount = &lli->lli_open_fd_write_count;
 726                 } else {
 727                         LASSERT(lli->lli_mds_read_och != NULL);
 728                         och_p = &lli->lli_mds_read_och;
 729                         och_usecount = &lli->lli_open_fd_read_count;
 730                 }
 731
 732                 if (*och_usecount > 1)
 733                         GOTO(out_unlock, rc = -EBUSY);
 734
 735                 fd->fd_och = *och_p;
 736                 *och_usecount = 0;
 737                 *och_p = NULL;
 738         }
 739
 740         *old_handle = fd->fd_och->och_fh;
 741
 742         EXIT;
 743 out_unlock:
 744         mutex_unlock(&lli->lli_och_mutex);
 745         return rc;
 746 }
 747
 748 /**
 749  * Release ownership on lli_mds_*_och when putting back a file lease.
 750  */
 751 static int ll_lease_och_release(struct inode *inode, struct file *file)
 752 {
 753         struct ll_inode_info *lli = ll_i2info(inode);
 754         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 755         struct obd_client_handle **och_p;
 756         struct obd_client_handle *old_och = NULL;
 757         __u64 *och_usecount;
 758         int rc = 0;
 759         ENTRY;
 760
 761         mutex_lock(&lli->lli_och_mutex);
 762         if (file->f_mode & FMODE_WRITE) {
 763                 och_p = &lli->lli_mds_write_och;
 764                 och_usecount = &lli->lli_open_fd_write_count;
 765         } else {
 766                 och_p = &lli->lli_mds_read_och;
 767                 och_usecount = &lli->lli_open_fd_read_count;
 768         }
 769
 770         /* The file may have been open by another process (broken lease) so
 771          * *och_p is not NULL. In this case we should simply increase usecount
 772          * and close fd_och.
 773          */
 774         if (*och_p != NULL) {
 775                 old_och = fd->fd_och;
 776                 (*och_usecount)++;
 777         } else {
 778                 *och_p = fd->fd_och;
 779                 *och_usecount = 1;
 780         }
 781         fd->fd_och = NULL;
 782         mutex_unlock(&lli->lli_och_mutex);
 783
 784         if (old_och != NULL)
 785                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 786
 787         RETURN(rc);
 788 }
 789
 790 /**
 791  * Acquire a lease and open the file.
 792  */
 793 static struct obd_client_handle *
 794 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 795               __u64 open_flags)
 796 {
 797         struct lookup_intent it = { .it_op = IT_OPEN };
 798         struct ll_sb_info *sbi = ll_i2sbi(inode);
 799         struct md_op_data *op_data;
 800         struct ptlrpc_request *req = NULL;
 801         struct lustre_handle old_handle = { 0 };
 802         struct obd_client_handle *och = NULL;
 803         int rc;
 804         int rc2;
 805         ENTRY;
 806
 807         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 808                 RETURN(ERR_PTR(-EINVAL));
 809
 810         if (file != NULL) {
 811                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 812                         RETURN(ERR_PTR(-EPERM));
 813
 814                 rc = ll_lease_och_acquire(inode, file, &old_handle);
 815                 if (rc)
 816                         RETURN(ERR_PTR(rc));
 817         }
 818
 819         OBD_ALLOC_PTR(och);
 820         if (och == NULL)
 821                 RETURN(ERR_PTR(-ENOMEM));
 822
 823         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 824                                         LUSTRE_OPC_ANY, NULL);
 825         if (IS_ERR(op_data))
 826                 GOTO(out, rc = PTR_ERR(op_data));
 827
 828         /* To tell the MDT this openhandle is from the same owner */
 829         op_data->op_handle = old_handle;
 830
 831         it.it_flags = fmode | open_flags;
 832         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 833         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
 834                             &ll_md_blocking_lease_ast,
 835         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
 836          * it can be cancelled which may mislead applications that the lease is
 837          * broken;
 838          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
 839          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
 840          * doesn't deal with openhandle, so normal openhandle will be leaked. */
 841                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
 842         ll_finish_md_op_data(op_data);
 843         ptlrpc_req_finished(req);
 844         if (rc < 0)
 845                 GOTO(out_release_it, rc);
 846
 847         if (it_disposition(&it, DISP_LOOKUP_NEG))
 848                 GOTO(out_release_it, rc = -ENOENT);
 849
 850         rc = it_open_error(DISP_OPEN_OPEN, &it);
 851         if (rc)
 852                 GOTO(out_release_it, rc);
 853
 854         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
 855         ll_och_fill(sbi->ll_md_exp, &it, och);
 856
 857         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
 858                 GOTO(out_close, rc = -EOPNOTSUPP);
 859
 860         /* already get lease, handle lease lock */
 861         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
 862         if (it.it_lock_mode == 0 ||
 863             it.it_lock_bits != MDS_INODELOCK_OPEN) {
 864                 /* open lock must return for lease */
 865                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
 866                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
 867                         it.it_lock_bits);
 868                 GOTO(out_close, rc = -EPROTO);
 869         }
 870
 871         ll_intent_release(&it);
 872         RETURN(och);
 873
 874 out_close:
 875         /* Cancel open lock */
 876         if (it.it_lock_mode != 0) {
 877                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
 878                                             it.it_lock_mode);
 879                 it.it_lock_mode = 0;
 880                 och->och_lease_handle.cookie = 0ULL;
 881         }
 882         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
 883         if (rc2 < 0)
 884                 CERROR("%s: error closing file "DFID": %d\n",
 885                        ll_get_fsname(inode->i_sb, NULL, 0),
 886                        PFID(&ll_i2info(inode)->lli_fid), rc2);
 887         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
 888 out_release_it:
 889         ll_intent_release(&it);
 890 out:
 891         if (och != NULL)
 892                 OBD_FREE_PTR(och);
 893         RETURN(ERR_PTR(rc));
 894 }
 895
 896 /**
 897  * Check whether a layout swap can be done between two inodes.
 898  *
 899  * \param[in] inode1  First inode to check
 900  * \param[in] inode2  Second inode to check
 901  *
 902  * \retval 0 on success, layout swap can be performed between both inodes
 903  * \retval negative error code if requirements are not met
 904  */
 905 static int ll_check_swap_layouts_validity(struct inode *inode1,
 906                                           struct inode *inode2)
 907 {
 908         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
 909                 return -EINVAL;
 910
 911         if (inode_permission(inode1, MAY_WRITE) ||
 912             inode_permission(inode2, MAY_WRITE))
 913                 return -EPERM;
 914
 915         if (inode1->i_sb != inode2->i_sb)
 916                 return -EXDEV;
 917
 918         return 0;
 919 }
 920
 921 static int ll_swap_layouts_close(struct obd_client_handle *och,
 922                                  struct inode *inode, struct inode *inode2)
 923 {
 924         const struct lu_fid     *fid1 = ll_inode2fid(inode);
 925         const struct lu_fid     *fid2;
 926         int                      rc;
 927         ENTRY;
 928
 929         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
 930                ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
 931
 932         rc = ll_check_swap_layouts_validity(inode, inode2);
 933         if (rc < 0)
 934                 GOTO(out_free_och, rc);
 935
 936         /* We now know that inode2 is a lustre inode */
 937         fid2 = ll_inode2fid(inode2);
 938
 939         rc = lu_fid_cmp(fid1, fid2);
 940         if (rc == 0)
 941                 GOTO(out_free_och, rc = -EINVAL);
 942
 943         /* Close the file and swap layouts between inode & inode2.
 944          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
 945          * because we still need it to pack l_remote_handle to MDT. */
 946         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
 947                                        inode2);
 948
 949         och = NULL; /* freed in ll_close_inode_openhandle() */
 950
 951 out_free_och:
 952         if (och != NULL)
 953                 OBD_FREE_PTR(och);
 954
 955         RETURN(rc);
 956 }
 957
 958 /**
 959  * Release lease and close the file.
 960  * It will check if the lease has ever broken.
 961  */
 962 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 963                           bool *lease_broken)
 964 {
 965         struct ldlm_lock *lock;
 966         bool cancelled = true;
 967         int rc;
 968         ENTRY;
 969
 970         lock = ldlm_handle2lock(&och->och_lease_handle);
 971         if (lock != NULL) {
 972                 lock_res_and_lock(lock);
 973                 cancelled = ldlm_is_cancel(lock);
 974                 unlock_res_and_lock(lock);
 975                 LDLM_LOCK_PUT(lock);
 976         }
 977
 978         CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
 979                PFID(&ll_i2info(inode)->lli_fid), cancelled);
 980
 981         if (!cancelled)
 982                 ldlm_cli_cancel(&och->och_lease_handle, 0);
 983
 984         if (lease_broken != NULL)
 985                 *lease_broken = cancelled;
 986
 987         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 988         RETURN(rc);
 989 }
 990
 991 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
 992 {
 993         struct ll_inode_info *lli = ll_i2info(inode);
 994         struct cl_object *obj = lli->lli_clob;
 995         struct cl_attr *attr = vvp_env_thread_attr(env);
 996         s64 atime;
 997         s64 mtime;
 998         s64 ctime;
 999         int rc = 0;
1000
1001         ENTRY;
1002
1003         ll_inode_size_lock(inode);
1004
1005         /* Merge timestamps the most recently obtained from MDS with
1006          * timestamps obtained from OSTs.
1007          *
1008          * Do not overwrite atime of inode because it may be refreshed
1009          * by file_accessed() function. If the read was served by cache
1010          * data, there is no RPC to be sent so that atime may not be
1011          * transferred to OSTs at all. MDT only updates atime at close time
1012          * if it's at least 'mdd.*.atime_diff' older.
1013          * All in all, the atime in Lustre does not strictly comply with
1014          * POSIX. Solving this problem needs to send an RPC to MDT for each
1015          * read, this will hurt performance. */
1016         if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1017                 LTIME_S(inode->i_atime) = lli->lli_atime;
1018                 lli->lli_update_atime = 0;
1019         }
1020         LTIME_S(inode->i_mtime) = lli->lli_mtime;
1021         LTIME_S(inode->i_ctime) = lli->lli_ctime;
1022
1023         atime = LTIME_S(inode->i_atime);
1024         mtime = LTIME_S(inode->i_mtime);
1025         ctime = LTIME_S(inode->i_ctime);
1026
1027         cl_object_attr_lock(obj);
1028         rc = cl_object_attr_get(env, obj, attr);
1029         cl_object_attr_unlock(obj);
1030
1031         if (rc != 0)
1032                 GOTO(out_size_unlock, rc);
1033
1034         if (atime < attr->cat_atime)
1035                 atime = attr->cat_atime;
1036
1037         if (ctime < attr->cat_ctime)
1038                 ctime = attr->cat_ctime;
1039
1040         if (mtime < attr->cat_mtime)
1041                 mtime = attr->cat_mtime;
1042
1043         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1044                PFID(&lli->lli_fid), attr->cat_size);
1045
1046         i_size_write(inode, attr->cat_size);
1047         inode->i_blocks = attr->cat_blocks;
1048
1049         LTIME_S(inode->i_atime) = atime;
1050         LTIME_S(inode->i_mtime) = mtime;
1051         LTIME_S(inode->i_ctime) = ctime;
1052
1053 out_size_unlock:
1054         ll_inode_size_unlock(inode);
1055
1056         RETURN(rc);
1057 }
1058
1059 static bool file_is_noatime(const struct file *file)
1060 {
1061         const struct vfsmount *mnt = file->f_path.mnt;
1062         const struct inode *inode = file_inode((struct file *)file);
1063
1064         /* Adapted from file_accessed() and touch_atime().*/
1065         if (file->f_flags & O_NOATIME)
1066                 return true;
1067
1068         if (inode->i_flags & S_NOATIME)
1069                 return true;
1070
1071         if (IS_NOATIME(inode))
1072                 return true;
1073
1074         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1075                 return true;
1076
1077         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1078                 return true;
1079
1080         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1081                 return true;
1082
1083         return false;
1084 }
1085
1086 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1087 {
1088         struct inode *inode = file_inode((struct file *)file);
1089
1090         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1091         if (write) {
1092                 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1093                 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1094                                       file->f_flags & O_DIRECT ||
1095                                       IS_SYNC(inode);
1096         }
1097         io->ci_obj     = ll_i2info(inode)->lli_clob;
1098         io->ci_lockreq = CILR_MAYBE;
1099         if (ll_file_nolock(file)) {
1100                 io->ci_lockreq = CILR_NEVER;
1101                 io->ci_no_srvlock = 1;
1102         } else if (file->f_flags & O_APPEND) {
1103                 io->ci_lockreq = CILR_MANDATORY;
1104         }
1105
1106         io->ci_noatime = file_is_noatime(file);
1107 }
1108
1109 static ssize_t
1110 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1111                    struct file *file, enum cl_io_type iot,
1112                    loff_t *ppos, size_t count)
1113 {
1114         struct vvp_io           *vio = vvp_env_io(env);
1115         struct inode            *inode = file_inode(file);
1116         struct ll_inode_info    *lli = ll_i2info(inode);
1117         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1118         struct cl_io            *io;
1119         ssize_t                 result = 0;
1120         int                     rc = 0;
1121         struct range_lock       range;
1122
1123         ENTRY;
1124
1125         CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: %llu, count: %zu\n",
1126                 file_dentry(file)->d_name.name, iot, *ppos, count);
1127
1128 restart:
1129         io = vvp_env_thread_io(env);
1130         ll_io_init(io, file, iot == CIT_WRITE);
1131
1132         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1133                 bool range_locked = false;
1134
1135                 if (file->f_flags & O_APPEND)
1136                         range_lock_init(&range, 0, LUSTRE_EOF);
1137                 else
1138                         range_lock_init(&range, *ppos, *ppos + count - 1);
1139
1140                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1141                 vio->vui_io_subtype = args->via_io_subtype;
1142
1143                 switch (vio->vui_io_subtype) {
1144                 case IO_NORMAL:
1145                         vio->vui_iter = args->u.normal.via_iter;
1146                         vio->vui_iocb = args->u.normal.via_iocb;
1147                         /* Direct IO reads must also take range lock,
1148                          * or multiple reads will try to work on the same pages
1149                          * See LU-6227 for details. */
1150                         if (((iot == CIT_WRITE) ||
1151                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1152                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1153                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1154                                        RL_PARA(&range));
1155                                 rc = range_lock(&lli->lli_write_tree, &range);
1156                                 if (rc < 0)
1157                                         GOTO(out, rc);
1158
1159                                 range_locked = true;
1160                         }
1161                         break;
1162                 case IO_SPLICE:
1163                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1164                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1165                         break;
1166                 default:
1167                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1168                         LBUG();
1169                 }
1170
1171                 ll_cl_add(file, env, io, LCC_RW);
1172                 rc = cl_io_loop(env, io);
1173                 ll_cl_remove(file, env);
1174
1175                 if (range_locked) {
1176                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1177                                RL_PARA(&range));
1178                         range_unlock(&lli->lli_write_tree, &range);
1179                 }
1180         } else {
1181                 /* cl_io_rw_init() handled IO */
1182                 rc = io->ci_result;
1183         }
1184
1185         if (io->ci_nob > 0) {
1186                 result += io->ci_nob;
1187                 count -= io->ci_nob;
1188                 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1189
1190                 /* prepare IO restart */
1191                 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1192                         args->u.normal.via_iter = vio->vui_iter;
1193         }
1194         GOTO(out, rc);
1195 out:
1196         cl_io_fini(env, io);
1197
1198         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1199                 CDEBUG(D_VFSTRACE,
1200                        "%s: restart %s from %lld, count:%zu, result: %zd\n",
1201                        file_dentry(file)->d_name.name,
1202                        iot == CIT_READ ? "read" : "write",
1203                        *ppos, count, result);
1204                 goto restart;
1205         }
1206
1207         if (iot == CIT_READ) {
1208                 if (result > 0)
1209                         ll_stats_ops_tally(ll_i2sbi(inode),
1210                                            LPROC_LL_READ_BYTES, result);
1211         } else if (iot == CIT_WRITE) {
1212                 if (result > 0) {
1213                         ll_stats_ops_tally(ll_i2sbi(inode),
1214                                            LPROC_LL_WRITE_BYTES, result);
1215                         fd->fd_write_failed = false;
1216                 } else if (result == 0 && rc == 0) {
1217                         rc = io->ci_result;
1218                         if (rc < 0)
1219                                 fd->fd_write_failed = true;
1220                         else
1221                                 fd->fd_write_failed = false;
1222                 } else if (rc != -ERESTARTSYS) {
1223                         fd->fd_write_failed = true;
1224                 }
1225         }
1226
1227         CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1228
1229         return result > 0 ? result : rc;
1230 }
1231
1232 /**
1233  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1234  * especially for small I/O.
1235  *
1236  * To serve a read request, CLIO has to create and initialize a cl_io and
1237  * then request DLM lock. This has turned out to have siginificant overhead
1238  * and affects the performance of small I/O dramatically.
1239  *
1240  * It's not necessary to create a cl_io for each I/O. Under the help of read
1241  * ahead, most of the pages being read are already in memory cache and we can
1242  * read those pages directly because if the pages exist, the corresponding DLM
1243  * lock must exist so that page content must be valid.
1244  *
1245  * In fast read implementation, the llite speculatively finds and reads pages
1246  * in memory cache. There are three scenarios for fast read:
1247  *   - If the page exists and is uptodate, kernel VM will provide the data and
1248  *     CLIO won't be intervened;
1249  *   - If the page was brought into memory by read ahead, it will be exported
1250  *     and read ahead parameters will be updated;
1251  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1252  *     it will go back and invoke normal read, i.e., a cl_io will be created
1253  *     and DLM lock will be requested.
1254  *
1255  * POSIX compliance: posix standard states that read is intended to be atomic.
1256  * Lustre read implementation is in line with Linux kernel read implementation
1257  * and neither of them complies with POSIX standard in this matter. Fast read
1258  * doesn't make the situation worse on single node but it may interleave write
1259  * results from multiple nodes due to short read handling in ll_file_aio_read().
1260  *
1261  * \param env - lu_env
1262  * \param iocb - kiocb from kernel
1263  * \param iter - user space buffers where the data will be copied
1264  *
1265  * \retval - number of bytes have been read, or error code if error occurred.
1266  */
1267 static ssize_t
1268 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1269                 struct iov_iter *iter)
1270 {
1271         ssize_t result;
1272
1273         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1274                 return 0;
1275
1276         /* NB: we can't do direct IO for fast read because it will need a lock
1277          * to make IO engine happy. */
1278         if (iocb->ki_filp->f_flags & O_DIRECT)
1279                 return 0;
1280
1281         ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
1282         result = generic_file_read_iter(iocb, iter);
1283         ll_cl_remove(iocb->ki_filp, env);
1284
1285         /* If the first page is not in cache, generic_file_aio_read() will be
1286          * returned with -ENODATA.
1287          * See corresponding code in ll_readpage(). */
1288         if (result == -ENODATA)
1289                 result = 0;
1290
1291         if (result > 0)
1292                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1293                                 LPROC_LL_READ_BYTES, result);
1294
1295         return result;
1296 }
1297
1298 /*
1299  * Read from a file (through the page cache).
1300  */
1301 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1302 {
1303         struct lu_env *env;
1304         struct vvp_io_args *args;
1305         ssize_t result;
1306         ssize_t rc2;
1307         __u16 refcheck;
1308
1309         env = cl_env_get(&refcheck);
1310         if (IS_ERR(env))
1311                 return PTR_ERR(env);
1312
1313         result = ll_do_fast_read(env, iocb, to);
1314         if (result < 0 || iov_iter_count(to) == 0)
1315                 GOTO(out, result);
1316
1317         args = ll_env_args(env, IO_NORMAL);
1318         args->u.normal.via_iter = to;
1319         args->u.normal.via_iocb = iocb;
1320
1321         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1322                                  &iocb->ki_pos, iov_iter_count(to));
1323         if (rc2 > 0)
1324                 result += rc2;
1325         else if (result == 0)
1326                 result = rc2;
1327
1328 out:
1329         cl_env_put(env, &refcheck);
1330         return result;
1331 }
1332
1333 /*
1334  * Write to a file (through the page cache).
1335  */
1336 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1337 {
1338         struct vvp_io_args *args;
1339         struct lu_env *env;
1340         ssize_t result;
1341         __u16 refcheck;
1342
1343         env = cl_env_get(&refcheck);
1344         if (IS_ERR(env))
1345                 return PTR_ERR(env);
1346
1347         args = ll_env_args(env, IO_NORMAL);
1348         args->u.normal.via_iter = from;
1349         args->u.normal.via_iocb = iocb;
1350
1351         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1352                                     &iocb->ki_pos, iov_iter_count(from));
1353         cl_env_put(env, &refcheck);
1354         return result;
1355 }
1356
1357 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1358 /*
1359  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1360  */
1361 static int ll_file_get_iov_count(const struct iovec *iov,
1362                                  unsigned long *nr_segs, size_t *count)
1363 {
1364         size_t cnt = 0;
1365         unsigned long seg;
1366
1367         for (seg = 0; seg < *nr_segs; seg++) {
1368                 const struct iovec *iv = &iov[seg];
1369
1370                 /*
1371                  * If any segment has a negative length, or the cumulative
1372                  * length ever wraps negative then return -EINVAL.
1373                  */
1374                 cnt += iv->iov_len;
1375                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1376                         return -EINVAL;
1377                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1378                         continue;
1379                 if (seg == 0)
1380                         return -EFAULT;
1381                 *nr_segs = seg;
1382                 cnt -= iv->iov_len;     /* This segment is no good */
1383                 break;
1384         }
1385         *count = cnt;
1386         return 0;
1387 }
1388
1389 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1390                                 unsigned long nr_segs, loff_t pos)
1391 {
1392         struct iov_iter to;
1393         size_t iov_count;
1394         ssize_t result;
1395         ENTRY;
1396
1397         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1398         if (result)
1399                 RETURN(result);
1400
1401 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1402         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1403 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1404         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1405 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1406
1407         result = ll_file_read_iter(iocb, &to);
1408
1409         RETURN(result);
1410 }
1411
1412 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1413                             loff_t *ppos)
1414 {
1415         struct lu_env *env;
1416         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1417         struct kiocb  *kiocb;
1418         ssize_t        result;
1419         __u16          refcheck;
1420         ENTRY;
1421
1422         env = cl_env_get(&refcheck);
1423         if (IS_ERR(env))
1424                 RETURN(PTR_ERR(env));
1425
1426         kiocb = &ll_env_info(env)->lti_kiocb;
1427         init_sync_kiocb(kiocb, file);
1428         kiocb->ki_pos = *ppos;
1429 #ifdef HAVE_KIOCB_KI_LEFT
1430         kiocb->ki_left = count;
1431 #elif defined(HAVE_KI_NBYTES)
1432         kiocb->ki_nbytes = count;
1433 #endif
1434
1435         result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1436         *ppos = kiocb->ki_pos;
1437
1438         cl_env_put(env, &refcheck);
1439         RETURN(result);
1440 }
1441
1442 /*
1443  * Write to a file (through the page cache).
1444  * AIO stuff
1445  */
1446 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1447                                  unsigned long nr_segs, loff_t pos)
1448 {
1449         struct iov_iter from;
1450         size_t iov_count;
1451         ssize_t result;
1452         ENTRY;
1453
1454         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1455         if (result)
1456                 RETURN(result);
1457
1458 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1459         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1460 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1461         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1462 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1463
1464         result = ll_file_write_iter(iocb, &from);
1465
1466         RETURN(result);
1467 }
1468
1469 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1470                              size_t count, loff_t *ppos)
1471 {
1472         struct lu_env *env;
1473         struct iovec   iov = { .iov_base = (void __user *)buf,
1474                                .iov_len = count };
1475         struct kiocb  *kiocb;
1476         ssize_t        result;
1477         __u16          refcheck;
1478         ENTRY;
1479
1480         env = cl_env_get(&refcheck);
1481         if (IS_ERR(env))
1482                 RETURN(PTR_ERR(env));
1483
1484         kiocb = &ll_env_info(env)->lti_kiocb;
1485         init_sync_kiocb(kiocb, file);
1486         kiocb->ki_pos = *ppos;
1487 #ifdef HAVE_KIOCB_KI_LEFT
1488         kiocb->ki_left = count;
1489 #elif defined(HAVE_KI_NBYTES)
1490         kiocb->ki_nbytes = count;
1491 #endif
1492
1493         result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1494         *ppos = kiocb->ki_pos;
1495
1496         cl_env_put(env, &refcheck);
1497         RETURN(result);
1498 }
1499 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1500
1501 /*
1502  * Send file content (through pagecache) somewhere with helper
1503  */
1504 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1505                                    struct pipe_inode_info *pipe, size_t count,
1506                                    unsigned int flags)
1507 {
1508         struct lu_env      *env;
1509         struct vvp_io_args *args;
1510         ssize_t             result;
1511         __u16               refcheck;
1512         ENTRY;
1513
1514         env = cl_env_get(&refcheck);
1515         if (IS_ERR(env))
1516                 RETURN(PTR_ERR(env));
1517
1518         args = ll_env_args(env, IO_SPLICE);
1519         args->u.splice.via_pipe = pipe;
1520         args->u.splice.via_flags = flags;
1521
1522         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1523         cl_env_put(env, &refcheck);
1524         RETURN(result);
1525 }
1526
1527 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1528                              __u64  flags, struct lov_user_md *lum,
1529                              int lum_size)
1530 {
1531         struct lookup_intent oit = {
1532                 .it_op = IT_OPEN,
1533                 .it_flags = flags | MDS_OPEN_BY_FID,
1534         };
1535         int rc;
1536         ENTRY;
1537
1538         ll_inode_size_lock(inode);
1539         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1540         if (rc < 0)
1541                 GOTO(out_unlock, rc);
1542
1543         ll_release_openhandle(file_dentry(file), &oit);
1544
1545 out_unlock:
1546         ll_inode_size_unlock(inode);
1547         ll_intent_release(&oit);
1548         cl_lov_delay_create_clear(&file->f_flags);
1549
1550         RETURN(rc);
1551 }
1552
1553 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1554                              struct lov_mds_md **lmmp, int *lmm_size,
1555                              struct ptlrpc_request **request)
1556 {
1557         struct ll_sb_info *sbi = ll_i2sbi(inode);
1558         struct mdt_body  *body;
1559         struct lov_mds_md *lmm = NULL;
1560         struct ptlrpc_request *req = NULL;
1561         struct md_op_data *op_data;
1562         int rc, lmmsize;
1563
1564         rc = ll_get_default_mdsize(sbi, &lmmsize);
1565         if (rc)
1566                 RETURN(rc);
1567
1568         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1569                                      strlen(filename), lmmsize,
1570                                      LUSTRE_OPC_ANY, NULL);
1571         if (IS_ERR(op_data))
1572                 RETURN(PTR_ERR(op_data));
1573
1574         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1575         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1576         ll_finish_md_op_data(op_data);
1577         if (rc < 0) {
1578                 CDEBUG(D_INFO, "md_getattr_name failed "
1579                        "on %s: rc %d\n", filename, rc);
1580                 GOTO(out, rc);
1581         }
1582
1583         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1584         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1585
1586         lmmsize = body->mbo_eadatasize;
1587
1588         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1589                         lmmsize == 0) {
1590                 GOTO(out, rc = -ENODATA);
1591         }
1592
1593         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1594         LASSERT(lmm != NULL);
1595
1596         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1597             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1598                 GOTO(out, rc = -EPROTO);
1599         }
1600
1601         /*
1602          * This is coming from the MDS, so is probably in
1603          * little endian.  We convert it to host endian before
1604          * passing it to userspace.
1605          */
1606         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1607                 int stripe_count;
1608
1609                 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1610                 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1611                         stripe_count = 0;
1612
1613                 /* if function called for directory - we should
1614                  * avoid swab not existent lsm objects */
1615                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1616                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1617                         if (S_ISREG(body->mbo_mode))
1618                                 lustre_swab_lov_user_md_objects(
1619                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1620                                     stripe_count);
1621                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1622                         lustre_swab_lov_user_md_v3(
1623                                 (struct lov_user_md_v3 *)lmm);
1624                         if (S_ISREG(body->mbo_mode))
1625                                 lustre_swab_lov_user_md_objects(
1626                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1627                                  stripe_count);
1628                 }
1629         }
1630
1631 out:
1632         *lmmp = lmm;
1633         *lmm_size = lmmsize;
1634         *request = req;
1635         return rc;
1636 }
1637
1638 static int ll_lov_setea(struct inode *inode, struct file *file,
1639                             unsigned long arg)
1640 {
1641         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1642         struct lov_user_md      *lump;
1643         int                      lum_size = sizeof(struct lov_user_md) +
1644                                             sizeof(struct lov_user_ost_data);
1645         int                      rc;
1646         ENTRY;
1647
1648         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1649                 RETURN(-EPERM);
1650
1651         OBD_ALLOC_LARGE(lump, lum_size);
1652         if (lump == NULL)
1653                 RETURN(-ENOMEM);
1654
1655         if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1656                 GOTO(out_lump, rc = -EFAULT);
1657
1658         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1659
1660 out_lump:
1661         OBD_FREE_LARGE(lump, lum_size);
1662         RETURN(rc);
1663 }
1664
1665 static int ll_file_getstripe(struct inode *inode,
1666                              struct lov_user_md __user *lum)
1667 {
1668         struct lu_env   *env;
1669         __u16           refcheck;
1670         int             rc;
1671         ENTRY;
1672
1673         env = cl_env_get(&refcheck);
1674         if (IS_ERR(env))
1675                 RETURN(PTR_ERR(env));
1676
1677         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1678         cl_env_put(env, &refcheck);
1679         RETURN(rc);
1680 }
1681
1682 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1683                             unsigned long arg)
1684 {
1685         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1686         struct lov_user_md        *klum;
1687         int                        lum_size, rc;
1688         __u64                      flags = FMODE_WRITE;
1689         ENTRY;
1690
1691         rc = ll_copy_user_md(lum, &klum);
1692         if (rc < 0)
1693                 RETURN(rc);
1694
1695         lum_size = rc;
1696         rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1697         if (rc == 0) {
1698                 __u32 gen;
1699
1700                 put_user(0, &lum->lmm_stripe_count);
1701
1702                 ll_layout_refresh(inode, &gen);
1703                 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1704         }
1705
1706         OBD_FREE(klum, lum_size);
1707         RETURN(rc);
1708 }
1709
1710 static int
1711 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1712 {
1713         struct ll_inode_info   *lli = ll_i2info(inode);
1714         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1715         struct ll_grouplock     grouplock;
1716         int                     rc;
1717         ENTRY;
1718
1719         if (arg == 0) {
1720                 CWARN("group id for group lock must not be 0\n");
1721                 RETURN(-EINVAL);
1722         }
1723
1724         if (ll_file_nolock(file))
1725                 RETURN(-EOPNOTSUPP);
1726
1727         spin_lock(&lli->lli_lock);
1728         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1729                 CWARN("group lock already existed with gid %lu\n",
1730                       fd->fd_grouplock.lg_gid);
1731                 spin_unlock(&lli->lli_lock);
1732                 RETURN(-EINVAL);
1733         }
1734         LASSERT(fd->fd_grouplock.lg_lock == NULL);
1735         spin_unlock(&lli->lli_lock);
1736
1737         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1738                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1739         if (rc)
1740                 RETURN(rc);
1741
1742         spin_lock(&lli->lli_lock);
1743         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1744                 spin_unlock(&lli->lli_lock);
1745                 CERROR("another thread just won the race\n");
1746                 cl_put_grouplock(&grouplock);
1747                 RETURN(-EINVAL);
1748         }
1749
1750         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1751         fd->fd_grouplock = grouplock;
1752         spin_unlock(&lli->lli_lock);
1753
1754         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1755         RETURN(0);
1756 }
1757
1758 static int ll_put_grouplock(struct inode *inode, struct file *file,
1759                             unsigned long arg)
1760 {
1761         struct ll_inode_info   *lli = ll_i2info(inode);
1762         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1763         struct ll_grouplock     grouplock;
1764         ENTRY;
1765
1766         spin_lock(&lli->lli_lock);
1767         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1768                 spin_unlock(&lli->lli_lock);
1769                 CWARN("no group lock held\n");
1770                 RETURN(-EINVAL);
1771         }
1772
1773         LASSERT(fd->fd_grouplock.lg_lock != NULL);
1774
1775         if (fd->fd_grouplock.lg_gid != arg) {
1776                 CWARN("group lock %lu doesn't match current id %lu\n",
1777                       arg, fd->fd_grouplock.lg_gid);
1778                 spin_unlock(&lli->lli_lock);
1779                 RETURN(-EINVAL);
1780         }
1781
1782         grouplock = fd->fd_grouplock;
1783         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1784         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1785         spin_unlock(&lli->lli_lock);
1786
1787         cl_put_grouplock(&grouplock);
1788         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1789         RETURN(0);
1790 }
1791
1792 /**
1793  * Close inode open handle
1794  *
1795  * \param dentry [in]     dentry which contains the inode
1796  * \param it     [in,out] intent which contains open info and result
1797  *
1798  * \retval 0     success
1799  * \retval <0    failure
1800  */
1801 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1802 {
1803         struct inode *inode = dentry->d_inode;
1804         struct obd_client_handle *och;
1805         int rc;
1806         ENTRY;
1807
1808         LASSERT(inode);
1809
1810         /* Root ? Do nothing. */
1811         if (dentry->d_inode->i_sb->s_root == dentry)
1812                 RETURN(0);
1813
1814         /* No open handle to close? Move away */
1815         if (!it_disposition(it, DISP_OPEN_OPEN))
1816                 RETURN(0);
1817
1818         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1819
1820         OBD_ALLOC(och, sizeof(*och));
1821         if (!och)
1822                 GOTO(out, rc = -ENOMEM);
1823
1824         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1825
1826         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1827 out:
1828         /* this one is in place of ll_file_open */
1829         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1830                 ptlrpc_req_finished(it->it_request);
1831                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1832         }
1833         RETURN(rc);
1834 }
1835
1836 /**
1837  * Get size for inode for which FIEMAP mapping is requested.
1838  * Make the FIEMAP get_info call and returns the result.
1839  * \param fiemap        kernel buffer to hold extens
1840  * \param num_bytes     kernel buffer size
1841  */
1842 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1843                         size_t num_bytes)
1844 {
1845         struct lu_env                   *env;
1846         __u16                           refcheck;
1847         int                             rc = 0;
1848         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
1849         ENTRY;
1850
1851         /* Checks for fiemap flags */
1852         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1853                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1854                 return -EBADR;
1855         }
1856
1857         /* Check for FIEMAP_FLAG_SYNC */
1858         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1859                 rc = filemap_fdatawrite(inode->i_mapping);
1860                 if (rc)
1861                         return rc;
1862         }
1863
1864         env = cl_env_get(&refcheck);
1865         if (IS_ERR(env))
1866                 RETURN(PTR_ERR(env));
1867
1868         if (i_size_read(inode) == 0) {
1869                 rc = ll_glimpse_size(inode);
1870                 if (rc)
1871                         GOTO(out, rc);
1872         }
1873
1874         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1875         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1876         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1877
1878         /* If filesize is 0, then there would be no objects for mapping */
1879         if (fmkey.lfik_oa.o_size == 0) {
1880                 fiemap->fm_mapped_extents = 0;
1881                 GOTO(out, rc = 0);
1882         }
1883
1884         fmkey.lfik_fiemap = *fiemap;
1885
1886         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1887                               &fmkey, fiemap, &num_bytes);
1888 out:
1889         cl_env_put(env, &refcheck);
1890         RETURN(rc);
1891 }
1892
1893 int ll_fid2path(struct inode *inode, void __user *arg)
1894 {
1895         struct obd_export       *exp = ll_i2mdexp(inode);
1896         const struct getinfo_fid2path __user *gfin = arg;
1897         __u32                    pathlen;
1898         struct getinfo_fid2path *gfout;
1899         size_t                   outsize;
1900         int                      rc;
1901
1902         ENTRY;
1903
1904         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1905             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1906                 RETURN(-EPERM);
1907
1908         /* Only need to get the buflen */
1909         if (get_user(pathlen, &gfin->gf_pathlen))
1910                 RETURN(-EFAULT);
1911
1912         if (pathlen > PATH_MAX)
1913                 RETURN(-EINVAL);
1914
1915         outsize = sizeof(*gfout) + pathlen;
1916         OBD_ALLOC(gfout, outsize);
1917         if (gfout == NULL)
1918                 RETURN(-ENOMEM);
1919
1920         if (copy_from_user(gfout, arg, sizeof(*gfout)))
1921                 GOTO(gf_free, rc = -EFAULT);
1922         /* append root FID after gfout to let MDT know the root FID so that it
1923          * can lookup the correct path, this is mainly for fileset.
1924          * old server without fileset mount support will ignore this. */
1925         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1926
1927         /* Call mdc_iocontrol */
1928         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1929         if (rc != 0)
1930                 GOTO(gf_free, rc);
1931
1932         if (copy_to_user(arg, gfout, outsize))
1933                 rc = -EFAULT;
1934
1935 gf_free:
1936         OBD_FREE(gfout, outsize);
1937         RETURN(rc);
1938 }
1939
1940 /*
1941  * Read the data_version for inode.
1942  *
1943  * This value is computed using stripe object version on OST.
1944  * Version is computed using server side locking.
1945  *
1946  * @param flags if do sync on the OST side;
1947  *              0: no sync
1948  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1949  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1950  */
1951 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1952 {
1953         struct cl_object *obj = ll_i2info(inode)->lli_clob;
1954         struct lu_env *env;
1955         struct cl_io *io;
1956         __u16  refcheck;
1957         int result;
1958
1959         ENTRY;
1960
1961         /* If no file object initialized, we consider its version is 0. */
1962         if (obj == NULL) {
1963                 *data_version = 0;
1964                 RETURN(0);
1965         }
1966
1967         env = cl_env_get(&refcheck);
1968         if (IS_ERR(env))
1969                 RETURN(PTR_ERR(env));
1970
1971         io = vvp_env_thread_io(env);
1972         io->ci_obj = obj;
1973         io->u.ci_data_version.dv_data_version = 0;
1974         io->u.ci_data_version.dv_flags = flags;
1975
1976 restart:
1977         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1978                 result = cl_io_loop(env, io);
1979         else
1980                 result = io->ci_result;
1981
1982         *data_version = io->u.ci_data_version.dv_data_version;
1983
1984         cl_io_fini(env, io);
1985
1986         if (unlikely(io->ci_need_restart))
1987                 goto restart;
1988
1989         cl_env_put(env, &refcheck);
1990
1991         RETURN(result);
1992 }
1993
1994 /*
1995  * Trigger a HSM release request for the provided inode.
1996  */
1997 int ll_hsm_release(struct inode *inode)
1998 {
1999         struct lu_env *env;
2000         struct obd_client_handle *och = NULL;
2001         __u64 data_version = 0;
2002         int rc;
2003         __u16 refcheck;
2004         ENTRY;
2005
2006         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2007                ll_get_fsname(inode->i_sb, NULL, 0),
2008                PFID(&ll_i2info(inode)->lli_fid));
2009
2010         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2011         if (IS_ERR(och))
2012                 GOTO(out, rc = PTR_ERR(och));
2013
2014         /* Grab latest data_version and [am]time values */
2015         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2016         if (rc != 0)
2017                 GOTO(out, rc);
2018
2019         env = cl_env_get(&refcheck);
2020         if (IS_ERR(env))
2021                 GOTO(out, rc = PTR_ERR(env));
2022
2023         ll_merge_attr(env, inode);
2024         cl_env_put(env, &refcheck);
2025
2026         /* Release the file.
2027          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2028          * we still need it to pack l_remote_handle to MDT. */
2029         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2030                                        &data_version);
2031         och = NULL;
2032
2033         EXIT;
2034 out:
2035         if (och != NULL && !IS_ERR(och)) /* close the file */
2036                 ll_lease_close(och, inode, NULL);
2037
2038         return rc;
2039 }
2040
2041 struct ll_swap_stack {
2042         __u64                    dv1;
2043         __u64                    dv2;
2044         struct inode            *inode1;
2045         struct inode            *inode2;
2046         bool                     check_dv1;
2047         bool                     check_dv2;
2048 };
2049
2050 static int ll_swap_layouts(struct file *file1, struct file *file2,
2051                            struct lustre_swap_layouts *lsl)
2052 {
2053         struct mdc_swap_layouts  msl;
2054         struct md_op_data       *op_data;
2055         __u32                    gid;
2056         __u64                    dv;
2057         struct ll_swap_stack    *llss = NULL;
2058         int                      rc;
2059
2060         OBD_ALLOC_PTR(llss);
2061         if (llss == NULL)
2062                 RETURN(-ENOMEM);
2063
2064         llss->inode1 = file_inode(file1);
2065         llss->inode2 = file_inode(file2);
2066
2067         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2068         if (rc < 0)
2069                 GOTO(free, rc);
2070
2071         /* we use 2 bool because it is easier to swap than 2 bits */
2072         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2073                 llss->check_dv1 = true;
2074
2075         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2076                 llss->check_dv2 = true;
2077
2078         /* we cannot use lsl->sl_dvX directly because we may swap them */
2079         llss->dv1 = lsl->sl_dv1;
2080         llss->dv2 = lsl->sl_dv2;
2081
2082         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2083         if (rc == 0) /* same file, done! */
2084                 GOTO(free, rc);
2085
2086         if (rc < 0) { /* sequentialize it */
2087                 swap(llss->inode1, llss->inode2);
2088                 swap(file1, file2);
2089                 swap(llss->dv1, llss->dv2);
2090                 swap(llss->check_dv1, llss->check_dv2);
2091         }
2092
2093         gid = lsl->sl_gid;
2094         if (gid != 0) { /* application asks to flush dirty cache */
2095                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2096                 if (rc < 0)
2097                         GOTO(free, rc);
2098
2099                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2100                 if (rc < 0) {
2101                         ll_put_grouplock(llss->inode1, file1, gid);
2102                         GOTO(free, rc);
2103                 }
2104         }
2105
2106         /* ultimate check, before swaping the layouts we check if
2107          * dataversion has changed (if requested) */
2108         if (llss->check_dv1) {
2109                 rc = ll_data_version(llss->inode1, &dv, 0);
2110                 if (rc)
2111                         GOTO(putgl, rc);
2112                 if (dv != llss->dv1)
2113                         GOTO(putgl, rc = -EAGAIN);
2114         }
2115
2116         if (llss->check_dv2) {
2117                 rc = ll_data_version(llss->inode2, &dv, 0);
2118                 if (rc)
2119                         GOTO(putgl, rc);
2120                 if (dv != llss->dv2)
2121                         GOTO(putgl, rc = -EAGAIN);
2122         }
2123
2124         /* struct md_op_data is used to send the swap args to the mdt
2125          * only flags is missing, so we use struct mdc_swap_layouts
2126          * through the md_op_data->op_data */
2127         /* flags from user space have to be converted before they are send to
2128          * server, no flag is sent today, they are only used on the client */
2129         msl.msl_flags = 0;
2130         rc = -ENOMEM;
2131         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2132                                      0, LUSTRE_OPC_ANY, &msl);
2133         if (IS_ERR(op_data))
2134                 GOTO(free, rc = PTR_ERR(op_data));
2135
2136         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2137                            sizeof(*op_data), op_data, NULL);
2138         ll_finish_md_op_data(op_data);
2139
2140         if (rc < 0)
2141                 GOTO(putgl, rc);
2142
2143 putgl:
2144         if (gid != 0) {
2145                 ll_put_grouplock(llss->inode2, file2, gid);
2146                 ll_put_grouplock(llss->inode1, file1, gid);
2147         }
2148
2149 free:
2150         if (llss != NULL)
2151                 OBD_FREE_PTR(llss);
2152
2153         RETURN(rc);
2154 }
2155
2156 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2157 {
2158         struct md_op_data       *op_data;
2159         int                      rc;
2160         ENTRY;
2161
2162         /* Detect out-of range masks */
2163         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2164                 RETURN(-EINVAL);
2165
2166         /* Non-root users are forbidden to set or clear flags which are
2167          * NOT defined in HSM_USER_MASK. */
2168         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2169             !cfs_capable(CFS_CAP_SYS_ADMIN))
2170                 RETURN(-EPERM);
2171
2172         /* Detect out-of range archive id */
2173         if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2174             (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2175                 RETURN(-EINVAL);
2176
2177         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2178                                      LUSTRE_OPC_ANY, hss);
2179         if (IS_ERR(op_data))
2180                 RETURN(PTR_ERR(op_data));
2181
2182         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2183                            sizeof(*op_data), op_data, NULL);
2184
2185         ll_finish_md_op_data(op_data);
2186
2187         RETURN(rc);
2188 }
2189
2190 static int ll_hsm_import(struct inode *inode, struct file *file,
2191                          struct hsm_user_import *hui)
2192 {
2193         struct hsm_state_set    *hss = NULL;
2194         struct iattr            *attr = NULL;
2195         int                      rc;
2196         ENTRY;
2197
2198         if (!S_ISREG(inode->i_mode))
2199                 RETURN(-EINVAL);
2200
2201         /* set HSM flags */
2202         OBD_ALLOC_PTR(hss);
2203         if (hss == NULL)
2204                 GOTO(out, rc = -ENOMEM);
2205
2206         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2207         hss->hss_archive_id = hui->hui_archive_id;
2208         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2209         rc = ll_hsm_state_set(inode, hss);
2210         if (rc != 0)
2211                 GOTO(out, rc);
2212
2213         OBD_ALLOC_PTR(attr);
2214         if (attr == NULL)
2215                 GOTO(out, rc = -ENOMEM);
2216
2217         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2218         attr->ia_mode |= S_IFREG;
2219         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2220         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2221         attr->ia_size = hui->hui_size;
2222         attr->ia_mtime.tv_sec = hui->hui_mtime;
2223         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2224         attr->ia_atime.tv_sec = hui->hui_atime;
2225         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2226
2227         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2228                          ATTR_UID | ATTR_GID |
2229                          ATTR_MTIME | ATTR_MTIME_SET |
2230                          ATTR_ATIME | ATTR_ATIME_SET;
2231
2232         inode_lock(inode);
2233
2234         rc = ll_setattr_raw(file_dentry(file), attr, true);
2235         if (rc == -ENODATA)
2236                 rc = 0;
2237
2238         inode_unlock(inode);
2239
2240 out:
2241         if (hss != NULL)
2242                 OBD_FREE_PTR(hss);
2243
2244         if (attr != NULL)
2245                 OBD_FREE_PTR(attr);
2246
2247         RETURN(rc);
2248 }
2249
2250 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2251 {
2252         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2253                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2254 }
2255
2256 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2257 {
2258         struct inode *inode = file_inode(file);
2259         struct iattr ia = {
2260                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2261                             ATTR_MTIME | ATTR_MTIME_SET |
2262                             ATTR_CTIME | ATTR_CTIME_SET,
2263                 .ia_atime = {
2264                         .tv_sec = lfu->lfu_atime_sec,
2265                         .tv_nsec = lfu->lfu_atime_nsec,
2266                 },
2267                 .ia_mtime = {
2268                         .tv_sec = lfu->lfu_mtime_sec,
2269                         .tv_nsec = lfu->lfu_mtime_nsec,
2270                 },
2271                 .ia_ctime = {
2272                         .tv_sec = lfu->lfu_ctime_sec,
2273                         .tv_nsec = lfu->lfu_ctime_nsec,
2274                 },
2275         };
2276         int rc;
2277         ENTRY;
2278
2279         if (!capable(CAP_SYS_ADMIN))
2280                 RETURN(-EPERM);
2281
2282         if (!S_ISREG(inode->i_mode))
2283                 RETURN(-EINVAL);
2284
2285         inode_lock(inode);
2286         rc = ll_setattr_raw(file_dentry(file), &ia, false);
2287         inode_unlock(inode);
2288
2289         RETURN(rc);
2290 }
2291
2292 /*
2293  * Give file access advices
2294  *
2295  * The ladvise interface is similar to Linux fadvise() system call, except it
2296  * forwards the advices directly from Lustre client to server. The server side
2297  * codes will apply appropriate read-ahead and caching techniques for the
2298  * corresponding files.
2299  *
2300  * A typical workload for ladvise is e.g. a bunch of different clients are
2301  * doing small random reads of a file, so prefetching pages into OSS cache
2302  * with big linear reads before the random IO is a net benefit. Fetching
2303  * all that data into each client cache with fadvise() may not be, due to
2304  * much more data being sent to the client.
2305  */
2306 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2307                       struct llapi_lu_ladvise *ladvise)
2308 {
2309         struct lu_env *env;
2310         struct cl_io *io;
2311         struct cl_ladvise_io *lio;
2312         int rc;
2313         __u16 refcheck;
2314         ENTRY;
2315
2316         env = cl_env_get(&refcheck);
2317         if (IS_ERR(env))
2318                 RETURN(PTR_ERR(env));
2319
2320         io = vvp_env_thread_io(env);
2321         io->ci_obj = ll_i2info(inode)->lli_clob;
2322
2323         /* initialize parameters for ladvise */
2324         lio = &io->u.ci_ladvise;
2325         lio->li_start = ladvise->lla_start;
2326         lio->li_end = ladvise->lla_end;
2327         lio->li_fid = ll_inode2fid(inode);
2328         lio->li_advice = ladvise->lla_advice;
2329         lio->li_flags = flags;
2330
2331         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2332                 rc = cl_io_loop(env, io);
2333         else
2334                 rc = io->ci_result;
2335
2336         cl_io_fini(env, io);
2337         cl_env_put(env, &refcheck);
2338         RETURN(rc);
2339 }
2340
2341 static long
2342 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2343 {
2344         struct inode            *inode = file_inode(file);
2345         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
2346         int                      flags, rc;
2347         ENTRY;
2348
2349         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2350                PFID(ll_inode2fid(inode)), inode, cmd);
2351         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2352
2353         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2354         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2355                 RETURN(-ENOTTY);
2356
2357         switch(cmd) {
2358         case LL_IOC_GETFLAGS:
2359                 /* Get the current value of the file flags */
2360                 return put_user(fd->fd_flags, (int __user *)arg);
2361         case LL_IOC_SETFLAGS:
2362         case LL_IOC_CLRFLAGS:
2363                 /* Set or clear specific file flags */
2364                 /* XXX This probably needs checks to ensure the flags are
2365                  *     not abused, and to handle any flag side effects.
2366                  */
2367                 if (get_user(flags, (int __user *) arg))
2368                         RETURN(-EFAULT);
2369
2370                 if (cmd == LL_IOC_SETFLAGS) {
2371                         if ((flags & LL_FILE_IGNORE_LOCK) &&
2372                             !(file->f_flags & O_DIRECT)) {
2373                                 CERROR("%s: unable to disable locking on "
2374                                        "non-O_DIRECT file\n", current->comm);
2375                                 RETURN(-EINVAL);
2376                         }
2377
2378                         fd->fd_flags |= flags;
2379                 } else {
2380                         fd->fd_flags &= ~flags;
2381                 }
2382                 RETURN(0);
2383         case LL_IOC_LOV_SETSTRIPE:
2384                 RETURN(ll_lov_setstripe(inode, file, arg));
2385         case LL_IOC_LOV_SETEA:
2386                 RETURN(ll_lov_setea(inode, file, arg));
2387         case LL_IOC_LOV_SWAP_LAYOUTS: {
2388                 struct file *file2;
2389                 struct lustre_swap_layouts lsl;
2390
2391                 if (copy_from_user(&lsl, (char __user *)arg,
2392                                        sizeof(struct lustre_swap_layouts)))
2393                         RETURN(-EFAULT);
2394
2395                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2396                         RETURN(-EPERM);
2397
2398                 file2 = fget(lsl.sl_fd);
2399                 if (file2 == NULL)
2400                         RETURN(-EBADF);
2401
2402                 /* O_WRONLY or O_RDWR */
2403                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2404                         GOTO(out, rc = -EPERM);
2405
2406                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2407                         struct inode                    *inode2;
2408                         struct ll_inode_info            *lli;
2409                         struct obd_client_handle        *och = NULL;
2410
2411                         if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2412                                 GOTO(out, rc = -EINVAL);
2413
2414                         lli = ll_i2info(inode);
2415                         mutex_lock(&lli->lli_och_mutex);
2416                         if (fd->fd_lease_och != NULL) {
2417                                 och = fd->fd_lease_och;
2418                                 fd->fd_lease_och = NULL;
2419                         }
2420                         mutex_unlock(&lli->lli_och_mutex);
2421                         if (och == NULL)
2422                                 GOTO(out, rc = -ENOLCK);
2423                         inode2 = file_inode(file2);
2424                         rc = ll_swap_layouts_close(och, inode, inode2);
2425                 } else {
2426                         rc = ll_swap_layouts(file, file2, &lsl);
2427                 }
2428 out:
2429                 fput(file2);
2430                 RETURN(rc);
2431         }
2432         case LL_IOC_LOV_GETSTRIPE:
2433                 RETURN(ll_file_getstripe(inode,
2434                                          (struct lov_user_md __user *)arg));
2435         case FSFILT_IOC_GETFLAGS:
2436         case FSFILT_IOC_SETFLAGS:
2437                 RETURN(ll_iocontrol(inode, file, cmd, arg));
2438         case FSFILT_IOC_GETVERSION_OLD:
2439         case FSFILT_IOC_GETVERSION:
2440                 RETURN(put_user(inode->i_generation, (int __user *)arg));
2441         case LL_IOC_GROUP_LOCK:
2442                 RETURN(ll_get_grouplock(inode, file, arg));
2443         case LL_IOC_GROUP_UNLOCK:
2444                 RETURN(ll_put_grouplock(inode, file, arg));
2445         case IOC_OBD_STATFS:
2446                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2447
2448         /* We need to special case any other ioctls we want to handle,
2449          * to send them to the MDS/OST as appropriate and to properly
2450          * network encode the arg field.
2451         case FSFILT_IOC_SETVERSION_OLD:
2452         case FSFILT_IOC_SETVERSION:
2453         */
2454         case LL_IOC_FLUSHCTX:
2455                 RETURN(ll_flush_ctx(inode));
2456         case LL_IOC_PATH2FID: {
2457                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2458                                  sizeof(struct lu_fid)))
2459                         RETURN(-EFAULT);
2460
2461                 RETURN(0);
2462         }
2463         case LL_IOC_GETPARENT:
2464                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2465
2466         case OBD_IOC_FID2PATH:
2467                 RETURN(ll_fid2path(inode, (void __user *)arg));
2468         case LL_IOC_DATA_VERSION: {
2469                 struct ioc_data_version idv;
2470                 int rc;
2471
2472                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2473                         RETURN(-EFAULT);
2474
2475                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2476                 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2477
2478                 if (rc == 0 &&
2479                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2480                         RETURN(-EFAULT);
2481
2482                 RETURN(rc);
2483         }
2484
2485         case LL_IOC_GET_MDTIDX: {
2486                 int mdtidx;
2487
2488                 mdtidx = ll_get_mdt_idx(inode);
2489                 if (mdtidx < 0)
2490                         RETURN(mdtidx);
2491
2492                 if (put_user((int)mdtidx, (int __user *)arg))
2493                         RETURN(-EFAULT);
2494
2495                 RETURN(0);
2496         }
2497         case OBD_IOC_GETDTNAME:
2498         case OBD_IOC_GETMDNAME:
2499                 RETURN(ll_get_obd_name(inode, cmd, arg));
2500         case LL_IOC_HSM_STATE_GET: {
2501                 struct md_op_data       *op_data;
2502                 struct hsm_user_state   *hus;
2503                 int                      rc;
2504
2505                 OBD_ALLOC_PTR(hus);
2506                 if (hus == NULL)
2507                         RETURN(-ENOMEM);
2508
2509                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2510                                              LUSTRE_OPC_ANY, hus);
2511                 if (IS_ERR(op_data)) {
2512                         OBD_FREE_PTR(hus);
2513                         RETURN(PTR_ERR(op_data));
2514                 }
2515
2516                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2517                                    op_data, NULL);
2518
2519                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2520                         rc = -EFAULT;
2521
2522                 ll_finish_md_op_data(op_data);
2523                 OBD_FREE_PTR(hus);
2524                 RETURN(rc);
2525         }
2526         case LL_IOC_HSM_STATE_SET: {
2527                 struct hsm_state_set    *hss;
2528                 int                      rc;
2529
2530                 OBD_ALLOC_PTR(hss);
2531                 if (hss == NULL)
2532                         RETURN(-ENOMEM);
2533
2534                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2535                         OBD_FREE_PTR(hss);
2536                         RETURN(-EFAULT);
2537                 }
2538
2539                 rc = ll_hsm_state_set(inode, hss);
2540
2541                 OBD_FREE_PTR(hss);
2542                 RETURN(rc);
2543         }
2544         case LL_IOC_HSM_ACTION: {
2545                 struct md_op_data               *op_data;
2546                 struct hsm_current_action       *hca;
2547                 int                              rc;
2548
2549                 OBD_ALLOC_PTR(hca);
2550                 if (hca == NULL)
2551                         RETURN(-ENOMEM);
2552
2553                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2554                                              LUSTRE_OPC_ANY, hca);
2555                 if (IS_ERR(op_data)) {
2556                         OBD_FREE_PTR(hca);
2557                         RETURN(PTR_ERR(op_data));
2558                 }
2559
2560                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2561                                    op_data, NULL);
2562
2563                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2564                         rc = -EFAULT;
2565
2566                 ll_finish_md_op_data(op_data);
2567                 OBD_FREE_PTR(hca);
2568                 RETURN(rc);
2569         }
2570         case LL_IOC_SET_LEASE: {
2571                 struct ll_inode_info *lli = ll_i2info(inode);
2572                 struct obd_client_handle *och = NULL;
2573                 bool lease_broken;
2574                 fmode_t fmode;
2575
2576                 switch (arg) {
2577                 case LL_LEASE_WRLCK:
2578                         if (!(file->f_mode & FMODE_WRITE))
2579                                 RETURN(-EPERM);
2580                         fmode = FMODE_WRITE;
2581                         break;
2582                 case LL_LEASE_RDLCK:
2583                         if (!(file->f_mode & FMODE_READ))
2584                                 RETURN(-EPERM);
2585                         fmode = FMODE_READ;
2586                         break;
2587                 case LL_LEASE_UNLCK:
2588                         mutex_lock(&lli->lli_och_mutex);
2589                         if (fd->fd_lease_och != NULL) {
2590                                 och = fd->fd_lease_och;
2591                                 fd->fd_lease_och = NULL;
2592                         }
2593                         mutex_unlock(&lli->lli_och_mutex);
2594
2595                         if (och == NULL)
2596                                 RETURN(-ENOLCK);
2597
2598                         fmode = och->och_flags;
2599                         rc = ll_lease_close(och, inode, &lease_broken);
2600                         if (rc < 0)
2601                                 RETURN(rc);
2602
2603                         rc = ll_lease_och_release(inode, file);
2604                         if (rc < 0)
2605                                 RETURN(rc);
2606
2607                         if (lease_broken)
2608                                 fmode = 0;
2609
2610                         RETURN(ll_lease_type_from_fmode(fmode));
2611                 default:
2612                         RETURN(-EINVAL);
2613                 }
2614
2615                 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2616
2617                 /* apply for lease */
2618                 och = ll_lease_open(inode, file, fmode, 0);
2619                 if (IS_ERR(och))
2620                         RETURN(PTR_ERR(och));
2621
2622                 rc = 0;
2623                 mutex_lock(&lli->lli_och_mutex);
2624                 if (fd->fd_lease_och == NULL) {
2625                         fd->fd_lease_och = och;
2626                         och = NULL;
2627                 }
2628                 mutex_unlock(&lli->lli_och_mutex);
2629                 if (och != NULL) {
2630                         /* impossible now that only excl is supported for now */
2631                         ll_lease_close(och, inode, &lease_broken);
2632                         rc = -EBUSY;
2633                 }
2634                 RETURN(rc);
2635         }
2636         case LL_IOC_GET_LEASE: {
2637                 struct ll_inode_info *lli = ll_i2info(inode);
2638                 struct ldlm_lock *lock = NULL;
2639                 fmode_t fmode = 0;
2640
2641                 mutex_lock(&lli->lli_och_mutex);
2642                 if (fd->fd_lease_och != NULL) {
2643                         struct obd_client_handle *och = fd->fd_lease_och;
2644
2645                         lock = ldlm_handle2lock(&och->och_lease_handle);
2646                         if (lock != NULL) {
2647                                 lock_res_and_lock(lock);
2648                                 if (!ldlm_is_cancel(lock))
2649                                         fmode = och->och_flags;
2650
2651                                 unlock_res_and_lock(lock);
2652                                 LDLM_LOCK_PUT(lock);
2653                         }
2654                 }
2655                 mutex_unlock(&lli->lli_och_mutex);
2656
2657                 RETURN(ll_lease_type_from_fmode(fmode));
2658         }
2659         case LL_IOC_HSM_IMPORT: {
2660                 struct hsm_user_import *hui;
2661
2662                 OBD_ALLOC_PTR(hui);
2663                 if (hui == NULL)
2664                         RETURN(-ENOMEM);
2665
2666                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2667                         OBD_FREE_PTR(hui);
2668                         RETURN(-EFAULT);
2669                 }
2670
2671                 rc = ll_hsm_import(inode, file, hui);
2672
2673                 OBD_FREE_PTR(hui);
2674                 RETURN(rc);
2675         }
2676         case LL_IOC_FUTIMES_3: {
2677                 struct ll_futimes_3 lfu;
2678
2679                 if (copy_from_user(&lfu,
2680                                    (const struct ll_futimes_3 __user *)arg,
2681                                    sizeof(lfu)))
2682                         RETURN(-EFAULT);
2683
2684                 RETURN(ll_file_futimes_3(file, &lfu));
2685         }
2686         case LL_IOC_LADVISE: {
2687                 struct llapi_ladvise_hdr *ladvise_hdr;
2688                 int i;
2689                 int num_advise;
2690                 int alloc_size = sizeof(*ladvise_hdr);
2691
2692                 rc = 0;
2693                 OBD_ALLOC_PTR(ladvise_hdr);
2694                 if (ladvise_hdr == NULL)
2695                         RETURN(-ENOMEM);
2696
2697                 if (copy_from_user(ladvise_hdr,
2698                                    (const struct llapi_ladvise_hdr __user *)arg,
2699                                    alloc_size))
2700                         GOTO(out_ladvise, rc = -EFAULT);
2701
2702                 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2703                     ladvise_hdr->lah_count < 1)
2704                         GOTO(out_ladvise, rc = -EINVAL);
2705
2706                 num_advise = ladvise_hdr->lah_count;
2707                 if (num_advise >= LAH_COUNT_MAX)
2708                         GOTO(out_ladvise, rc = -EFBIG);
2709
2710                 OBD_FREE_PTR(ladvise_hdr);
2711                 alloc_size = offsetof(typeof(*ladvise_hdr),
2712                                       lah_advise[num_advise]);
2713                 OBD_ALLOC(ladvise_hdr, alloc_size);
2714                 if (ladvise_hdr == NULL)
2715                         RETURN(-ENOMEM);
2716
2717                 /*
2718                  * TODO: submit multiple advices to one server in a single RPC
2719                  */
2720                 if (copy_from_user(ladvise_hdr,
2721                                    (const struct llapi_ladvise_hdr __user *)arg,
2722                                    alloc_size))
2723                         GOTO(out_ladvise, rc = -EFAULT);
2724
2725                 for (i = 0; i < num_advise; i++) {
2726                         rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2727                                         &ladvise_hdr->lah_advise[i]);
2728                         if (rc)
2729                                 break;
2730                 }
2731
2732 out_ladvise:
2733                 OBD_FREE(ladvise_hdr, alloc_size);
2734                 RETURN(rc);
2735         }
2736         default: {
2737                 int err;
2738
2739                 if (LLIOC_STOP ==
2740                      ll_iocontrol_call(inode, file, cmd, arg, &err))
2741                         RETURN(err);
2742
2743                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2744                                      (void __user *)arg));
2745         }
2746         }
2747 }
2748
2749 #ifndef HAVE_FILE_LLSEEK_SIZE
2750 static inline loff_t
2751 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2752 {
2753         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2754                 return -EINVAL;
2755         if (offset > maxsize)
2756                 return -EINVAL;
2757
2758         if (offset != file->f_pos) {
2759                 file->f_pos = offset;
2760                 file->f_version = 0;
2761         }
2762         return offset;
2763 }
2764
2765 static loff_t
2766 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2767                 loff_t maxsize, loff_t eof)
2768 {
2769         struct inode *inode = file_inode(file);
2770
2771         switch (origin) {
2772         case SEEK_END:
2773                 offset += eof;
2774                 break;
2775         case SEEK_CUR:
2776                 /*
2777                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
2778                  * position-querying operation.  Avoid rewriting the "same"
2779                  * f_pos value back to the file because a concurrent read(),
2780                  * write() or lseek() might have altered it
2781                  */
2782                 if (offset == 0)
2783                         return file->f_pos;
2784                 /*
2785                  * f_lock protects against read/modify/write race with other
2786                  * SEEK_CURs. Note that parallel writes and reads behave
2787                  * like SEEK_SET.
2788                  */
2789                 inode_lock(inode);
2790                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2791                 inode_unlock(inode);
2792                 return offset;
2793         case SEEK_DATA:
2794                 /*
2795                  * In the generic case the entire file is data, so as long as
2796                  * offset isn't at the end of the file then the offset is data.
2797                  */
2798                 if (offset >= eof)
2799                         return -ENXIO;
2800                 break;
2801         case SEEK_HOLE:
2802                 /*
2803                  * There is a virtual hole at the end of the file, so as long as
2804                  * offset isn't i_size or larger, return i_size.
2805                  */
2806                 if (offset >= eof)
2807                         return -ENXIO;
2808                 offset = eof;
2809                 break;
2810         }
2811
2812         return llseek_execute(file, offset, maxsize);
2813 }
2814 #endif
2815
2816 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2817 {
2818         struct inode *inode = file_inode(file);
2819         loff_t retval, eof = 0;
2820
2821         ENTRY;
2822         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2823                            (origin == SEEK_CUR) ? file->f_pos : 0);
2824         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2825                PFID(ll_inode2fid(inode)), inode, retval, retval,
2826                origin);
2827         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2828
2829         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2830                 retval = ll_glimpse_size(inode);
2831                 if (retval != 0)
2832                         RETURN(retval);
2833                 eof = i_size_read(inode);
2834         }
2835
2836         retval = ll_generic_file_llseek_size(file, offset, origin,
2837                                           ll_file_maxbytes(inode), eof);
2838         RETURN(retval);
2839 }
2840
2841 static int ll_flush(struct file *file, fl_owner_t id)
2842 {
2843         struct inode *inode = file_inode(file);
2844         struct ll_inode_info *lli = ll_i2info(inode);
2845         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2846         int rc, err;
2847
2848         LASSERT(!S_ISDIR(inode->i_mode));
2849
2850         /* catch async errors that were recorded back when async writeback
2851          * failed for pages in this mapping. */
2852         rc = lli->lli_async_rc;
2853         lli->lli_async_rc = 0;
2854         if (lli->lli_clob != NULL) {
2855                 err = lov_read_and_clear_async_rc(lli->lli_clob);
2856                 if (rc == 0)
2857                         rc = err;
2858         }
2859
2860         /* The application has been told write failure already.
2861          * Do not report failure again. */
2862         if (fd->fd_write_failed)
2863                 return 0;
2864         return rc ? -EIO : 0;
2865 }
2866
2867 /**
2868  * Called to make sure a portion of file has been written out.
2869  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2870  *
2871  * Return how many pages have been written.
2872  */
2873 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2874                        enum cl_fsync_mode mode, int ignore_layout)
2875 {
2876         struct lu_env *env;
2877         struct cl_io *io;
2878         struct cl_fsync_io *fio;
2879         int result;
2880         __u16 refcheck;
2881         ENTRY;
2882
2883         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2884             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2885                 RETURN(-EINVAL);
2886
2887         env = cl_env_get(&refcheck);
2888         if (IS_ERR(env))
2889                 RETURN(PTR_ERR(env));
2890
2891         io = vvp_env_thread_io(env);
2892         io->ci_obj = ll_i2info(inode)->lli_clob;
2893         io->ci_ignore_layout = ignore_layout;
2894
2895         /* initialize parameters for sync */
2896         fio = &io->u.ci_fsync;
2897         fio->fi_start = start;
2898         fio->fi_end = end;
2899         fio->fi_fid = ll_inode2fid(inode);
2900         fio->fi_mode = mode;
2901         fio->fi_nr_written = 0;
2902
2903         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2904                 result = cl_io_loop(env, io);
2905         else
2906                 result = io->ci_result;
2907         if (result == 0)
2908                 result = fio->fi_nr_written;
2909         cl_io_fini(env, io);
2910         cl_env_put(env, &refcheck);
2911
2912         RETURN(result);
2913 }
2914
2915 /*
2916  * When dentry is provided (the 'else' case), file_dentry() may be
2917  * null and dentry must be used directly rather than pulled from
2918  * file_dentry() as is done otherwise.
2919  */
2920
2921 #ifdef HAVE_FILE_FSYNC_4ARGS
2922 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2923 {
2924         struct dentry *dentry = file_dentry(file);
2925 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2926 int ll_fsync(struct file *file, int datasync)
2927 {
2928         struct dentry *dentry = file_dentry(file);
2929         loff_t start = 0;
2930         loff_t end = LLONG_MAX;
2931 #else
2932 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2933 {
2934         loff_t start = 0;
2935         loff_t end = LLONG_MAX;
2936 #endif
2937         struct inode *inode = dentry->d_inode;
2938         struct ll_inode_info *lli = ll_i2info(inode);
2939         struct ptlrpc_request *req;
2940         int rc, err;
2941         ENTRY;
2942
2943         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2944                PFID(ll_inode2fid(inode)), inode);
2945         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2946
2947 #ifdef HAVE_FILE_FSYNC_4ARGS
2948         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2949         inode_lock(inode);
2950 #else
2951         /* fsync's caller has already called _fdata{sync,write}, we want
2952          * that IO to finish before calling the osc and mdc sync methods */
2953         rc = filemap_fdatawait(inode->i_mapping);
2954 #endif
2955
2956         /* catch async errors that were recorded back when async writeback
2957          * failed for pages in this mapping. */
2958         if (!S_ISDIR(inode->i_mode)) {
2959                 err = lli->lli_async_rc;
2960                 lli->lli_async_rc = 0;
2961                 if (rc == 0)
2962                         rc = err;
2963                 if (lli->lli_clob != NULL) {
2964                         err = lov_read_and_clear_async_rc(lli->lli_clob);
2965                         if (rc == 0)
2966                                 rc = err;
2967                 }
2968         }
2969
2970         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2971         if (!rc)
2972                 rc = err;
2973         if (!err)
2974                 ptlrpc_req_finished(req);
2975
2976         if (S_ISREG(inode->i_mode)) {
2977                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2978
2979                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2980                 if (rc == 0 && err < 0)
2981                         rc = err;
2982                 if (rc < 0)
2983                         fd->fd_write_failed = true;
2984                 else
2985                         fd->fd_write_failed = false;
2986         }
2987
2988 #ifdef HAVE_FILE_FSYNC_4ARGS
2989         inode_unlock(inode);
2990 #endif
2991         RETURN(rc);
2992 }
2993
2994 static int
2995 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2996 {
2997         struct inode *inode = file_inode(file);
2998         struct ll_sb_info *sbi = ll_i2sbi(inode);
2999         struct ldlm_enqueue_info einfo = {
3000                 .ei_type        = LDLM_FLOCK,
3001                 .ei_cb_cp       = ldlm_flock_completion_ast,
3002                 .ei_cbdata      = file_lock,
3003         };
3004         struct md_op_data *op_data;
3005         struct lustre_handle lockh = { 0 };
3006         union ldlm_policy_data flock = { { 0 } };
3007         int fl_type = file_lock->fl_type;
3008         __u64 flags = 0;
3009         int rc;
3010         int rc2 = 0;
3011         ENTRY;
3012
3013         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3014                PFID(ll_inode2fid(inode)), file_lock);
3015
3016         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3017
3018         if (file_lock->fl_flags & FL_FLOCK) {
3019                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3020                 /* flocks are whole-file locks */
3021                 flock.l_flock.end = OFFSET_MAX;
3022                 /* For flocks owner is determined by the local file desctiptor*/
3023                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3024         } else if (file_lock->fl_flags & FL_POSIX) {
3025                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3026                 flock.l_flock.start = file_lock->fl_start;
3027                 flock.l_flock.end = file_lock->fl_end;
3028         } else {
3029                 RETURN(-EINVAL);
3030         }
3031         flock.l_flock.pid = file_lock->fl_pid;
3032
3033         /* Somewhat ugly workaround for svc lockd.
3034          * lockd installs custom fl_lmops->lm_compare_owner that checks
3035          * for the fl_owner to be the same (which it always is on local node
3036          * I guess between lockd processes) and then compares pid.
3037          * As such we assign pid to the owner field to make it all work,
3038          * conflict with normal locks is unlikely since pid space and
3039          * pointer space for current->files are not intersecting */
3040         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3041                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3042
3043         switch (fl_type) {
3044         case F_RDLCK:
3045                 einfo.ei_mode = LCK_PR;
3046                 break;
3047         case F_UNLCK:
3048                 /* An unlock request may or may not have any relation to
3049                  * existing locks so we may not be able to pass a lock handle
3050                  * via a normal ldlm_lock_cancel() request. The request may even
3051                  * unlock a byte range in the middle of an existing lock. In
3052                  * order to process an unlock request we need all of the same
3053                  * information that is given with a normal read or write record
3054                  * lock request. To avoid creating another ldlm unlock (cancel)
3055                  * message we'll treat a LCK_NL flock request as an unlock. */
3056                 einfo.ei_mode = LCK_NL;
3057                 break;
3058         case F_WRLCK:
3059                 einfo.ei_mode = LCK_PW;
3060                 break;
3061         default:
3062                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3063                 RETURN (-ENOTSUPP);
3064         }
3065
3066         switch (cmd) {
3067         case F_SETLKW:
3068 #ifdef F_SETLKW64
3069         case F_SETLKW64:
3070 #endif
3071                 flags = 0;
3072                 break;
3073         case F_SETLK:
3074 #ifdef F_SETLK64
3075         case F_SETLK64:
3076 #endif
3077                 flags = LDLM_FL_BLOCK_NOWAIT;
3078                 break;
3079         case F_GETLK:
3080 #ifdef F_GETLK64
3081         case F_GETLK64:
3082 #endif
3083                 flags = LDLM_FL_TEST_LOCK;
3084                 break;
3085         default:
3086                 CERROR("unknown fcntl lock command: %d\n", cmd);
3087                 RETURN (-EINVAL);
3088         }
3089
3090         /* Save the old mode so that if the mode in the lock changes we
3091          * can decrement the appropriate reader or writer refcount. */
3092         file_lock->fl_type = einfo.ei_mode;
3093
3094         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3095                                      LUSTRE_OPC_ANY, NULL);
3096         if (IS_ERR(op_data))
3097                 RETURN(PTR_ERR(op_data));
3098
3099         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3100                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3101                flock.l_flock.pid, flags, einfo.ei_mode,
3102                flock.l_flock.start, flock.l_flock.end);
3103
3104         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3105                         flags);
3106
3107         /* Restore the file lock type if not TEST lock. */
3108         if (!(flags & LDLM_FL_TEST_LOCK))
3109                 file_lock->fl_type = fl_type;
3110
3111 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3112         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3113             !(flags & LDLM_FL_TEST_LOCK))
3114                 rc2  = locks_lock_file_wait(file, file_lock);
3115 #else
3116         if ((file_lock->fl_flags & FL_FLOCK) &&
3117             (rc == 0 || file_lock->fl_type == F_UNLCK))
3118                 rc2  = flock_lock_file_wait(file, file_lock);
3119         if ((file_lock->fl_flags & FL_POSIX) &&
3120             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3121             !(flags & LDLM_FL_TEST_LOCK))
3122                 rc2  = posix_lock_file_wait(file, file_lock);
3123 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3124
3125         if (rc2 && file_lock->fl_type != F_UNLCK) {
3126                 einfo.ei_mode = LCK_NL;
3127                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3128                            &lockh, flags);
3129                 rc = rc2;
3130         }
3131
3132         ll_finish_md_op_data(op_data);
3133
3134         RETURN(rc);
3135 }
3136
3137 int ll_get_fid_by_name(struct inode *parent, const char *name,
3138                        int namelen, struct lu_fid *fid,
3139                        struct inode **inode)
3140 {
3141         struct md_op_data       *op_data = NULL;
3142         struct mdt_body         *body;
3143         struct ptlrpc_request   *req;
3144         int                     rc;
3145         ENTRY;
3146
3147         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3148                                      LUSTRE_OPC_ANY, NULL);
3149         if (IS_ERR(op_data))
3150                 RETURN(PTR_ERR(op_data));
3151
3152         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3153         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3154         ll_finish_md_op_data(op_data);
3155         if (rc < 0)
3156                 RETURN(rc);
3157
3158         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3159         if (body == NULL)
3160                 GOTO(out_req, rc = -EFAULT);
3161         if (fid != NULL)
3162                 *fid = body->mbo_fid1;
3163
3164         if (inode != NULL)
3165                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3166 out_req:
3167         ptlrpc_req_finished(req);
3168         RETURN(rc);
3169 }
3170
3171 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3172                const char *name, int namelen)
3173 {
3174         struct dentry         *dchild = NULL;
3175         struct inode          *child_inode = NULL;
3176         struct md_op_data     *op_data;
3177         struct ptlrpc_request *request = NULL;
3178         struct obd_client_handle *och = NULL;
3179         struct qstr           qstr;
3180         struct mdt_body         *body;
3181         int                    rc;
3182         __u64                   data_version = 0;
3183         ENTRY;
3184
3185         CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3186                name, PFID(ll_inode2fid(parent)), mdtidx);
3187
3188         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3189                                      0, LUSTRE_OPC_ANY, NULL);
3190         if (IS_ERR(op_data))
3191                 RETURN(PTR_ERR(op_data));
3192
3193         /* Get child FID first */
3194         qstr.hash = full_name_hash(name, namelen);
3195         qstr.name = name;
3196         qstr.len = namelen;
3197         dchild = d_lookup(file_dentry(file), &qstr);
3198         if (dchild != NULL) {
3199                 if (dchild->d_inode != NULL)
3200                         child_inode = igrab(dchild->d_inode);
3201                 dput(dchild);
3202         }
3203
3204         if (child_inode == NULL) {
3205                 rc = ll_get_fid_by_name(parent, name, namelen,
3206                                         &op_data->op_fid3, &child_inode);
3207                 if (rc != 0)
3208                         GOTO(out_free, rc);
3209         }
3210
3211         if (child_inode == NULL)
3212                 GOTO(out_free, rc = -EINVAL);
3213
3214         /*
3215          * lfs migrate command needs to be blocked on the client
3216          * by checking the migrate FID against the FID of the
3217          * filesystem root.
3218          */
3219         if (child_inode == parent->i_sb->s_root->d_inode)
3220                 GOTO(out_iput, rc = -EINVAL);
3221
3222         inode_lock(child_inode);
3223         op_data->op_fid3 = *ll_inode2fid(child_inode);
3224         if (!fid_is_sane(&op_data->op_fid3)) {
3225                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3226                        ll_get_fsname(parent->i_sb, NULL, 0), name,
3227                        PFID(&op_data->op_fid3));
3228                 GOTO(out_unlock, rc = -EINVAL);
3229         }
3230
3231         rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3232         if (rc < 0)
3233                 GOTO(out_unlock, rc);
3234
3235         if (rc == mdtidx) {
3236                 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3237                        PFID(&op_data->op_fid3), mdtidx);
3238                 GOTO(out_unlock, rc = 0);
3239         }
3240 again:
3241         if (S_ISREG(child_inode->i_mode)) {
3242                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3243                 if (IS_ERR(och)) {
3244                         rc = PTR_ERR(och);
3245                         och = NULL;
3246                         GOTO(out_unlock, rc);
3247                 }
3248
3249                 rc = ll_data_version(child_inode, &data_version,
3250                                      LL_DV_WR_FLUSH);
3251                 if (rc != 0)
3252                         GOTO(out_close, rc);
3253
3254                 op_data->op_handle = och->och_fh;
3255                 op_data->op_data = och->och_mod;
3256                 op_data->op_data_version = data_version;
3257                 op_data->op_lease_handle = och->och_lease_handle;
3258                 op_data->op_bias |= MDS_RENAME_MIGRATE;
3259         }
3260
3261         op_data->op_mds = mdtidx;
3262         op_data->op_cli_flags = CLI_MIGRATE;
3263         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3264                        namelen, name, namelen, &request);
3265         if (rc == 0) {
3266                 LASSERT(request != NULL);
3267                 ll_update_times(request, parent);
3268
3269                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3270                 LASSERT(body != NULL);
3271
3272                 /* If the server does release layout lock, then we cleanup
3273                  * the client och here, otherwise release it in out_close: */
3274                 if (och != NULL &&
3275                     body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3276                         obd_mod_put(och->och_mod);
3277                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3278                                                   och);
3279                         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3280                         OBD_FREE_PTR(och);
3281                         och = NULL;
3282                 }
3283         }
3284
3285         if (request != NULL) {
3286                 ptlrpc_req_finished(request);
3287                 request = NULL;
3288         }
3289
3290         /* Try again if the file layout has changed. */
3291         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3292                 goto again;
3293
3294 out_close:
3295         if (och != NULL) /* close the file */
3296                 ll_lease_close(och, child_inode, NULL);
3297         if (rc == 0)
3298                 clear_nlink(child_inode);
3299 out_unlock:
3300         inode_unlock(child_inode);
3301 out_iput:
3302         iput(child_inode);
3303 out_free:
3304         ll_finish_md_op_data(op_data);
3305         RETURN(rc);
3306 }
3307
3308 static int
3309 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3310 {
3311         ENTRY;
3312
3313         RETURN(-ENOSYS);
3314 }
3315
3316 /**
3317  * test if some locks matching bits and l_req_mode are acquired
3318  * - bits can be in different locks
3319  * - if found clear the common lock bits in *bits
3320  * - the bits not found, are kept in *bits
3321  * \param inode [IN]
3322  * \param bits [IN] searched lock bits [IN]
3323  * \param l_req_mode [IN] searched lock mode
3324  * \retval boolean, true iff all bits are found
3325  */
3326 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3327 {
3328         struct lustre_handle lockh;
3329         union ldlm_policy_data policy;
3330         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3331                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3332         struct lu_fid *fid;
3333         __u64 flags;
3334         int i;
3335         ENTRY;
3336
3337         if (!inode)
3338                RETURN(0);
3339
3340         fid = &ll_i2info(inode)->lli_fid;
3341         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3342                ldlm_lockname[mode]);
3343
3344         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3345         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3346                 policy.l_inodebits.bits = *bits & (1 << i);
3347                 if (policy.l_inodebits.bits == 0)
3348                         continue;
3349
3350                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3351                                   &policy, mode, &lockh)) {
3352                         struct ldlm_lock *lock;
3353
3354                         lock = ldlm_handle2lock(&lockh);
3355                         if (lock) {
3356                                 *bits &=
3357                                       ~(lock->l_policy_data.l_inodebits.bits);
3358                                 LDLM_LOCK_PUT(lock);
3359                         } else {
3360                                 *bits &= ~policy.l_inodebits.bits;
3361                         }
3362                 }
3363         }
3364         RETURN(*bits == 0);
3365 }
3366
3367 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3368                                struct lustre_handle *lockh, __u64 flags,
3369                                enum ldlm_mode mode)
3370 {
3371         union ldlm_policy_data policy = { .l_inodebits = { bits } };
3372         struct lu_fid *fid;
3373         enum ldlm_mode rc;
3374         ENTRY;
3375
3376         fid = &ll_i2info(inode)->lli_fid;
3377         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3378
3379         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3380                            fid, LDLM_IBITS, &policy, mode, lockh);
3381
3382         RETURN(rc);
3383 }
3384
3385 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3386 {
3387         /* Already unlinked. Just update nlink and return success */
3388         if (rc == -ENOENT) {
3389                 clear_nlink(inode);
3390                 /* If it is striped directory, and there is bad stripe
3391                  * Let's revalidate the dentry again, instead of returning
3392                  * error */
3393                 if (S_ISDIR(inode->i_mode) &&
3394                     ll_i2info(inode)->lli_lsm_md != NULL)
3395                         return 0;
3396
3397                 /* This path cannot be hit for regular files unless in
3398                  * case of obscure races, so no need to to validate
3399                  * size. */
3400                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3401                         return 0;
3402         } else if (rc != 0) {
3403                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3404                              "%s: revalidate FID "DFID" error: rc = %d\n",
3405                              ll_get_fsname(inode->i_sb, NULL, 0),
3406                              PFID(ll_inode2fid(inode)), rc);
3407         }
3408
3409         return rc;
3410 }
3411
3412 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3413 {
3414         struct inode *inode = dentry->d_inode;
3415         struct ptlrpc_request *req = NULL;
3416         struct obd_export *exp;
3417         int rc = 0;
3418         ENTRY;
3419
3420         LASSERT(inode != NULL);
3421
3422         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3423                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3424
3425         exp = ll_i2mdexp(inode);
3426
3427         /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3428          *      But under CMD case, it caused some lock issues, should be fixed
3429          *      with new CMD ibits lock. See bug 12718 */
3430         if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3431                 struct lookup_intent oit = { .it_op = IT_GETATTR };
3432                 struct md_op_data *op_data;
3433
3434                 if (ibits == MDS_INODELOCK_LOOKUP)
3435                         oit.it_op = IT_LOOKUP;
3436
3437                 /* Call getattr by fid, so do not provide name at all. */
3438                 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3439                                              dentry->d_inode, NULL, 0, 0,
3440                                              LUSTRE_OPC_ANY, NULL);
3441                 if (IS_ERR(op_data))
3442                         RETURN(PTR_ERR(op_data));
3443
3444                 rc = md_intent_lock(exp, op_data, &oit, &req,
3445                                     &ll_md_blocking_ast, 0);
3446                 ll_finish_md_op_data(op_data);
3447                 if (rc < 0) {
3448                         rc = ll_inode_revalidate_fini(inode, rc);
3449                         GOTO (out, rc);
3450                 }
3451
3452                 rc = ll_revalidate_it_finish(req, &oit, dentry);
3453                 if (rc != 0) {
3454                         ll_intent_release(&oit);
3455                         GOTO(out, rc);
3456                 }
3457
3458                 /* Unlinked? Unhash dentry, so it is not picked up later by
3459                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3460                    here to preserve get_cwd functionality on 2.6.
3461                    Bug 10503 */
3462                 if (!dentry->d_inode->i_nlink) {
3463                         ll_lock_dcache(inode);
3464                         d_lustre_invalidate(dentry, 0);
3465                         ll_unlock_dcache(inode);
3466                 }
3467
3468                 ll_lookup_finish_locks(&oit, dentry);
3469         } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3470                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3471                 u64 valid = OBD_MD_FLGETATTR;
3472                 struct md_op_data *op_data;
3473                 int ealen = 0;
3474
3475                 if (S_ISREG(inode->i_mode)) {
3476                         rc = ll_get_default_mdsize(sbi, &ealen);
3477                         if (rc)
3478                                 RETURN(rc);
3479                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3480                 }
3481
3482                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3483                                              0, ealen, LUSTRE_OPC_ANY,
3484                                              NULL);
3485                 if (IS_ERR(op_data))
3486                         RETURN(PTR_ERR(op_data));
3487
3488                 op_data->op_valid = valid;
3489                 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3490                 ll_finish_md_op_data(op_data);
3491                 if (rc) {
3492                         rc = ll_inode_revalidate_fini(inode, rc);
3493                         RETURN(rc);
3494                 }
3495
3496                 rc = ll_prep_inode(&inode, req, NULL, NULL);
3497         }
3498 out:
3499         ptlrpc_req_finished(req);
3500         return rc;
3501 }
3502
3503 static int ll_merge_md_attr(struct inode *inode)
3504 {
3505         struct cl_attr attr = { 0 };
3506         int rc;
3507
3508         LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3509         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3510                            &attr, ll_md_blocking_ast);
3511         if (rc != 0)
3512                 RETURN(rc);
3513
3514         set_nlink(inode, attr.cat_nlink);
3515         inode->i_blocks = attr.cat_blocks;
3516         i_size_write(inode, attr.cat_size);
3517
3518         ll_i2info(inode)->lli_atime = attr.cat_atime;
3519         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3520         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
3521
3522         RETURN(0);
3523 }
3524
3525 static int
3526 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3527 {
3528         struct inode    *inode = dentry->d_inode;
3529         int              rc;
3530         ENTRY;
3531
3532         rc = __ll_inode_revalidate(dentry, ibits);
3533         if (rc != 0)
3534                 RETURN(rc);
3535
3536         /* if object isn't regular file, don't validate size */
3537         if (!S_ISREG(inode->i_mode)) {
3538                 if (S_ISDIR(inode->i_mode) &&
3539                     ll_i2info(inode)->lli_lsm_md != NULL) {
3540                         rc = ll_merge_md_attr(inode);
3541                         if (rc != 0)
3542                                 RETURN(rc);
3543                 }
3544
3545                 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3546                 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3547                 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3548         } else {
3549                 /* In case of restore, the MDT has the right size and has
3550                  * already send it back without granting the layout lock,
3551                  * inode is up-to-date so glimpse is useless.
3552                  * Also to glimpse we need the layout, in case of a running
3553                  * restore the MDT holds the layout lock so the glimpse will
3554                  * block up to the end of restore (getattr will block)
3555                  */
3556                 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3557                         rc = ll_glimpse_size(inode);
3558         }
3559         RETURN(rc);
3560 }
3561
3562 static inline dev_t ll_compat_encode_dev(dev_t dev)
3563 {
3564         /* The compat_sys_*stat*() syscalls will fail unless the
3565          * device majors and minors are both less than 256. Note that
3566          * the value returned here will be passed through
3567          * old_encode_dev() in cp_compat_stat(). And so we are not
3568          * trying to return a valid compat (u16) device number, just
3569          * one that will pass the old_valid_dev() check. */
3570
3571         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
3572 }
3573
3574 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3575 {
3576         struct inode *inode = de->d_inode;
3577         struct ll_sb_info *sbi = ll_i2sbi(inode);
3578         struct ll_inode_info *lli = ll_i2info(inode);
3579         int res = 0;
3580
3581         res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3582                                       MDS_INODELOCK_LOOKUP);
3583         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3584
3585         if (res)
3586                 return res;
3587
3588         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3589
3590         if (ll_need_32bit_api(sbi)) {
3591                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3592                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
3593                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
3594         } else {
3595                 stat->ino = inode->i_ino;
3596                 stat->dev = inode->i_sb->s_dev;
3597                 stat->rdev = inode->i_rdev;
3598         }
3599
3600         stat->mode = inode->i_mode;
3601         stat->uid = inode->i_uid;
3602         stat->gid = inode->i_gid;
3603         stat->atime = inode->i_atime;
3604         stat->mtime = inode->i_mtime;
3605         stat->ctime = inode->i_ctime;
3606         stat->blksize = 1 << inode->i_blkbits;
3607
3608         stat->nlink = inode->i_nlink;
3609         stat->size = i_size_read(inode);
3610         stat->blocks = inode->i_blocks;
3611
3612         return 0;
3613 }
3614
3615 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3616                      __u64 start, __u64 len)
3617 {
3618         int             rc;
3619         size_t          num_bytes;
3620         struct fiemap   *fiemap;
3621         unsigned int    extent_count = fieinfo->fi_extents_max;
3622
3623         num_bytes = sizeof(*fiemap) + (extent_count *
3624                                        sizeof(struct fiemap_extent));
3625         OBD_ALLOC_LARGE(fiemap, num_bytes);
3626
3627         if (fiemap == NULL)
3628                 RETURN(-ENOMEM);
3629
3630         fiemap->fm_flags = fieinfo->fi_flags;
3631         fiemap->fm_extent_count = fieinfo->fi_extents_max;
3632         fiemap->fm_start = start;
3633         fiemap->fm_length = len;
3634         if (extent_count > 0 &&
3635             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3636                            sizeof(struct fiemap_extent)) != 0)
3637                 GOTO(out, rc = -EFAULT);
3638
3639         rc = ll_do_fiemap(inode, fiemap, num_bytes);
3640
3641         fieinfo->fi_flags = fiemap->fm_flags;
3642         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3643         if (extent_count > 0 &&
3644             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3645                          fiemap->fm_mapped_extents *
3646                          sizeof(struct fiemap_extent)) != 0)
3647                 GOTO(out, rc = -EFAULT);
3648 out:
3649         OBD_FREE_LARGE(fiemap, num_bytes);
3650         return rc;
3651 }
3652
3653 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3654 {
3655         struct ll_inode_info *lli = ll_i2info(inode);
3656         struct posix_acl *acl = NULL;
3657         ENTRY;
3658
3659         spin_lock(&lli->lli_lock);
3660         /* VFS' acl_permission_check->check_acl will release the refcount */
3661         acl = posix_acl_dup(lli->lli_posix_acl);
3662         spin_unlock(&lli->lli_lock);
3663
3664         RETURN(acl);
3665 }
3666
3667 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3668 static int
3669 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3670 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3671 # else
3672 ll_check_acl(struct inode *inode, int mask)
3673 # endif
3674 {
3675 # ifdef CONFIG_FS_POSIX_ACL
3676         struct posix_acl *acl;
3677         int rc;
3678         ENTRY;
3679
3680 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
3681         if (flags & IPERM_FLAG_RCU)
3682                 return -ECHILD;
3683 #  endif
3684         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3685
3686         if (!acl)
3687                 RETURN(-EAGAIN);
3688
3689         rc = posix_acl_permission(inode, acl, mask);
3690         posix_acl_release(acl);
3691
3692         RETURN(rc);
3693 # else /* !CONFIG_FS_POSIX_ACL */
3694         return -EAGAIN;
3695 # endif /* CONFIG_FS_POSIX_ACL */
3696 }
3697 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
3698
3699 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3700 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3701 #else
3702 # ifdef HAVE_INODE_PERMISION_2ARGS
3703 int ll_inode_permission(struct inode *inode, int mask)
3704 # else
3705 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3706 # endif
3707 #endif
3708 {
3709         int rc = 0;
3710         struct ll_sb_info *sbi;
3711         struct root_squash_info *squash;
3712         struct cred *cred = NULL;
3713         const struct cred *old_cred = NULL;
3714         cfs_cap_t cap;
3715         bool squash_id = false;
3716         ENTRY;
3717
3718 #ifdef MAY_NOT_BLOCK
3719         if (mask & MAY_NOT_BLOCK)
3720                 return -ECHILD;
3721 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3722         if (flags & IPERM_FLAG_RCU)
3723                 return -ECHILD;
3724 #endif
3725
3726        /* as root inode are NOT getting validated in lookup operation,
3727         * need to do it before permission check. */
3728
3729         if (inode == inode->i_sb->s_root->d_inode) {
3730                 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3731                                            MDS_INODELOCK_LOOKUP);
3732                 if (rc)
3733                         RETURN(rc);
3734         }
3735
3736         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3737                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3738
3739         /* squash fsuid/fsgid if needed */
3740         sbi = ll_i2sbi(inode);
3741         squash = &sbi->ll_squash;
3742         if (unlikely(squash->rsi_uid != 0 &&
3743                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3744                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3745                         squash_id = true;
3746         }
3747         if (squash_id) {
3748                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3749                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3750                        squash->rsi_uid, squash->rsi_gid);
3751
3752                 /* update current process's credentials
3753                  * and FS capability */
3754                 cred = prepare_creds();
3755                 if (cred == NULL)
3756                         RETURN(-ENOMEM);
3757
3758                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3759                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
3760                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3761                         if ((1 << cap) & CFS_CAP_FS_MASK)
3762                                 cap_lower(cred->cap_effective, cap);
3763                 }
3764                 old_cred = override_creds(cred);
3765         }
3766
3767         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3768         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3769         /* restore current process's credentials and FS capability */
3770         if (squash_id) {
3771                 revert_creds(old_cred);
3772                 put_cred(cred);
3773         }
3774
3775         RETURN(rc);
3776 }
3777
3778 /* -o localflock - only provides locally consistent flock locks */
3779 struct file_operations ll_file_operations = {
3780 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3781 # ifdef HAVE_SYNC_READ_WRITE
3782         .read           = new_sync_read,
3783         .write          = new_sync_write,
3784 # endif
3785         .read_iter      = ll_file_read_iter,
3786         .write_iter     = ll_file_write_iter,
3787 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3788         .read           = ll_file_read,
3789         .aio_read       = ll_file_aio_read,
3790         .write          = ll_file_write,
3791         .aio_write      = ll_file_aio_write,
3792 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3793         .unlocked_ioctl = ll_file_ioctl,
3794         .open           = ll_file_open,
3795         .release        = ll_file_release,
3796         .mmap           = ll_file_mmap,
3797         .llseek         = ll_file_seek,
3798         .splice_read    = ll_file_splice_read,
3799         .fsync          = ll_fsync,
3800         .flush          = ll_flush
3801 };
3802
3803 struct file_operations ll_file_operations_flock = {
3804 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3805 # ifdef HAVE_SYNC_READ_WRITE
3806         .read           = new_sync_read,
3807         .write          = new_sync_write,
3808 # endif /* HAVE_SYNC_READ_WRITE */
3809         .read_iter      = ll_file_read_iter,
3810         .write_iter     = ll_file_write_iter,
3811 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3812         .read           = ll_file_read,
3813         .aio_read       = ll_file_aio_read,
3814         .write          = ll_file_write,
3815         .aio_write      = ll_file_aio_write,
3816 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3817         .unlocked_ioctl = ll_file_ioctl,
3818         .open           = ll_file_open,
3819         .release        = ll_file_release,
3820         .mmap           = ll_file_mmap,
3821         .llseek         = ll_file_seek,
3822         .splice_read    = ll_file_splice_read,
3823         .fsync          = ll_fsync,
3824         .flush          = ll_flush,
3825         .flock          = ll_file_flock,
3826         .lock           = ll_file_flock
3827 };
3828
3829 /* These are for -o noflock - to return ENOSYS on flock calls */
3830 struct file_operations ll_file_operations_noflock = {
3831 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3832 # ifdef HAVE_SYNC_READ_WRITE
3833         .read           = new_sync_read,
3834         .write          = new_sync_write,
3835 # endif /* HAVE_SYNC_READ_WRITE */
3836         .read_iter      = ll_file_read_iter,
3837         .write_iter     = ll_file_write_iter,
3838 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3839         .read           = ll_file_read,
3840         .aio_read       = ll_file_aio_read,
3841         .write          = ll_file_write,
3842         .aio_write      = ll_file_aio_write,
3843 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3844         .unlocked_ioctl = ll_file_ioctl,
3845         .open           = ll_file_open,
3846         .release        = ll_file_release,
3847         .mmap           = ll_file_mmap,
3848         .llseek         = ll_file_seek,
3849         .splice_read    = ll_file_splice_read,
3850         .fsync          = ll_fsync,
3851         .flush          = ll_flush,
3852         .flock          = ll_file_noflock,
3853         .lock           = ll_file_noflock
3854 };
3855
3856 struct inode_operations ll_file_inode_operations = {
3857         .setattr        = ll_setattr,
3858         .getattr        = ll_getattr,
3859         .permission     = ll_inode_permission,
3860         .setxattr       = ll_setxattr,
3861         .getxattr       = ll_getxattr,
3862         .listxattr      = ll_listxattr,
3863         .removexattr    = ll_removexattr,
3864         .fiemap         = ll_fiemap,
3865 #ifdef HAVE_IOP_GET_ACL
3866         .get_acl        = ll_get_acl,
3867 #endif
3868 };
3869
3870 /* dynamic ioctl number support routins */
3871 static struct llioc_ctl_data {
3872         struct rw_semaphore     ioc_sem;
3873         struct list_head        ioc_head;
3874 } llioc = {
3875         __RWSEM_INITIALIZER(llioc.ioc_sem),
3876         LIST_HEAD_INIT(llioc.ioc_head)
3877 };
3878
3879
3880 struct llioc_data {
3881         struct list_head        iocd_list;
3882         unsigned int            iocd_size;
3883         llioc_callback_t        iocd_cb;
3884         unsigned int            iocd_count;
3885         unsigned int            iocd_cmd[0];
3886 };
3887
3888 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3889 {
3890         unsigned int size;
3891         struct llioc_data *in_data = NULL;
3892         ENTRY;
3893
3894         if (cb == NULL || cmd == NULL ||
3895             count > LLIOC_MAX_CMD || count < 0)
3896                 RETURN(NULL);
3897
3898         size = sizeof(*in_data) + count * sizeof(unsigned int);
3899         OBD_ALLOC(in_data, size);
3900         if (in_data == NULL)
3901                 RETURN(NULL);
3902
3903         memset(in_data, 0, sizeof(*in_data));
3904         in_data->iocd_size = size;
3905         in_data->iocd_cb = cb;
3906         in_data->iocd_count = count;
3907         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3908
3909         down_write(&llioc.ioc_sem);
3910         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3911         up_write(&llioc.ioc_sem);
3912
3913         RETURN(in_data);
3914 }
3915
3916 void ll_iocontrol_unregister(void *magic)
3917 {
3918         struct llioc_data *tmp;
3919
3920         if (magic == NULL)
3921                 return;
3922
3923         down_write(&llioc.ioc_sem);
3924         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3925                 if (tmp == magic) {
3926                         unsigned int size = tmp->iocd_size;
3927
3928                         list_del(&tmp->iocd_list);
3929                         up_write(&llioc.ioc_sem);
3930
3931                         OBD_FREE(tmp, size);
3932                         return;
3933                 }
3934         }
3935         up_write(&llioc.ioc_sem);
3936
3937         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3938 }
3939
3940 EXPORT_SYMBOL(ll_iocontrol_register);
3941 EXPORT_SYMBOL(ll_iocontrol_unregister);
3942
3943 static enum llioc_iter
3944 ll_iocontrol_call(struct inode *inode, struct file *file,
3945                   unsigned int cmd, unsigned long arg, int *rcp)
3946 {
3947         enum llioc_iter ret = LLIOC_CONT;
3948         struct llioc_data *data;
3949         int rc = -EINVAL, i;
3950
3951         down_read(&llioc.ioc_sem);
3952         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3953                 for (i = 0; i < data->iocd_count; i++) {
3954                         if (cmd != data->iocd_cmd[i])
3955                                 continue;
3956
3957                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3958                         break;
3959                 }
3960
3961                 if (ret == LLIOC_STOP)
3962                         break;
3963         }
3964         up_read(&llioc.ioc_sem);
3965
3966         if (rcp)
3967                 *rcp = rc;
3968         return ret;
3969 }
3970
3971 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3972 {
3973         struct ll_inode_info *lli = ll_i2info(inode);
3974         struct cl_object *obj = lli->lli_clob;
3975         struct lu_env *env;
3976         int rc;
3977         __u16 refcheck;
3978         ENTRY;
3979
3980         if (obj == NULL)
3981                 RETURN(0);
3982
3983         env = cl_env_get(&refcheck);
3984         if (IS_ERR(env))
3985                 RETURN(PTR_ERR(env));
3986
3987         rc = cl_conf_set(env, lli->lli_clob, conf);
3988         if (rc < 0)
3989                 GOTO(out, rc);
3990
3991         if (conf->coc_opc == OBJECT_CONF_SET) {
3992                 struct ldlm_lock *lock = conf->coc_lock;
3993                 struct cl_layout cl = {
3994                         .cl_layout_gen = 0,
3995                 };
3996
3997                 LASSERT(lock != NULL);
3998                 LASSERT(ldlm_has_layout(lock));
3999
4000                 /* it can only be allowed to match after layout is
4001                  * applied to inode otherwise false layout would be
4002                  * seen. Applying layout shoud happen before dropping
4003                  * the intent lock. */
4004                 ldlm_lock_allow_match(lock);
4005
4006                 rc = cl_object_layout_get(env, obj, &cl);
4007                 if (rc < 0)
4008                         GOTO(out, rc);
4009
4010                 CDEBUG(D_VFSTRACE,
4011                        DFID": layout version change: %u -> %u\n",
4012                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4013                        cl.cl_layout_gen);
4014                 ll_layout_version_set(lli, cl.cl_layout_gen);
4015         }
4016
4017 out:
4018         cl_env_put(env, &refcheck);
4019
4020         RETURN(rc);
4021 }
4022
4023 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4024 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4025
4026 {
4027         struct ll_sb_info *sbi = ll_i2sbi(inode);
4028         struct ptlrpc_request *req;
4029         struct mdt_body *body;
4030         void *lvbdata;
4031         void *lmm;
4032         int lmmsize;
4033         int rc;
4034         ENTRY;
4035
4036         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4037                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4038                lock->l_lvb_data, lock->l_lvb_len);
4039
4040         if (lock->l_lvb_data != NULL)
4041                 RETURN(0);
4042
4043         /* if layout lock was granted right away, the layout is returned
4044          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4045          * blocked and then granted via completion ast, we have to fetch
4046          * layout here. Please note that we can't use the LVB buffer in
4047          * completion AST because it doesn't have a large enough buffer */
4048         rc = ll_get_default_mdsize(sbi, &lmmsize);
4049         if (rc == 0)
4050                 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4051                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4052                                 lmmsize, 0, &req);
4053         if (rc < 0)
4054                 RETURN(rc);
4055
4056         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4057         if (body == NULL)
4058                 GOTO(out, rc = -EPROTO);
4059
4060         lmmsize = body->mbo_eadatasize;
4061         if (lmmsize == 0) /* empty layout */
4062                 GOTO(out, rc = 0);
4063
4064         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4065         if (lmm == NULL)
4066                 GOTO(out, rc = -EFAULT);
4067
4068         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4069         if (lvbdata == NULL)
4070                 GOTO(out, rc = -ENOMEM);
4071
4072         memcpy(lvbdata, lmm, lmmsize);
4073         lock_res_and_lock(lock);
4074         if (unlikely(lock->l_lvb_data == NULL)) {
4075                 lock->l_lvb_type = LVB_T_LAYOUT;
4076                 lock->l_lvb_data = lvbdata;
4077                 lock->l_lvb_len = lmmsize;
4078                 lvbdata = NULL;
4079         }
4080         unlock_res_and_lock(lock);
4081
4082         if (lvbdata)
4083                 OBD_FREE_LARGE(lvbdata, lmmsize);
4084
4085         EXIT;
4086
4087 out:
4088         ptlrpc_req_finished(req);
4089         return rc;
4090 }
4091
4092 /**
4093  * Apply the layout to the inode. Layout lock is held and will be released
4094  * in this function.
4095  */
4096 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4097                               struct inode *inode)
4098 {
4099         struct ll_inode_info *lli = ll_i2info(inode);
4100         struct ll_sb_info    *sbi = ll_i2sbi(inode);
4101         struct ldlm_lock *lock;
4102         struct cl_object_conf conf;
4103         int rc = 0;
4104         bool lvb_ready;
4105         bool wait_layout = false;
4106         ENTRY;
4107
4108         LASSERT(lustre_handle_is_used(lockh));
4109
4110         lock = ldlm_handle2lock(lockh);
4111         LASSERT(lock != NULL);
4112         LASSERT(ldlm_has_layout(lock));
4113
4114         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4115                    PFID(&lli->lli_fid), inode);
4116
4117         /* in case this is a caching lock and reinstate with new inode */
4118         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4119
4120         lock_res_and_lock(lock);
4121         lvb_ready = ldlm_is_lvb_ready(lock);
4122         unlock_res_and_lock(lock);
4123         /* checking lvb_ready is racy but this is okay. The worst case is
4124          * that multi processes may configure the file on the same time. */
4125
4126         if (lvb_ready)
4127                 GOTO(out, rc = 0);
4128
4129         rc = ll_layout_fetch(inode, lock);
4130         if (rc < 0)
4131                 GOTO(out, rc);
4132
4133         /* for layout lock, lmm is stored in lock's lvb.
4134          * lvb_data is immutable if the lock is held so it's safe to access it
4135          * without res lock.
4136          *
4137          * set layout to file. Unlikely this will fail as old layout was
4138          * surely eliminated */
4139         memset(&conf, 0, sizeof conf);
4140         conf.coc_opc = OBJECT_CONF_SET;
4141         conf.coc_inode = inode;
4142         conf.coc_lock = lock;
4143         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4144         conf.u.coc_layout.lb_len = lock->l_lvb_len;
4145         rc = ll_layout_conf(inode, &conf);
4146
4147         /* refresh layout failed, need to wait */
4148         wait_layout = rc == -EBUSY;
4149         EXIT;
4150
4151 out:
4152         LDLM_LOCK_PUT(lock);
4153         ldlm_lock_decref(lockh, mode);
4154
4155         /* wait for IO to complete if it's still being used. */
4156         if (wait_layout) {
4157                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4158                        ll_get_fsname(inode->i_sb, NULL, 0),
4159                        PFID(&lli->lli_fid), inode);
4160
4161                 memset(&conf, 0, sizeof conf);
4162                 conf.coc_opc = OBJECT_CONF_WAIT;
4163                 conf.coc_inode = inode;
4164                 rc = ll_layout_conf(inode, &conf);
4165                 if (rc == 0)
4166                         rc = -EAGAIN;
4167
4168                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4169                        ll_get_fsname(inode->i_sb, NULL, 0),
4170                        PFID(&lli->lli_fid), rc);
4171         }
4172         RETURN(rc);
4173 }
4174
4175 static int ll_layout_refresh_locked(struct inode *inode)
4176 {
4177         struct ll_inode_info  *lli = ll_i2info(inode);
4178         struct ll_sb_info     *sbi = ll_i2sbi(inode);
4179         struct md_op_data     *op_data;
4180         struct lookup_intent    it;
4181         struct lustre_handle    lockh;
4182         enum ldlm_mode          mode;
4183         struct ptlrpc_request *req;
4184         int rc;
4185         ENTRY;
4186
4187 again:
4188         /* mostly layout lock is caching on the local side, so try to match
4189          * it before grabbing layout lock mutex. */
4190         mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4191                                LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4192         if (mode != 0) { /* hit cached lock */
4193                 rc = ll_layout_lock_set(&lockh, mode, inode);
4194                 if (rc == -EAGAIN)
4195                         goto again;
4196
4197                 RETURN(rc);
4198         }
4199
4200         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4201                                      0, 0, LUSTRE_OPC_ANY, NULL);
4202         if (IS_ERR(op_data))
4203                 RETURN(PTR_ERR(op_data));
4204
4205         /* have to enqueue one */
4206         memset(&it, 0, sizeof(it));
4207         it.it_op = IT_LAYOUT;
4208
4209         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4210                           ll_get_fsname(inode->i_sb, NULL, 0),
4211                           PFID(&lli->lli_fid), inode);
4212
4213         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4214                             &ll_md_blocking_ast, 0);
4215         if (it.it_request != NULL)
4216                 ptlrpc_req_finished(it.it_request);
4217         it.it_request = NULL;
4218
4219         ll_finish_md_op_data(op_data);
4220
4221         mode = it.it_lock_mode;
4222         it.it_lock_mode = 0;
4223         ll_intent_drop_lock(&it);
4224
4225         if (rc == 0) {
4226                 /* set lock data in case this is a new lock */
4227                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4228                 lockh.cookie = it.it_lock_handle;
4229                 rc = ll_layout_lock_set(&lockh, mode, inode);
4230                 if (rc == -EAGAIN)
4231                         goto again;
4232         }
4233
4234         RETURN(rc);
4235 }
4236
4237 /**
4238  * This function checks if there exists a LAYOUT lock on the client side,
4239  * or enqueues it if it doesn't have one in cache.
4240  *
4241  * This function will not hold layout lock so it may be revoked any time after
4242  * this function returns. Any operations depend on layout should be redone
4243  * in that case.
4244  *
4245  * This function should be called before lov_io_init() to get an uptodate
4246  * layout version, the caller should save the version number and after IO
4247  * is finished, this function should be called again to verify that layout
4248  * is not changed during IO time.
4249  */
4250 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4251 {
4252         struct ll_inode_info    *lli = ll_i2info(inode);
4253         struct ll_sb_info       *sbi = ll_i2sbi(inode);
4254         int rc;
4255         ENTRY;
4256
4257         *gen = ll_layout_version_get(lli);
4258         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4259                 RETURN(0);
4260
4261         /* sanity checks */
4262         LASSERT(fid_is_sane(ll_inode2fid(inode)));
4263         LASSERT(S_ISREG(inode->i_mode));
4264
4265         /* take layout lock mutex to enqueue layout lock exclusively. */
4266         mutex_lock(&lli->lli_layout_mutex);
4267
4268         rc = ll_layout_refresh_locked(inode);
4269         if (rc < 0)
4270                 GOTO(out, rc);
4271
4272         *gen = ll_layout_version_get(lli);
4273 out:
4274         mutex_unlock(&lli->lli_layout_mutex);
4275
4276         RETURN(rc);
4277 }
4278
4279 /**
4280  *  This function send a restore request to the MDT
4281  */
4282 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4283 {
4284         struct hsm_user_request *hur;
4285         int                      len, rc;
4286         ENTRY;
4287
4288         len = sizeof(struct hsm_user_request) +
4289               sizeof(struct hsm_user_item);
4290         OBD_ALLOC(hur, len);
4291         if (hur == NULL)
4292                 RETURN(-ENOMEM);
4293
4294         hur->hur_request.hr_action = HUA_RESTORE;
4295         hur->hur_request.hr_archive_id = 0;
4296         hur->hur_request.hr_flags = 0;
4297         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4298                sizeof(hur->hur_user_item[0].hui_fid));
4299         hur->hur_user_item[0].hui_extent.offset = offset;
4300         hur->hur_user_item[0].hui_extent.length = length;
4301         hur->hur_request.hr_itemcount = 1;
4302         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
4303                            len, hur, NULL);
4304         OBD_FREE(hur, len);
4305         RETURN(rc);
4306 }