lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2016, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 static int
  57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  58
  59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  60                           bool *lease_broken);
  61
  62 static struct ll_file_data *ll_file_data_get(void)
  63 {
  64         struct ll_file_data *fd;
  65
  66         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  67         if (fd == NULL)
  68                 return NULL;
  69
  70         fd->fd_write_failed = false;
  71
  72         return fd;
  73 }
  74
  75 static void ll_file_data_put(struct ll_file_data *fd)
  76 {
  77         if (fd != NULL)
  78                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  79 }
  80
  81 /**
  82  * Packs all the attributes into @op_data for the CLOSE rpc.
  83  */
  84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  85                              struct obd_client_handle *och)
  86 {
  87         ENTRY;
  88
  89         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  90                            0, 0, LUSTRE_OPC_ANY, NULL);
  91
  92         op_data->op_attr.ia_mode = inode->i_mode;
  93         op_data->op_attr.ia_atime = inode->i_atime;
  94         op_data->op_attr.ia_mtime = inode->i_mtime;
  95         op_data->op_attr.ia_ctime = inode->i_ctime;
  96         op_data->op_attr.ia_size = i_size_read(inode);
  97         op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
  98                                      ATTR_MTIME | ATTR_MTIME_SET |
  99                                      ATTR_CTIME | ATTR_CTIME_SET;
 100         op_data->op_attr_blocks = inode->i_blocks;
 101         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 102         op_data->op_handle = och->och_fh;
 103
 104         if (och->och_flags & FMODE_WRITE &&
 105             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 106                 /* For HSM: if inode data has been modified, pack it so that
 107                  * MDT can set data dirty flag in the archive. */
 108                 op_data->op_bias |= MDS_DATA_MODIFIED;
 109
 110         EXIT;
 111 }
 112
 113 /**
 114  * Perform a close, possibly with a bias.
 115  * The meaning of "data" depends on the value of "bias".
 116  *
 117  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 118  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 119  * swap layouts with.
 120  */
 121 static int ll_close_inode_openhandle(struct inode *inode,
 122                                      struct obd_client_handle *och,
 123                                      enum mds_op_bias bias, void *data)
 124 {
 125         struct obd_export *md_exp = ll_i2mdexp(inode);
 126         const struct ll_inode_info *lli = ll_i2info(inode);
 127         struct md_op_data *op_data;
 128         struct ptlrpc_request *req = NULL;
 129         int rc;
 130         ENTRY;
 131
 132         if (class_exp2obd(md_exp) == NULL) {
 133                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 134                        ll_get_fsname(inode->i_sb, NULL, 0),
 135                        PFID(&lli->lli_fid));
 136                 GOTO(out, rc = 0);
 137         }
 138
 139         OBD_ALLOC_PTR(op_data);
 140         /* We leak openhandle and request here on error, but not much to be
 141          * done in OOM case since app won't retry close on error either. */
 142         if (op_data == NULL)
 143                 GOTO(out, rc = -ENOMEM);
 144
 145         ll_prepare_close(inode, op_data, och);
 146         switch (bias) {
 147         case MDS_CLOSE_LAYOUT_MERGE:
 148                 /* merge blocks from the victim inode */
 149                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 150                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 151         case MDS_CLOSE_LAYOUT_SWAP:
 152                 LASSERT(data != NULL);
 153                 op_data->op_bias |= bias;
 154                 op_data->op_data_version = 0;
 155                 op_data->op_lease_handle = och->och_lease_handle;
 156                 op_data->op_fid2 = *ll_inode2fid(data);
 157                 break;
 158
 159         case MDS_CLOSE_RESYNC_DONE: {
 160                 struct ll_ioc_lease *ioc = data;
 161
 162                 LASSERT(data != NULL);
 163                 op_data->op_attr_blocks +=
 164                         ioc->lil_count * op_data->op_attr_blocks;
 165                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 166                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 167
 168                 op_data->op_lease_handle = och->och_lease_handle;
 169                 op_data->op_data = &ioc->lil_ids[0];
 170                 op_data->op_data_size =
 171                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 172                 break;
 173         }
 174
 175         case MDS_HSM_RELEASE:
 176                 LASSERT(data != NULL);
 177                 op_data->op_bias |= MDS_HSM_RELEASE;
 178                 op_data->op_data_version = *(__u64 *)data;
 179                 op_data->op_lease_handle = och->och_lease_handle;
 180                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 181                 break;
 182
 183         default:
 184                 LASSERT(data == NULL);
 185                 break;
 186         }
 187
 188         rc = md_close(md_exp, op_data, och->och_mod, &req);
 189         if (rc != 0 && rc != -EINTR)
 190                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 191                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 192
 193         if (rc == 0 && op_data->op_bias & bias) {
 194                 struct mdt_body *body;
 195
 196                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 197                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 198                         rc = -EBUSY;
 199         }
 200
 201         ll_finish_md_op_data(op_data);
 202         EXIT;
 203 out:
 204
 205         md_clear_open_replay_data(md_exp, och);
 206         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 207         OBD_FREE_PTR(och);
 208
 209         ptlrpc_req_finished(req);       /* This is close request */
 210         return rc;
 211 }
 212
 213 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 214 {
 215         struct ll_inode_info *lli = ll_i2info(inode);
 216         struct obd_client_handle **och_p;
 217         struct obd_client_handle *och;
 218         __u64 *och_usecount;
 219         int rc = 0;
 220         ENTRY;
 221
 222         if (fmode & FMODE_WRITE) {
 223                 och_p = &lli->lli_mds_write_och;
 224                 och_usecount = &lli->lli_open_fd_write_count;
 225         } else if (fmode & FMODE_EXEC) {
 226                 och_p = &lli->lli_mds_exec_och;
 227                 och_usecount = &lli->lli_open_fd_exec_count;
 228         } else {
 229                 LASSERT(fmode & FMODE_READ);
 230                 och_p = &lli->lli_mds_read_och;
 231                 och_usecount = &lli->lli_open_fd_read_count;
 232         }
 233
 234         mutex_lock(&lli->lli_och_mutex);
 235         if (*och_usecount > 0) {
 236                 /* There are still users of this handle, so skip
 237                  * freeing it. */
 238                 mutex_unlock(&lli->lli_och_mutex);
 239                 RETURN(0);
 240         }
 241
 242         och = *och_p;
 243         *och_p = NULL;
 244         mutex_unlock(&lli->lli_och_mutex);
 245
 246         if (och != NULL) {
 247                 /* There might be a race and this handle may already
 248                  * be closed. */
 249                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 250         }
 251
 252         RETURN(rc);
 253 }
 254
 255 static int ll_md_close(struct inode *inode, struct file *file)
 256 {
 257         union ldlm_policy_data policy = {
 258                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 259         };
 260         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 261         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 262         struct ll_inode_info *lli = ll_i2info(inode);
 263         struct lustre_handle lockh;
 264         enum ldlm_mode lockmode;
 265         int rc = 0;
 266         ENTRY;
 267
 268         /* clear group lock, if present */
 269         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 270                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 271
 272         if (fd->fd_lease_och != NULL) {
 273                 bool lease_broken;
 274
 275                 /* Usually the lease is not released when the
 276                  * application crashed, we need to release here. */
 277                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 278                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 279                         PFID(&lli->lli_fid), rc, lease_broken);
 280
 281                 fd->fd_lease_och = NULL;
 282         }
 283
 284         if (fd->fd_och != NULL) {
 285                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 286                 fd->fd_och = NULL;
 287                 GOTO(out, rc);
 288         }
 289
 290         /* Let's see if we have good enough OPEN lock on the file and if
 291            we can skip talking to MDS */
 292         mutex_lock(&lli->lli_och_mutex);
 293         if (fd->fd_omode & FMODE_WRITE) {
 294                 lockmode = LCK_CW;
 295                 LASSERT(lli->lli_open_fd_write_count);
 296                 lli->lli_open_fd_write_count--;
 297         } else if (fd->fd_omode & FMODE_EXEC) {
 298                 lockmode = LCK_PR;
 299                 LASSERT(lli->lli_open_fd_exec_count);
 300                 lli->lli_open_fd_exec_count--;
 301         } else {
 302                 lockmode = LCK_CR;
 303                 LASSERT(lli->lli_open_fd_read_count);
 304                 lli->lli_open_fd_read_count--;
 305         }
 306         mutex_unlock(&lli->lli_och_mutex);
 307
 308         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 309                            LDLM_IBITS, &policy, lockmode, &lockh))
 310                 rc = ll_md_real_close(inode, fd->fd_omode);
 311
 312 out:
 313         LUSTRE_FPRIVATE(file) = NULL;
 314         ll_file_data_put(fd);
 315
 316         RETURN(rc);
 317 }
 318
 319 /* While this returns an error code, fput() the caller does not, so we need
 320  * to make every effort to clean up all of our state here.  Also, applications
 321  * rarely check close errors and even if an error is returned they will not
 322  * re-try the close call.
 323  */
 324 int ll_file_release(struct inode *inode, struct file *file)
 325 {
 326         struct ll_file_data *fd;
 327         struct ll_sb_info *sbi = ll_i2sbi(inode);
 328         struct ll_inode_info *lli = ll_i2info(inode);
 329         int rc;
 330         ENTRY;
 331
 332         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 333                PFID(ll_inode2fid(inode)), inode);
 334
 335         if (inode->i_sb->s_root != file_dentry(file))
 336                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 337         fd = LUSTRE_FPRIVATE(file);
 338         LASSERT(fd != NULL);
 339
 340         /* The last ref on @file, maybe not the the owner pid of statahead,
 341          * because parent and child process can share the same file handle. */
 342         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 343                 ll_deauthorize_statahead(inode, fd);
 344
 345         if (inode->i_sb->s_root == file_dentry(file)) {
 346                 LUSTRE_FPRIVATE(file) = NULL;
 347                 ll_file_data_put(fd);
 348                 RETURN(0);
 349         }
 350
 351         if (!S_ISDIR(inode->i_mode)) {
 352                 if (lli->lli_clob != NULL)
 353                         lov_read_and_clear_async_rc(lli->lli_clob);
 354                 lli->lli_async_rc = 0;
 355         }
 356
 357         rc = ll_md_close(inode, file);
 358
 359         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 360                 libcfs_debug_dumplog();
 361
 362         RETURN(rc);
 363 }
 364
 365 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 366                                 struct lookup_intent *itp)
 367 {
 368         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 369         struct dentry *parent = de->d_parent;
 370         const char *name = NULL;
 371         int len = 0;
 372         struct md_op_data *op_data;
 373         struct ptlrpc_request *req = NULL;
 374         int rc;
 375         ENTRY;
 376
 377         LASSERT(parent != NULL);
 378         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 379
 380         /* if server supports open-by-fid, or file name is invalid, don't pack
 381          * name in open request */
 382         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
 383             lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
 384                 name = de->d_name.name;
 385                 len = de->d_name.len;
 386         }
 387
 388         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 389                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 390         if (IS_ERR(op_data))
 391                 RETURN(PTR_ERR(op_data));
 392         op_data->op_data = lmm;
 393         op_data->op_data_size = lmmsize;
 394
 395         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 396                             &ll_md_blocking_ast, 0);
 397         ll_finish_md_op_data(op_data);
 398         if (rc == -ESTALE) {
 399                 /* reason for keep own exit path - don`t flood log
 400                  * with messages with -ESTALE errors.
 401                  */
 402                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 403                      it_open_error(DISP_OPEN_OPEN, itp))
 404                         GOTO(out, rc);
 405                 ll_release_openhandle(de, itp);
 406                 GOTO(out, rc);
 407         }
 408
 409         if (it_disposition(itp, DISP_LOOKUP_NEG))
 410                 GOTO(out, rc = -ENOENT);
 411
 412         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 413                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 414                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 415                 GOTO(out, rc);
 416         }
 417
 418         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 419         if (!rc && itp->it_lock_mode)
 420                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 421
 422 out:
 423         ptlrpc_req_finished(req);
 424         ll_intent_drop_lock(itp);
 425
 426         /* We did open by fid, but by the time we got to the server,
 427          * the object disappeared. If this is a create, we cannot really
 428          * tell the userspace that the file it was trying to create
 429          * does not exist. Instead let's return -ESTALE, and the VFS will
 430          * retry the create with LOOKUP_REVAL that we are going to catch
 431          * in ll_revalidate_dentry() and use lookup then.
 432          */
 433         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 434                 rc = -ESTALE;
 435
 436         RETURN(rc);
 437 }
 438
 439 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 440                        struct obd_client_handle *och)
 441 {
 442         struct mdt_body *body;
 443
 444         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 445         och->och_fh = body->mbo_handle;
 446         och->och_fid = body->mbo_fid1;
 447         och->och_lease_handle.cookie = it->it_lock_handle;
 448         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 449         och->och_flags = it->it_flags;
 450
 451         return md_set_open_replay_data(md_exp, och, it);
 452 }
 453
 454 static int ll_local_open(struct file *file, struct lookup_intent *it,
 455                          struct ll_file_data *fd, struct obd_client_handle *och)
 456 {
 457         struct inode *inode = file_inode(file);
 458         ENTRY;
 459
 460         LASSERT(!LUSTRE_FPRIVATE(file));
 461
 462         LASSERT(fd != NULL);
 463
 464         if (och) {
 465                 int rc;
 466
 467                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 468                 if (rc != 0)
 469                         RETURN(rc);
 470         }
 471
 472         LUSTRE_FPRIVATE(file) = fd;
 473         ll_readahead_init(inode, &fd->fd_ras);
 474         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 475
 476         /* ll_cl_context initialize */
 477         rwlock_init(&fd->fd_lock);
 478         INIT_LIST_HEAD(&fd->fd_lccs);
 479
 480         RETURN(0);
 481 }
 482
 483 /* Open a file, and (for the very first open) create objects on the OSTs at
 484  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 485  * creation or open until ll_lov_setstripe() ioctl is called.
 486  *
 487  * If we already have the stripe MD locally then we don't request it in
 488  * md_open(), by passing a lmm_size = 0.
 489  *
 490  * It is up to the application to ensure no other processes open this file
 491  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 492  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 493  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 494  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 495  */
 496 int ll_file_open(struct inode *inode, struct file *file)
 497 {
 498         struct ll_inode_info *lli = ll_i2info(inode);
 499         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 500                                           .it_flags = file->f_flags };
 501         struct obd_client_handle **och_p = NULL;
 502         __u64 *och_usecount = NULL;
 503         struct ll_file_data *fd;
 504         int rc = 0;
 505         ENTRY;
 506
 507         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 508                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 509
 510         it = file->private_data; /* XXX: compat macro */
 511         file->private_data = NULL; /* prevent ll_local_open assertion */
 512
 513         fd = ll_file_data_get();
 514         if (fd == NULL)
 515                 GOTO(out_openerr, rc = -ENOMEM);
 516
 517         fd->fd_file = file;
 518         if (S_ISDIR(inode->i_mode))
 519                 ll_authorize_statahead(inode, fd);
 520
 521         if (inode->i_sb->s_root == file_dentry(file)) {
 522                 LUSTRE_FPRIVATE(file) = fd;
 523                 RETURN(0);
 524         }
 525
 526         if (!it || !it->it_disposition) {
 527                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 528                  * because everything but O_ACCMODE mask was stripped from
 529                  * there */
 530                 if ((oit.it_flags + 1) & O_ACCMODE)
 531                         oit.it_flags++;
 532                 if (file->f_flags & O_TRUNC)
 533                         oit.it_flags |= FMODE_WRITE;
 534
 535                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 536                  * dentry_open after call to open_namei that checks permissions.
 537                  * Only nfsd_open call dentry_open directly without checking
 538                  * permissions and because of that this code below is safe. */
 539                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 540                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 541
 542                 /* We do not want O_EXCL here, presumably we opened the file
 543                  * already? XXX - NFS implications? */
 544                 oit.it_flags &= ~O_EXCL;
 545
 546                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 547                  * created if necessary, then "IT_CREAT" should be set to keep
 548                  * consistent with it */
 549                 if (oit.it_flags & O_CREAT)
 550                         oit.it_op |= IT_CREAT;
 551
 552                 it = &oit;
 553         }
 554
 555 restart:
 556         /* Let's see if we have file open on MDS already. */
 557         if (it->it_flags & FMODE_WRITE) {
 558                 och_p = &lli->lli_mds_write_och;
 559                 och_usecount = &lli->lli_open_fd_write_count;
 560         } else if (it->it_flags & FMODE_EXEC) {
 561                 och_p = &lli->lli_mds_exec_och;
 562                 och_usecount = &lli->lli_open_fd_exec_count;
 563          } else {
 564                 och_p = &lli->lli_mds_read_och;
 565                 och_usecount = &lli->lli_open_fd_read_count;
 566         }
 567
 568         mutex_lock(&lli->lli_och_mutex);
 569         if (*och_p) { /* Open handle is present */
 570                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 571                         /* Well, there's extra open request that we do not need,
 572                            let's close it somehow. This will decref request. */
 573                         rc = it_open_error(DISP_OPEN_OPEN, it);
 574                         if (rc) {
 575                                 mutex_unlock(&lli->lli_och_mutex);
 576                                 GOTO(out_openerr, rc);
 577                         }
 578
 579                         ll_release_openhandle(file_dentry(file), it);
 580                 }
 581                 (*och_usecount)++;
 582
 583                 rc = ll_local_open(file, it, fd, NULL);
 584                 if (rc) {
 585                         (*och_usecount)--;
 586                         mutex_unlock(&lli->lli_och_mutex);
 587                         GOTO(out_openerr, rc);
 588                 }
 589         } else {
 590                 LASSERT(*och_usecount == 0);
 591                 if (!it->it_disposition) {
 592                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 593                         /* We cannot just request lock handle now, new ELC code
 594                            means that one of other OPEN locks for this file
 595                            could be cancelled, and since blocking ast handler
 596                            would attempt to grab och_mutex as well, that would
 597                            result in a deadlock */
 598                         mutex_unlock(&lli->lli_och_mutex);
 599                         /*
 600                          * Normally called under two situations:
 601                          * 1. NFS export.
 602                          * 2. A race/condition on MDS resulting in no open
 603                          *    handle to be returned from LOOKUP|OPEN request,
 604                          *    for example if the target entry was a symlink.
 605                          *
 606                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 607                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 608                          *  bit so that it's not confusing later callers.
 609                          *
 610                          *  NB; when ldd is NULL, it must have come via normal
 611                          *  lookup path only, since ll_iget_for_nfs always calls
 612                          *  ll_d_init().
 613                          */
 614                         if (ldd && ldd->lld_nfs_dentry) {
 615                                 ldd->lld_nfs_dentry = 0;
 616                                 it->it_flags |= MDS_OPEN_LOCK;
 617                         }
 618
 619                          /*
 620                          * Always specify MDS_OPEN_BY_FID because we don't want
 621                          * to get file with different fid.
 622                          */
 623                         it->it_flags |= MDS_OPEN_BY_FID;
 624                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 625                                                  it);
 626                         if (rc)
 627                                 GOTO(out_openerr, rc);
 628
 629                         goto restart;
 630                 }
 631                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 632                 if (!*och_p)
 633                         GOTO(out_och_free, rc = -ENOMEM);
 634
 635                 (*och_usecount)++;
 636
 637                 /* md_intent_lock() didn't get a request ref if there was an
 638                  * open error, so don't do cleanup on the request here
 639                  * (bug 3430) */
 640                 /* XXX (green): Should not we bail out on any error here, not
 641                  * just open error? */
 642                 rc = it_open_error(DISP_OPEN_OPEN, it);
 643                 if (rc != 0)
 644                         GOTO(out_och_free, rc);
 645
 646                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 647                          "inode %p: disposition %x, status %d\n", inode,
 648                          it_disposition(it, ~0), it->it_status);
 649
 650                 rc = ll_local_open(file, it, fd, *och_p);
 651                 if (rc)
 652                         GOTO(out_och_free, rc);
 653         }
 654         mutex_unlock(&lli->lli_och_mutex);
 655         fd = NULL;
 656
 657         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 658            different kind of OPEN lock for this same inode gets cancelled
 659            by ldlm_cancel_lru */
 660         if (!S_ISREG(inode->i_mode))
 661                 GOTO(out_och_free, rc);
 662
 663         cl_lov_delay_create_clear(&file->f_flags);
 664         GOTO(out_och_free, rc);
 665
 666 out_och_free:
 667         if (rc) {
 668                 if (och_p && *och_p) {
 669                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 670                         *och_p = NULL; /* OBD_FREE writes some magic there */
 671                         (*och_usecount)--;
 672                 }
 673                 mutex_unlock(&lli->lli_och_mutex);
 674
 675 out_openerr:
 676                 if (lli->lli_opendir_key == fd)
 677                         ll_deauthorize_statahead(inode, fd);
 678                 if (fd != NULL)
 679                         ll_file_data_put(fd);
 680         } else {
 681                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 682         }
 683
 684         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 685                 ptlrpc_req_finished(it->it_request);
 686                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 687         }
 688
 689         return rc;
 690 }
 691
 692 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 693                         struct ldlm_lock_desc *desc, void *data, int flag)
 694 {
 695         int rc;
 696         struct lustre_handle lockh;
 697         ENTRY;
 698
 699         switch (flag) {
 700         case LDLM_CB_BLOCKING:
 701                 ldlm_lock2handle(lock, &lockh);
 702                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 703                 if (rc < 0) {
 704                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 705                         RETURN(rc);
 706                 }
 707                 break;
 708         case LDLM_CB_CANCELING:
 709                 /* do nothing */
 710                 break;
 711         }
 712         RETURN(0);
 713 }
 714
 715 /**
 716  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 717  * and save it as fd->fd_och so as to force client to reopen the file even
 718  * if it has an open lock in cache already.
 719  */
 720 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 721                                 struct lustre_handle *old_handle)
 722 {
 723         struct ll_inode_info *lli = ll_i2info(inode);
 724         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 725         struct obd_client_handle **och_p;
 726         __u64 *och_usecount;
 727         int rc = 0;
 728         ENTRY;
 729
 730         /* Get the openhandle of the file */
 731         mutex_lock(&lli->lli_och_mutex);
 732         if (fd->fd_lease_och != NULL)
 733                 GOTO(out_unlock, rc = -EBUSY);
 734
 735         if (fd->fd_och == NULL) {
 736                 if (file->f_mode & FMODE_WRITE) {
 737                         LASSERT(lli->lli_mds_write_och != NULL);
 738                         och_p = &lli->lli_mds_write_och;
 739                         och_usecount = &lli->lli_open_fd_write_count;
 740                 } else {
 741                         LASSERT(lli->lli_mds_read_och != NULL);
 742                         och_p = &lli->lli_mds_read_och;
 743                         och_usecount = &lli->lli_open_fd_read_count;
 744                 }
 745
 746                 if (*och_usecount > 1)
 747                         GOTO(out_unlock, rc = -EBUSY);
 748
 749                 fd->fd_och = *och_p;
 750                 *och_usecount = 0;
 751                 *och_p = NULL;
 752         }
 753
 754         *old_handle = fd->fd_och->och_fh;
 755
 756         EXIT;
 757 out_unlock:
 758         mutex_unlock(&lli->lli_och_mutex);
 759         return rc;
 760 }
 761
 762 /**
 763  * Release ownership on lli_mds_*_och when putting back a file lease.
 764  */
 765 static int ll_lease_och_release(struct inode *inode, struct file *file)
 766 {
 767         struct ll_inode_info *lli = ll_i2info(inode);
 768         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 769         struct obd_client_handle **och_p;
 770         struct obd_client_handle *old_och = NULL;
 771         __u64 *och_usecount;
 772         int rc = 0;
 773         ENTRY;
 774
 775         mutex_lock(&lli->lli_och_mutex);
 776         if (file->f_mode & FMODE_WRITE) {
 777                 och_p = &lli->lli_mds_write_och;
 778                 och_usecount = &lli->lli_open_fd_write_count;
 779         } else {
 780                 och_p = &lli->lli_mds_read_och;
 781                 och_usecount = &lli->lli_open_fd_read_count;
 782         }
 783
 784         /* The file may have been open by another process (broken lease) so
 785          * *och_p is not NULL. In this case we should simply increase usecount
 786          * and close fd_och.
 787          */
 788         if (*och_p != NULL) {
 789                 old_och = fd->fd_och;
 790                 (*och_usecount)++;
 791         } else {
 792                 *och_p = fd->fd_och;
 793                 *och_usecount = 1;
 794         }
 795         fd->fd_och = NULL;
 796         mutex_unlock(&lli->lli_och_mutex);
 797
 798         if (old_och != NULL)
 799                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 800
 801         RETURN(rc);
 802 }
 803
 804 /**
 805  * Acquire a lease and open the file.
      *
      * An intent open carrying MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE
      * is sent through md_intent_lock(); on success a newly allocated open
      * handle describing the lease is returned.
      *
      * \param[in] inode       inode to take the lease on
      * \param[in] file        optional open file; when not NULL its open handle
      *                        is taken over via ll_lease_och_acquire() and passed
      *                        to the MDT as op_handle
      * \param[in] fmode       must be exactly FMODE_READ or FMODE_WRITE
      * \param[in] open_flags  extra flags ORed into the intent flags
      *
      * \retval open handle on success
      * \retval ERR_PTR(-EINVAL) if \a fmode is neither FMODE_READ nor FMODE_WRITE
      * \retval ERR_PTR(-EPERM) if \a fmode is not part of file->f_mode, or the
      *         file was opened with FMODE_EXEC
      * \retval ERR_PTR(-ENOMEM) if the handle cannot be allocated
      * \retval ERR_PTR(-ENOENT) if the intent reports DISP_LOOKUP_NEG
      * \retval ERR_PTR(-EOPNOTSUPP) if the reply lacks DISP_OPEN_LEASE
      * \retval ERR_PTR(-EPROTO) if the lease came without the open lock
      * \retval ERR_PTR(other) error from ll_lease_och_acquire(),
      *         ll_prep_md_op_data(), md_intent_lock() or it_open_error()
 806  */
 807 static struct obd_client_handle *
 808 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 809               __u64 open_flags)
 810 {
 811         struct lookup_intent it = { .it_op = IT_OPEN };
 812         struct ll_sb_info *sbi = ll_i2sbi(inode);
 813         struct md_op_data *op_data;
 814         struct ptlrpc_request *req = NULL;
 815         struct lustre_handle old_handle = { 0 };
 816         struct obd_client_handle *och = NULL;
 817         int rc;
 818         int rc2;
 819         ENTRY;
 820
             /* exactly one of the two modes has to be requested */
 821         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 822                 RETURN(ERR_PTR(-EINVAL));
 823
 824         if (file != NULL) {
                     /* the lease mode must be one the file was opened with,
                      * and files opened with FMODE_EXEC are refused */
 825                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 826                         RETURN(ERR_PTR(-EPERM));
 827
 828                 rc = ll_lease_och_acquire(inode, file, &old_handle);
 829                 if (rc)
 830                         RETURN(ERR_PTR(rc));
 831         }
 832
 833         OBD_ALLOC_PTR(och);
 834         if (och == NULL)
 835                 RETURN(ERR_PTR(-ENOMEM));
 836
 837         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 838                                         LUSTRE_OPC_ANY, NULL);
 839         if (IS_ERR(op_data))
 840                 GOTO(out, rc = PTR_ERR(op_data));
 841
 842         /* To tell the MDT this openhandle is from the same owner */
 843         op_data->op_handle = old_handle;
 844
 845         it.it_flags = fmode | open_flags;
 846         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 847         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
 848                             &ll_md_blocking_lease_ast,
 849         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
 850          * it can be cancelled which may mislead applications that the lease is
 851          * broken;
 852          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
 853          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
 854          * doesn't deal with openhandle, so normal openhandle will be leaked. */
 855                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
             /* op_data and req are released whatever the outcome; only the
              * intent 'it' is examined from here on */
 856         ll_finish_md_op_data(op_data);
 857         ptlrpc_req_finished(req);
 858         if (rc < 0)
 859                 GOTO(out_release_it, rc);
 860
 861         if (it_disposition(&it, DISP_LOOKUP_NEG))
 862                 GOTO(out_release_it, rc = -ENOENT);
 863
 864         rc = it_open_error(DISP_OPEN_OPEN, &it);
 865         if (rc)
 866                 GOTO(out_release_it, rc);
 867
 868         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
             /* from here on och describes an open handle, so failures go
              * through out_close rather than freeing och directly */
 869         ll_och_fill(sbi->ll_md_exp, &it, och);
 870
 871         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
 872                 GOTO(out_close, rc = -EOPNOTSUPP);
 873
 874         /* already get lease, handle lease lock */
 875         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
 876         if (it.it_lock_mode == 0 ||
 877             it.it_lock_bits != MDS_INODELOCK_OPEN) {
 878                 /* open lock must return for lease */
 879                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
 880                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
 881                         it.it_lock_bits);
 882                 GOTO(out_close, rc = -EPROTO);
 883         }
 884
 885         ll_intent_release(&it);
 886         RETURN(och);
 887
 888 out_close:
 889         /* Cancel open lock */
 890         if (it.it_lock_mode != 0) {
 891                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
 892                                             it.it_lock_mode);
 893                 it.it_lock_mode = 0;
 894                 och->och_lease_handle.cookie = 0ULL;
 895         }
             /* a failure to close is only logged; rc keeps the original error */
 896         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
 897         if (rc2 < 0)
 898                 CERROR("%s: error closing file "DFID": %d\n",
 899                        ll_get_fsname(inode->i_sb, NULL, 0),
 900                        PFID(&ll_i2info(inode)->lli_fid), rc2);
 901         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
 902 out_release_it:
 903         ll_intent_release(&it);
 904 out:
             /* och is still set here only when it never went through
              * ll_close_inode_openhandle() */
 905         if (och != NULL)
 906                 OBD_FREE_PTR(och);
 907         RETURN(ERR_PTR(rc));
 908 }
 909
 910 /**
 911  * Check whether a layout swap can be done between two inodes.
 912  *
 913  * \param[in] inode1  First inode to check
 914  * \param[in] inode2  Second inode to check
 915  *
 916  * \retval 0 on success, layout swap can be performed between both inodes
 917  * \retval negative error code if requirements are not met
 918  */
 919 static int ll_check_swap_layouts_validity(struct inode *inode1,
 920                                           struct inode *inode2)
 921 {
 922         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
 923                 return -EINVAL;
 924
 925         if (inode_permission(inode1, MAY_WRITE) ||
 926             inode_permission(inode2, MAY_WRITE))
 927                 return -EPERM;
 928
 929         if (inode1->i_sb != inode2->i_sb)
 930                 return -EXDEV;
 931
 932         return 0;
 933 }
 934
 935 static int ll_swap_layouts_close(struct obd_client_handle *och,
 936                                  struct inode *inode, struct inode *inode2,
 937                                  int intent)
 938 {
 939         const struct lu_fid     *fid1 = ll_inode2fid(inode);
 940         const struct lu_fid     *fid2;
 941         enum mds_op_bias         bias;
 942         int                      rc;
 943         ENTRY;
 944
 945         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
 946                ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
 947
 948         rc = ll_check_swap_layouts_validity(inode, inode2);
 949         if (rc < 0)
 950                 GOTO(out_free_och, rc);
 951
 952         /* We now know that inode2 is a lustre inode */
 953         fid2 = ll_inode2fid(inode2);
 954
 955         rc = lu_fid_cmp(fid1, fid2);
 956         if (rc == 0)
 957                 GOTO(out_free_och, rc = -EINVAL);
 958
 959         switch (intent) {
 960         case SWAP_LAYOUTS_CLOSE:
 961                 bias = MDS_CLOSE_LAYOUT_SWAP;
 962                 break;
 963         case MERGE_LAYOUTS_CLOSE:
 964                 bias = MDS_CLOSE_LAYOUT_MERGE;
 965                 break;
 966         default:
 967                 GOTO(out_free_och, rc = -EOPNOTSUPP);
 968         }
 969
 970         /* Close the file and {swap,merge} layouts between inode & inode2.
 971          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
 972          * because we still need it to pack l_remote_handle to MDT. */
 973         rc = ll_close_inode_openhandle(inode, och, bias, inode2);
 974
 975         och = NULL; /* freed in ll_close_inode_openhandle() */
 976
 977 out_free_och:
 978         if (och != NULL)
 979                 OBD_FREE_PTR(och);
 980
 981         RETURN(rc);
 982 }
 983
 984 /**
 985  * Release lease and close the file.
 986  * It will check if the lease has ever broken.
 987  */
 988 static int ll_lease_close_intent(struct obd_client_handle *och,
 989                                  struct inode *inode,
 990                                  bool *lease_broken, enum mds_op_bias bias,
 991                                  void *data)
 992 {
 993         struct ldlm_lock *lock;
 994         bool cancelled = true;
 995         int rc;
 996         ENTRY;
 997
 998         lock = ldlm_handle2lock(&och->och_lease_handle);
 999         if (lock != NULL) {
1000                 lock_res_and_lock(lock);
1001                 cancelled = ldlm_is_cancel(lock);
1002                 unlock_res_and_lock(lock);
1003                 LDLM_LOCK_PUT(lock);
1004         }
1005
1006         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1007                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1008
1009         if (lease_broken != NULL)
1010                 *lease_broken = cancelled;
1011
1012         if (!cancelled && !bias)
1013                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1014
1015         if (cancelled) { /* no need to excute intent */
1016                 bias = 0;
1017                 data = NULL;
1018         }
1019
1020         rc = ll_close_inode_openhandle(inode, och, bias, data);
1021         RETURN(rc);
1022 }
1023
1024 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1025                           bool *lease_broken)
1026 {
1027         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1028 }
1029
1030 /**
1031  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1032  */
1033 static int ll_lease_file_resync(struct obd_client_handle *och,
1034                                 struct inode *inode)
1035 {
1036         struct ll_sb_info *sbi = ll_i2sbi(inode);
1037         struct md_op_data *op_data;
1038         __u64 data_version_unused;
1039         int rc;
1040         ENTRY;
1041
1042         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1043                                      LUSTRE_OPC_ANY, NULL);
1044         if (IS_ERR(op_data))
1045                 RETURN(PTR_ERR(op_data));
1046
1047         /* before starting file resync, it's necessary to clean up page cache
1048          * in client memory, otherwise once the layout version is increased,
1049          * writing back cached data will be denied the OSTs. */
1050         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1051         if (rc)
1052                 GOTO(out, rc);
1053
1054         op_data->op_handle = och->och_lease_handle;
1055         rc = md_file_resync(sbi->ll_md_exp, op_data);
1056         if (rc)
1057                 GOTO(out, rc);
1058
1059         EXIT;
1060 out:
1061         ll_finish_md_op_data(op_data);
1062         return rc;
1063 }
1064
1065 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1066 {
1067         struct ll_inode_info *lli = ll_i2info(inode);
1068         struct cl_object *obj = lli->lli_clob;
1069         struct cl_attr *attr = vvp_env_thread_attr(env);
1070         s64 atime;
1071         s64 mtime;
1072         s64 ctime;
1073         int rc = 0;
1074
1075         ENTRY;
1076
1077         ll_inode_size_lock(inode);
1078
1079         /* Merge timestamps the most recently obtained from MDS with
1080          * timestamps obtained from OSTs.
1081          *
1082          * Do not overwrite atime of inode because it may be refreshed
1083          * by file_accessed() function. If the read was served by cache
1084          * data, there is no RPC to be sent so that atime may not be
1085          * transferred to OSTs at all. MDT only updates atime at close time
1086          * if it's at least 'mdd.*.atime_diff' older.
1087          * All in all, the atime in Lustre does not strictly comply with
1088          * POSIX. Solving this problem needs to send an RPC to MDT for each
1089          * read, this will hurt performance. */
1090         if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1091                 LTIME_S(inode->i_atime) = lli->lli_atime;
1092                 lli->lli_update_atime = 0;
1093         }
1094         LTIME_S(inode->i_mtime) = lli->lli_mtime;
1095         LTIME_S(inode->i_ctime) = lli->lli_ctime;
1096
1097         atime = LTIME_S(inode->i_atime);
1098         mtime = LTIME_S(inode->i_mtime);
1099         ctime = LTIME_S(inode->i_ctime);
1100
1101         cl_object_attr_lock(obj);
1102         rc = cl_object_attr_get(env, obj, attr);
1103         cl_object_attr_unlock(obj);
1104
1105         if (rc != 0)
1106                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1107
1108         if (atime < attr->cat_atime)
1109                 atime = attr->cat_atime;
1110
1111         if (ctime < attr->cat_ctime)
1112                 ctime = attr->cat_ctime;
1113
1114         if (mtime < attr->cat_mtime)
1115                 mtime = attr->cat_mtime;
1116
1117         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1118                PFID(&lli->lli_fid), attr->cat_size);
1119
1120         i_size_write(inode, attr->cat_size);
1121         inode->i_blocks = attr->cat_blocks;
1122
1123         LTIME_S(inode->i_atime) = atime;
1124         LTIME_S(inode->i_mtime) = mtime;
1125         LTIME_S(inode->i_ctime) = ctime;
1126
1127 out_size_unlock:
1128         ll_inode_size_unlock(inode);
1129
1130         RETURN(rc);
1131 }
1132
1133 /**
1134  * Set designated mirror for I/O.
1135  *
1136  * So far only read, write, and truncated can support to issue I/O to
1137  * designated mirror.
1138  */
1139 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1140 {
1141         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1142
1143         /* clear layout version for generic(non-resync) I/O in case it carries
1144          * stale layout version due to I/O restart */
1145         io->ci_layout_version = 0;
1146
1147         /* FLR: disable non-delay for designated mirror I/O because obviously
1148          * only one mirror is available */
1149         if (fd->fd_designated_mirror > 0) {
1150                 io->ci_ndelay = 0;
1151                 io->ci_designated_mirror = fd->fd_designated_mirror;
1152                 io->ci_layout_version = fd->fd_layout_version;
1153                 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1154                                  * io to ptasks */
1155         }
1156
1157         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1158                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1159 }
1160
1161 static bool file_is_noatime(const struct file *file)
1162 {
1163         const struct vfsmount *mnt = file->f_path.mnt;
1164         const struct inode *inode = file_inode((struct file *)file);
1165
1166         /* Adapted from file_accessed() and touch_atime().*/
1167         if (file->f_flags & O_NOATIME)
1168                 return true;
1169
1170         if (inode->i_flags & S_NOATIME)
1171                 return true;
1172
1173         if (IS_NOATIME(inode))
1174                 return true;
1175
1176         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1177                 return true;
1178
1179         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1180                 return true;
1181
1182         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1183                 return true;
1184
1185         return false;
1186 }
1187
1188 static int ll_file_io_ptask(struct cfs_ptask *ptask);
1189
1190 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1191 {
1192         struct inode *inode = file_inode(file);
1193         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1194
1195         memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1196         init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1197         io->u.ci_rw.rw_file = file;
1198         io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1199         io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1200         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1201
1202         if (iot == CIT_WRITE) {
1203                 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1204                 io->u.ci_rw.rw_sync   = !!(file->f_flags & O_SYNC ||
1205                                            file->f_flags & O_DIRECT ||
1206                                            IS_SYNC(inode));
1207         }
1208         io->ci_obj = ll_i2info(inode)->lli_clob;
1209         io->ci_lockreq = CILR_MAYBE;
1210         if (ll_file_nolock(file)) {
1211                 io->ci_lockreq = CILR_NEVER;
1212                 io->ci_no_srvlock = 1;
1213         } else if (file->f_flags & O_APPEND) {
1214                 io->ci_lockreq = CILR_MANDATORY;
1215         }
1216         io->ci_noatime = file_is_noatime(file);
1217         if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1218                 io->ci_pio = !io->u.ci_rw.rw_append;
1219         else
1220                 io->ci_pio = 0;
1221
1222         /* FLR: only use non-delay I/O for read as there is only one
1223          * avaliable mirror for write. */
1224         io->ci_ndelay = !(iot == CIT_WRITE);
1225
1226         ll_io_set_mirror(io, file);
1227 }
1228
1229 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1230 {
1231         struct cl_io_pt *pt = ptask->pt_cbdata;
1232         struct file *file = pt->cip_file;
1233         struct lu_env *env;
1234         struct cl_io *io;
1235         loff_t pos = pt->cip_pos;
1236         int rc;
1237         __u16 refcheck;
1238         ENTRY;
1239
1240         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1241                 file_dentry(file)->d_name.name,
1242                 pt->cip_iot == CIT_READ ? "read" : "write",
1243                 pos, pos + pt->cip_count);
1244
1245         env = cl_env_get(&refcheck);
1246         if (IS_ERR(env))
1247                 RETURN(PTR_ERR(env));
1248
1249         io = vvp_env_thread_io(env);
1250         ll_io_init(io, file, pt->cip_iot);
1251         io->u.ci_rw.rw_iter = pt->cip_iter;
1252         io->u.ci_rw.rw_iocb = pt->cip_iocb;
1253         io->ci_pio = 0; /* It's already in parallel task */
1254
1255         rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1256                            pt->cip_count - pt->cip_result);
1257         if (!rc) {
1258                 struct vvp_io *vio = vvp_env_io(env);
1259
1260                 vio->vui_io_subtype = IO_NORMAL;
1261                 vio->vui_fd = LUSTRE_FPRIVATE(file);
1262
1263                 ll_cl_add(file, env, io, LCC_RW);
1264                 rc = cl_io_loop(env, io);
1265                 ll_cl_remove(file, env);
1266         } else {
1267                 /* cl_io_rw_init() handled IO */
1268                 rc = io->ci_result;
1269         }
1270
1271         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1272                 if (io->ci_nob > 0)
1273                         io->ci_nob /= 2;
1274                 rc = -EIO;
1275         }
1276
1277         if (io->ci_nob > 0) {
1278                 pt->cip_result += io->ci_nob;
1279                 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1280                 pos += io->ci_nob;
1281                 pt->cip_iocb.ki_pos = pos;
1282 #ifdef HAVE_KIOCB_KI_LEFT
1283                 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1284 #elif defined(HAVE_KI_NBYTES)
1285                 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1286 #endif
1287         }
1288
1289         cl_io_fini(env, io);
1290         cl_env_put(env, &refcheck);
1291
1292         pt->cip_need_restart = io->ci_need_restart;
1293
1294         CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1295                 file_dentry(file)->d_name.name,
1296                 pt->cip_iot == CIT_READ ? "read" : "write",
1297                 pt->cip_result, rc);
1298
1299         RETURN(pt->cip_result > 0 ? 0 : rc);
1300 }
1301
1302 static ssize_t
1303 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1304                    struct file *file, enum cl_io_type iot,
1305                    loff_t *ppos, size_t count)
1306 {
             /*
              * Common read/write path: set up a cl_io for @file from @args,
              * run it with cl_io_loop(), and start over from 'restart' as
              * long as the I/O asks for a restart and bytes remain.
              *
              * On return *ppos holds the updated position. The return value
              * is the number of bytes transferred when that is positive,
              * otherwise rc (0 or a negative error code).
              */
1307         struct range_lock       range;
1308         struct vvp_io           *vio = vvp_env_io(env);
1309         struct inode            *inode = file_inode(file);
1310         struct ll_inode_info    *lli = ll_i2info(inode);
1311         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1312         struct cl_io            *io;
1313         loff_t                  pos = *ppos;
1314         ssize_t                 result = 0;
1315         int                     rc = 0;
1316         unsigned                retried = 0;
1317         bool                    restarted = false;
1318
1319         ENTRY;
1320
1321         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1322                 file_dentry(file)->d_name.name,
1323                 iot == CIT_READ ? "read" : "write", pos, pos + count);
1324
1325 restart:
             /* each attempt works on a freshly initialised io; pos and count
              * already account for what earlier attempts transferred */
1326         io = vvp_env_thread_io(env);
1327         ll_io_init(io, file, iot);
1328         if (args->via_io_subtype == IO_NORMAL) {
1329                 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1330                 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1331         }
             /* parallel I/O is only used for the first attempt of IO_NORMAL */
1332         if (args->via_io_subtype != IO_NORMAL || restarted)
1333                 io->ci_pio = 0;
1334         io->ci_ndelay_tried = retried;
1335
1336         if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1337                 bool range_locked = false;
1338
                     /* O_APPEND covers the whole file, otherwise only the
                      * bytes [pos, pos + count - 1] of this request */
1339                 if (file->f_flags & O_APPEND)
1340                         range_lock_init(&range, 0, LUSTRE_EOF);
1341                 else
1342                         range_lock_init(&range, pos, pos + count - 1);
1343
1344                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1345                 vio->vui_io_subtype = args->via_io_subtype;
1346
1347                 switch (vio->vui_io_subtype) {
1348                 case IO_NORMAL:
1349                         /* Direct IO reads must also take range lock,
1350                          * or multiple reads will try to work on the same pages
1351                          * See LU-6227 for details. */
                             /* no range lock is taken while the file holds a
                              * group lock (LL_FILE_GROUP_LOCKED) */
1352                         if (((iot == CIT_WRITE) ||
1353                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1354                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1355                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1356                                        RL_PARA(&range));
1357                                 rc = range_lock(&lli->lli_write_tree, &range);
1358                                 if (rc < 0)
1359                                         GOTO(out, rc);
1360
1361                                 range_locked = true;
1362                         }
1363                         break;
1364                 case IO_SPLICE:
1365                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1366                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1367                         break;
1368                 default:
1369                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1370                         LBUG();
1371                 }
1372
1373                 ll_cl_add(file, env, io, LCC_RW);
                     /* for a parallel write on an inode that is not IS_NOSEC,
                      * the inode lock is taken here and recorded in
                      * lli_inode_locked, unless that flag is already set */
1374                 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1375                     !lli->lli_inode_locked) {
1376                         inode_lock(inode);
1377                         lli->lli_inode_locked = 1;
1378                 }
1379                 rc = cl_io_loop(env, io);
                     /* NOTE(review): the unlock is keyed on the shared
                      * lli_inode_locked flag, not on whether this call took
                      * the lock above - confirm against the other users of
                      * the flag */
1380                 if (lli->lli_inode_locked) {
1381                         lli->lli_inode_locked = 0;
1382                         inode_unlock(inode);
1383                 }
1384                 ll_cl_remove(file, env);
1385
1386                 if (range_locked) {
1387                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1388                                RL_PARA(&range));
1389                         range_unlock(&lli->lli_write_tree, &range);
1390                 }
1391         } else {
1392                 /* cl_io_rw_init() handled IO */
1393                 rc = io->ci_result;
1394         }
1395
             /* account for the bytes of this attempt; for IO_NORMAL the
              * caller's iterator and iocb are moved forward as well */
1396         if (io->ci_nob > 0) {
1397                 result += io->ci_nob;
1398                 count  -= io->ci_nob;
1399
1400                 if (args->via_io_subtype == IO_NORMAL) {
1401                         iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1402                         pos += io->ci_nob;
1403                         args->u.normal.via_iocb->ki_pos = pos;
1404 #ifdef HAVE_KIOCB_KI_LEFT
1405                         args->u.normal.via_iocb->ki_left = count;
1406 #elif defined(HAVE_KI_NBYTES)
1407                         args->u.normal.via_iocb->ki_nbytes = count;
1408 #endif
1409                 } else {
1410                         /* for splice */
1411                         pos = io->u.ci_rw.rw_range.cir_pos;
1412                 }
1413         }
1414 out:
1415         cl_io_fini(env, io);
1416
1417         CDEBUG(D_VFSTRACE,
1418                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1419                file->f_path.dentry->d_name.name,
1420                iot, rc, result, io->ci_need_restart);
1421
             /* restart only when nothing failed (-ENODATA is tolerated),
              * bytes are still outstanding and the io asked for it */
1422         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1423                 CDEBUG(D_VFSTRACE,
1424                         "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1425                         file_dentry(file)->d_name.name,
1426                         iot == CIT_READ ? "read" : "write",
1427                         pos, pos + count, result, rc);
1428                 /* preserve the tried count for FLR */
1429                 retried = io->ci_ndelay_tried;
1430                 restarted = true;
1431                 goto restart;
1432         }
1433
             /* statistics, plus for writes the fd_write_failed flag:
              * cleared once bytes were written, set on failure except when
              * the write ended with -ERESTARTSYS */
1434         if (iot == CIT_READ) {
1435                 if (result > 0)
1436                         ll_stats_ops_tally(ll_i2sbi(inode),
1437                                            LPROC_LL_READ_BYTES, result);
1438         } else if (iot == CIT_WRITE) {
1439                 if (result > 0) {
1440                         ll_stats_ops_tally(ll_i2sbi(inode),
1441                                            LPROC_LL_WRITE_BYTES, result);
1442                         fd->fd_write_failed = false;
1443                 } else if (result == 0 && rc == 0) {
                             /* nothing written and no error so far: fall
                              * back to the result recorded in the io */
1444                         rc = io->ci_result;
1445                         if (rc < 0)
1446                                 fd->fd_write_failed = true;
1447                         else
1448                                 fd->fd_write_failed = false;
1449                 } else if (rc != -ERESTARTSYS) {
1450                         fd->fd_write_failed = true;
1451                 }
1452         }
1453
1454         CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1455                 file_dentry(file)->d_name.name,
1456                 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1457
1458         *ppos = pos;
1459
             /* bytes transferred win over the error code */
1460         RETURN(result > 0 ? result : rc);
1461 }
1462
1463 /**
1464  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1465  * especially for small I/O.
1466  *
1467  * To serve a read request, CLIO has to create and initialize a cl_io and
1468  * then request DLM lock. This has turned out to have siginificant overhead
1469  * and affects the performance of small I/O dramatically.
1470  *
1471  * It's not necessary to create a cl_io for each I/O. Under the help of read
1472  * ahead, most of the pages being read are already in memory cache and we can
1473  * read those pages directly because if the pages exist, the corresponding DLM
1474  * lock must exist so that page content must be valid.
1475  *
1476  * In fast read implementation, the llite speculatively finds and reads pages
1477  * in memory cache. There are three scenarios for fast read:
1478  *   - If the page exists and is uptodate, kernel VM will provide the data and
1479  *     CLIO won't be intervened;
1480  *   - If the page was brought into memory by read ahead, it will be exported
1481  *     and read ahead parameters will be updated;
1482  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1483  *     it will go back and invoke normal read, i.e., a cl_io will be created
1484  *     and DLM lock will be requested.
1485  *
1486  * POSIX compliance: posix standard states that read is intended to be atomic.
1487  * Lustre read implementation is in line with Linux kernel read implementation
1488  * and neither of them complies with POSIX standard in this matter. Fast read
1489  * doesn't make the situation worse on single node but it may interleave write
1490  * results from multiple nodes due to short read handling in ll_file_aio_read().
1491  *
1492  * \param env - lu_env
1493  * \param iocb - kiocb from kernel
1494  * \param iter - user space buffers where the data will be copied
1495  *
1496  * \retval - number of bytes have been read, or error code if error occurred.
1497  */
1498 static ssize_t
1499 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1500 {
1501         ssize_t result;
1502
1503         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1504                 return 0;
1505
1506         /* NB: we can't do direct IO for fast read because it will need a lock
1507          * to make IO engine happy. */
1508         if (iocb->ki_filp->f_flags & O_DIRECT)
1509                 return 0;
1510
1511         result = generic_file_read_iter(iocb, iter);
1512
1513         /* If the first page is not in cache, generic_file_aio_read() will be
1514          * returned with -ENODATA.
1515          * See corresponding code in ll_readpage(). */
1516         if (result == -ENODATA)
1517                 result = 0;
1518
1519         if (result > 0)
1520                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1521                                 LPROC_LL_READ_BYTES, result);
1522
1523         return result;
1524 }
1525
1526 /*
1527  * Read from a file (through the page cache).
1528  */
1529 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1530 {
1531         struct lu_env *env;
1532         struct vvp_io_args *args;
1533         ssize_t result;
1534         ssize_t rc2;
1535         __u16 refcheck;
1536
1537         result = ll_do_fast_read(iocb, to);
1538         if (result < 0 || iov_iter_count(to) == 0)
1539                 GOTO(out, result);
1540
1541         env = cl_env_get(&refcheck);
1542         if (IS_ERR(env))
1543                 return PTR_ERR(env);
1544
1545         args = ll_env_args(env, IO_NORMAL);
1546         args->u.normal.via_iter = to;
1547         args->u.normal.via_iocb = iocb;
1548
1549         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1550                                  &iocb->ki_pos, iov_iter_count(to));
1551         if (rc2 > 0)
1552                 result += rc2;
1553         else if (result == 0)
1554                 result = rc2;
1555
1556         cl_env_put(env, &refcheck);
1557 out:
1558         return result;
1559 }
1560
1561 /*
1562  * Write to a file (through the page cache).
1563  */
1564 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1565 {
1566         struct vvp_io_args *args;
1567         struct lu_env *env;
1568         ssize_t result;
1569         __u16 refcheck;
1570
1571         env = cl_env_get(&refcheck);
1572         if (IS_ERR(env))
1573                 return PTR_ERR(env);
1574
1575         args = ll_env_args(env, IO_NORMAL);
1576         args->u.normal.via_iter = from;
1577         args->u.normal.via_iocb = iocb;
1578
1579         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1580                                     &iocb->ki_pos, iov_iter_count(from));
1581         cl_env_put(env, &refcheck);
1582         return result;
1583 }
1584
1585 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1586 /*
1587  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1588  */
1589 static int ll_file_get_iov_count(const struct iovec *iov,
1590                                  unsigned long *nr_segs, size_t *count)
1591 {
1592         size_t cnt = 0;
1593         unsigned long seg;
1594
1595         for (seg = 0; seg < *nr_segs; seg++) {
1596                 const struct iovec *iv = &iov[seg];
1597
1598                 /*
1599                  * If any segment has a negative length, or the cumulative
1600                  * length ever wraps negative then return -EINVAL.
1601                  */
1602                 cnt += iv->iov_len;
1603                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1604                         return -EINVAL;
1605                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1606                         continue;
1607                 if (seg == 0)
1608                         return -EFAULT;
1609                 *nr_segs = seg;
1610                 cnt -= iv->iov_len;     /* This segment is no good */
1611                 break;
1612         }
1613         *count = cnt;
1614         return 0;
1615 }
1616
1617 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1618                                 unsigned long nr_segs, loff_t pos)
1619 {
1620         struct iov_iter to;
1621         size_t iov_count;
1622         ssize_t result;
1623         ENTRY;
1624
1625         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1626         if (result)
1627                 RETURN(result);
1628
1629 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1630         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1631 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1632         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1633 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1634
1635         result = ll_file_read_iter(iocb, &to);
1636
1637         RETURN(result);
1638 }
1639
1640 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1641                             loff_t *ppos)
1642 {
1643         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1644         struct kiocb   kiocb;
1645         ssize_t        result;
1646         ENTRY;
1647
1648         init_sync_kiocb(&kiocb, file);
1649         kiocb.ki_pos = *ppos;
1650 #ifdef HAVE_KIOCB_KI_LEFT
1651         kiocb.ki_left = count;
1652 #elif defined(HAVE_KI_NBYTES)
1653         kiocb.i_nbytes = count;
1654 #endif
1655
1656         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1657         *ppos = kiocb.ki_pos;
1658
1659         RETURN(result);
1660 }
1661
1662 /*
1663  * Write to a file (through the page cache).
1664  * AIO stuff
1665  */
1666 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1667                                  unsigned long nr_segs, loff_t pos)
1668 {
1669         struct iov_iter from;
1670         size_t iov_count;
1671         ssize_t result;
1672         ENTRY;
1673
1674         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1675         if (result)
1676                 RETURN(result);
1677
1678 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1679         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1680 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1681         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1682 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1683
1684         result = ll_file_write_iter(iocb, &from);
1685
1686         RETURN(result);
1687 }
1688
1689 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1690                              size_t count, loff_t *ppos)
1691 {
1692         struct lu_env *env;
1693         struct iovec   iov = { .iov_base = (void __user *)buf,
1694                                .iov_len = count };
1695         struct kiocb  *kiocb;
1696         ssize_t        result;
1697         __u16          refcheck;
1698         ENTRY;
1699
1700         env = cl_env_get(&refcheck);
1701         if (IS_ERR(env))
1702                 RETURN(PTR_ERR(env));
1703
1704         kiocb = &ll_env_info(env)->lti_kiocb;
1705         init_sync_kiocb(kiocb, file);
1706         kiocb->ki_pos = *ppos;
1707 #ifdef HAVE_KIOCB_KI_LEFT
1708         kiocb->ki_left = count;
1709 #elif defined(HAVE_KI_NBYTES)
1710         kiocb->ki_nbytes = count;
1711 #endif
1712
1713         result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1714         *ppos = kiocb->ki_pos;
1715
1716         cl_env_put(env, &refcheck);
1717         RETURN(result);
1718 }
1719 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1720
1721 /*
1722  * Send file content (through pagecache) somewhere with helper
1723  */
1724 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1725                                    struct pipe_inode_info *pipe, size_t count,
1726                                    unsigned int flags)
1727 {
1728         struct lu_env      *env;
1729         struct vvp_io_args *args;
1730         ssize_t             result;
1731         __u16               refcheck;
1732         ENTRY;
1733
1734         env = cl_env_get(&refcheck);
1735         if (IS_ERR(env))
1736                 RETURN(PTR_ERR(env));
1737
1738         args = ll_env_args(env, IO_SPLICE);
1739         args->u.splice.via_pipe = pipe;
1740         args->u.splice.via_flags = flags;
1741
1742         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1743         cl_env_put(env, &refcheck);
1744         RETURN(result);
1745 }
1746
1747 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1748                              __u64 flags, struct lov_user_md *lum, int lum_size)
1749 {
1750         struct lookup_intent oit = {
1751                 .it_op = IT_OPEN,
1752                 .it_flags = flags | MDS_OPEN_BY_FID,
1753         };
1754         int rc;
1755         ENTRY;
1756
1757         ll_inode_size_lock(inode);
1758         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1759         if (rc < 0)
1760                 GOTO(out_unlock, rc);
1761
1762         ll_release_openhandle(dentry, &oit);
1763
1764 out_unlock:
1765         ll_inode_size_unlock(inode);
1766         ll_intent_release(&oit);
1767
1768         RETURN(rc);
1769 }
1770
1771 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1772                              struct lov_mds_md **lmmp, int *lmm_size,
1773                              struct ptlrpc_request **request)
1774 {
1775         struct ll_sb_info *sbi = ll_i2sbi(inode);
1776         struct mdt_body  *body;
1777         struct lov_mds_md *lmm = NULL;
1778         struct ptlrpc_request *req = NULL;
1779         struct md_op_data *op_data;
1780         int rc, lmmsize;
1781
1782         rc = ll_get_default_mdsize(sbi, &lmmsize);
1783         if (rc)
1784                 RETURN(rc);
1785
1786         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1787                                      strlen(filename), lmmsize,
1788                                      LUSTRE_OPC_ANY, NULL);
1789         if (IS_ERR(op_data))
1790                 RETURN(PTR_ERR(op_data));
1791
1792         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1793         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1794         ll_finish_md_op_data(op_data);
1795         if (rc < 0) {
1796                 CDEBUG(D_INFO, "md_getattr_name failed "
1797                        "on %s: rc %d\n", filename, rc);
1798                 GOTO(out, rc);
1799         }
1800
1801         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1802         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1803
1804         lmmsize = body->mbo_eadatasize;
1805
1806         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1807                         lmmsize == 0) {
1808                 GOTO(out, rc = -ENODATA);
1809         }
1810
1811         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1812         LASSERT(lmm != NULL);
1813
1814         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1815             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1816             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1817                 GOTO(out, rc = -EPROTO);
1818
1819         /*
1820          * This is coming from the MDS, so is probably in
1821          * little endian.  We convert it to host endian before
1822          * passing it to userspace.
1823          */
1824         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1825                 int stripe_count;
1826
1827                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1828                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1829                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1830                         if (le32_to_cpu(lmm->lmm_pattern) &
1831                             LOV_PATTERN_F_RELEASED)
1832                                 stripe_count = 0;
1833                 }
1834
1835                 /* if function called for directory - we should
1836                  * avoid swab not existent lsm objects */
1837                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1838                         lustre_swab_lov_user_md_v1(
1839                                         (struct lov_user_md_v1 *)lmm);
1840                         if (S_ISREG(body->mbo_mode))
1841                                 lustre_swab_lov_user_md_objects(
1842                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1843                                     stripe_count);
1844                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1845                         lustre_swab_lov_user_md_v3(
1846                                         (struct lov_user_md_v3 *)lmm);
1847                         if (S_ISREG(body->mbo_mode))
1848                                 lustre_swab_lov_user_md_objects(
1849                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1850                                     stripe_count);
1851                 } else if (lmm->lmm_magic ==
1852                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1853                         lustre_swab_lov_comp_md_v1(
1854                                         (struct lov_comp_md_v1 *)lmm);
1855                 }
1856         }
1857
1858 out:
1859         *lmmp = lmm;
1860         *lmm_size = lmmsize;
1861         *request = req;
1862         return rc;
1863 }
1864
1865 static int ll_lov_setea(struct inode *inode, struct file *file,
1866                         void __user *arg)
1867 {
1868         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1869         struct lov_user_md      *lump;
1870         int                      lum_size = sizeof(struct lov_user_md) +
1871                                             sizeof(struct lov_user_ost_data);
1872         int                      rc;
1873         ENTRY;
1874
1875         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1876                 RETURN(-EPERM);
1877
1878         OBD_ALLOC_LARGE(lump, lum_size);
1879         if (lump == NULL)
1880                 RETURN(-ENOMEM);
1881
1882         if (copy_from_user(lump, arg, lum_size))
1883                 GOTO(out_lump, rc = -EFAULT);
1884
1885         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
1886                                       lum_size);
1887         cl_lov_delay_create_clear(&file->f_flags);
1888
1889 out_lump:
1890         OBD_FREE_LARGE(lump, lum_size);
1891         RETURN(rc);
1892 }
1893
1894 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1895 {
1896         struct lu_env   *env;
1897         __u16           refcheck;
1898         int             rc;
1899         ENTRY;
1900
1901         env = cl_env_get(&refcheck);
1902         if (IS_ERR(env))
1903                 RETURN(PTR_ERR(env));
1904
1905         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1906         cl_env_put(env, &refcheck);
1907         RETURN(rc);
1908 }
1909
1910 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1911                             void __user *arg)
1912 {
1913         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1914         struct lov_user_md        *klum;
1915         int                        lum_size, rc;
1916         __u64                      flags = FMODE_WRITE;
1917         ENTRY;
1918
1919         rc = ll_copy_user_md(lum, &klum);
1920         if (rc < 0)
1921                 RETURN(rc);
1922
1923         lum_size = rc;
1924         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
1925                                       lum_size);
1926         if (!rc) {
1927                 __u32 gen;
1928
1929                 rc = put_user(0, &lum->lmm_stripe_count);
1930                 if (rc)
1931                         GOTO(out, rc);
1932
1933                 rc = ll_layout_refresh(inode, &gen);
1934                 if (rc)
1935                         GOTO(out, rc);
1936
1937                 rc = ll_file_getstripe(inode, arg, lum_size);
1938         }
1939         cl_lov_delay_create_clear(&file->f_flags);
1940
1941 out:
1942         OBD_FREE(klum, lum_size);
1943         RETURN(rc);
1944 }
1945
1946 static int
1947 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1948 {
1949         struct ll_inode_info *lli = ll_i2info(inode);
1950         struct cl_object *obj = lli->lli_clob;
1951         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1952         struct ll_grouplock grouplock;
1953         int rc;
1954         ENTRY;
1955
1956         if (arg == 0) {
1957                 CWARN("group id for group lock must not be 0\n");
1958                 RETURN(-EINVAL);
1959         }
1960
1961         if (ll_file_nolock(file))
1962                 RETURN(-EOPNOTSUPP);
1963
1964         spin_lock(&lli->lli_lock);
1965         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1966                 CWARN("group lock already existed with gid %lu\n",
1967                       fd->fd_grouplock.lg_gid);
1968                 spin_unlock(&lli->lli_lock);
1969                 RETURN(-EINVAL);
1970         }
1971         LASSERT(fd->fd_grouplock.lg_lock == NULL);
1972         spin_unlock(&lli->lli_lock);
1973
1974         /**
1975          * XXX: group lock needs to protect all OST objects while PFL
1976          * can add new OST objects during the IO, so we'd instantiate
1977          * all OST objects before getting its group lock.
1978          */
1979         if (obj) {
1980                 struct lu_env *env;
1981                 __u16 refcheck;
1982                 struct cl_layout cl = {
1983                         .cl_is_composite = false,
1984                 };
1985                 struct lu_extent ext = {
1986                         .e_start = 0,
1987                         .e_end = OBD_OBJECT_EOF,
1988                 };
1989
1990                 env = cl_env_get(&refcheck);
1991                 if (IS_ERR(env))
1992                         RETURN(PTR_ERR(env));
1993
1994                 rc = cl_object_layout_get(env, obj, &cl);
1995                 if (!rc && cl.cl_is_composite)
1996                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
1997                                                     &ext);
1998
1999                 cl_env_put(env, &refcheck);
2000                 if (rc)
2001                         RETURN(rc);
2002         }
2003
2004         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2005                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2006         if (rc)
2007                 RETURN(rc);
2008
2009         spin_lock(&lli->lli_lock);
2010         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2011                 spin_unlock(&lli->lli_lock);
2012                 CERROR("another thread just won the race\n");
2013                 cl_put_grouplock(&grouplock);
2014                 RETURN(-EINVAL);
2015         }
2016
2017         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2018         fd->fd_grouplock = grouplock;
2019         spin_unlock(&lli->lli_lock);
2020
2021         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2022         RETURN(0);
2023 }
2024
2025 static int ll_put_grouplock(struct inode *inode, struct file *file,
2026                             unsigned long arg)
2027 {
2028         struct ll_inode_info   *lli = ll_i2info(inode);
2029         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2030         struct ll_grouplock     grouplock;
2031         ENTRY;
2032
2033         spin_lock(&lli->lli_lock);
2034         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2035                 spin_unlock(&lli->lli_lock);
2036                 CWARN("no group lock held\n");
2037                 RETURN(-EINVAL);
2038         }
2039
2040         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2041
2042         if (fd->fd_grouplock.lg_gid != arg) {
2043                 CWARN("group lock %lu doesn't match current id %lu\n",
2044                       arg, fd->fd_grouplock.lg_gid);
2045                 spin_unlock(&lli->lli_lock);
2046                 RETURN(-EINVAL);
2047         }
2048
2049         grouplock = fd->fd_grouplock;
2050         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2051         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2052         spin_unlock(&lli->lli_lock);
2053
2054         cl_put_grouplock(&grouplock);
2055         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2056         RETURN(0);
2057 }
2058
2059 /**
2060  * Close inode open handle
2061  *
2062  * \param dentry [in]     dentry which contains the inode
2063  * \param it     [in,out] intent which contains open info and result
2064  *
2065  * \retval 0     success
2066  * \retval <0    failure
2067  */
2068 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2069 {
2070         struct inode *inode = dentry->d_inode;
2071         struct obd_client_handle *och;
2072         int rc;
2073         ENTRY;
2074
2075         LASSERT(inode);
2076
2077         /* Root ? Do nothing. */
2078         if (dentry->d_inode->i_sb->s_root == dentry)
2079                 RETURN(0);
2080
2081         /* No open handle to close? Move away */
2082         if (!it_disposition(it, DISP_OPEN_OPEN))
2083                 RETURN(0);
2084
2085         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2086
2087         OBD_ALLOC(och, sizeof(*och));
2088         if (!och)
2089                 GOTO(out, rc = -ENOMEM);
2090
2091         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2092
2093         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2094 out:
2095         /* this one is in place of ll_file_open */
2096         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2097                 ptlrpc_req_finished(it->it_request);
2098                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2099         }
2100         RETURN(rc);
2101 }
2102
2103 /**
2104  * Get size for inode for which FIEMAP mapping is requested.
2105  * Make the FIEMAP get_info call and returns the result.
2106  * \param fiemap        kernel buffer to hold extens
2107  * \param num_bytes     kernel buffer size
2108  */
2109 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2110                         size_t num_bytes)
2111 {
2112         struct lu_env                   *env;
2113         __u16                           refcheck;
2114         int                             rc = 0;
2115         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2116         ENTRY;
2117
2118         /* Checks for fiemap flags */
2119         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2120                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2121                 return -EBADR;
2122         }
2123
2124         /* Check for FIEMAP_FLAG_SYNC */
2125         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2126                 rc = filemap_fdatawrite(inode->i_mapping);
2127                 if (rc)
2128                         return rc;
2129         }
2130
2131         env = cl_env_get(&refcheck);
2132         if (IS_ERR(env))
2133                 RETURN(PTR_ERR(env));
2134
2135         if (i_size_read(inode) == 0) {
2136                 rc = ll_glimpse_size(inode);
2137                 if (rc)
2138                         GOTO(out, rc);
2139         }
2140
2141         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2142         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2143         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2144
2145         /* If filesize is 0, then there would be no objects for mapping */
2146         if (fmkey.lfik_oa.o_size == 0) {
2147                 fiemap->fm_mapped_extents = 0;
2148                 GOTO(out, rc = 0);
2149         }
2150
2151         fmkey.lfik_fiemap = *fiemap;
2152
2153         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2154                               &fmkey, fiemap, &num_bytes);
2155 out:
2156         cl_env_put(env, &refcheck);
2157         RETURN(rc);
2158 }
2159
2160 int ll_fid2path(struct inode *inode, void __user *arg)
2161 {
2162         struct obd_export       *exp = ll_i2mdexp(inode);
2163         const struct getinfo_fid2path __user *gfin = arg;
2164         __u32                    pathlen;
2165         struct getinfo_fid2path *gfout;
2166         size_t                   outsize;
2167         int                      rc;
2168
2169         ENTRY;
2170
2171         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2172             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2173                 RETURN(-EPERM);
2174
2175         /* Only need to get the buflen */
2176         if (get_user(pathlen, &gfin->gf_pathlen))
2177                 RETURN(-EFAULT);
2178
2179         if (pathlen > PATH_MAX)
2180                 RETURN(-EINVAL);
2181
2182         outsize = sizeof(*gfout) + pathlen;
2183         OBD_ALLOC(gfout, outsize);
2184         if (gfout == NULL)
2185                 RETURN(-ENOMEM);
2186
2187         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2188                 GOTO(gf_free, rc = -EFAULT);
2189         /* append root FID after gfout to let MDT know the root FID so that it
2190          * can lookup the correct path, this is mainly for fileset.
2191          * old server without fileset mount support will ignore this. */
2192         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2193
2194         /* Call mdc_iocontrol */
2195         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2196         if (rc != 0)
2197                 GOTO(gf_free, rc);
2198
2199         if (copy_to_user(arg, gfout, outsize))
2200                 rc = -EFAULT;
2201
2202 gf_free:
2203         OBD_FREE(gfout, outsize);
2204         RETURN(rc);
2205 }
2206
2207 static int
2208 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2209 {
2210         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2211         struct lu_env *env;
2212         struct cl_io *io;
2213         __u16  refcheck;
2214         int result;
2215
2216         ENTRY;
2217
2218         ioc->idv_version = 0;
2219         ioc->idv_layout_version = UINT_MAX;
2220
2221         /* If no file object initialized, we consider its version is 0. */
2222         if (obj == NULL)
2223                 RETURN(0);
2224
2225         env = cl_env_get(&refcheck);
2226         if (IS_ERR(env))
2227                 RETURN(PTR_ERR(env));
2228
2229         io = vvp_env_thread_io(env);
2230         io->ci_obj = obj;
2231         io->u.ci_data_version.dv_data_version = 0;
2232         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2233         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2234
2235 restart:
2236         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2237                 result = cl_io_loop(env, io);
2238         else
2239                 result = io->ci_result;
2240
2241         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2242         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2243
2244         cl_io_fini(env, io);
2245
2246         if (unlikely(io->ci_need_restart))
2247                 goto restart;
2248
2249         cl_env_put(env, &refcheck);
2250
2251         RETURN(result);
2252 }
2253
2254 /*
2255  * Read the data_version for inode.
2256  *
2257  * This value is computed using stripe object version on OST.
2258  * Version is computed using server side locking.
2259  *
2260  * @param flags if do sync on the OST side;
2261  *              0: no sync
2262  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2263  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2264  */
2265 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2266 {
2267         struct ioc_data_version ioc = { .idv_flags = flags };
2268         int rc;
2269
2270         rc = ll_ioc_data_version(inode, &ioc);
2271         if (!rc)
2272                 *data_version = ioc.idv_version;
2273
2274         return rc;
2275 }
2276
2277 /*
2278  * Trigger a HSM release request for the provided inode.
2279  */
2280 int ll_hsm_release(struct inode *inode)
2281 {
2282         struct lu_env *env;
2283         struct obd_client_handle *och = NULL;
2284         __u64 data_version = 0;
2285         int rc;
2286         __u16 refcheck;
2287         ENTRY;
2288
2289         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2290                ll_get_fsname(inode->i_sb, NULL, 0),
2291                PFID(&ll_i2info(inode)->lli_fid));
2292
2293         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2294         if (IS_ERR(och))
2295                 GOTO(out, rc = PTR_ERR(och));
2296
2297         /* Grab latest data_version and [am]time values */
2298         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2299         if (rc != 0)
2300                 GOTO(out, rc);
2301
2302         env = cl_env_get(&refcheck);
2303         if (IS_ERR(env))
2304                 GOTO(out, rc = PTR_ERR(env));
2305
2306         ll_merge_attr(env, inode);
2307         cl_env_put(env, &refcheck);
2308
2309         /* Release the file.
2310          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2311          * we still need it to pack l_remote_handle to MDT. */
2312         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2313                                        &data_version);
2314         och = NULL;
2315
2316         EXIT;
2317 out:
2318         if (och != NULL && !IS_ERR(och)) /* close the file */
2319                 ll_lease_close(och, inode, NULL);
2320
2321         return rc;
2322 }
2323
2324 struct ll_swap_stack {
2325         __u64                    dv1;
2326         __u64                    dv2;
2327         struct inode            *inode1;
2328         struct inode            *inode2;
2329         bool                     check_dv1;
2330         bool                     check_dv2;
2331 };
2332
2333 static int ll_swap_layouts(struct file *file1, struct file *file2,
2334                            struct lustre_swap_layouts *lsl)
2335 {
2336         struct mdc_swap_layouts  msl;
2337         struct md_op_data       *op_data;
2338         __u32                    gid;
2339         __u64                    dv;
2340         struct ll_swap_stack    *llss = NULL;
2341         int                      rc;
2342
2343         OBD_ALLOC_PTR(llss);
2344         if (llss == NULL)
2345                 RETURN(-ENOMEM);
2346
2347         llss->inode1 = file_inode(file1);
2348         llss->inode2 = file_inode(file2);
2349
2350         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2351         if (rc < 0)
2352                 GOTO(free, rc);
2353
2354         /* we use 2 bool because it is easier to swap than 2 bits */
2355         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2356                 llss->check_dv1 = true;
2357
2358         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2359                 llss->check_dv2 = true;
2360
2361         /* we cannot use lsl->sl_dvX directly because we may swap them */
2362         llss->dv1 = lsl->sl_dv1;
2363         llss->dv2 = lsl->sl_dv2;
2364
2365         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2366         if (rc == 0) /* same file, done! */
2367                 GOTO(free, rc);
2368
2369         if (rc < 0) { /* sequentialize it */
2370                 swap(llss->inode1, llss->inode2);
2371                 swap(file1, file2);
2372                 swap(llss->dv1, llss->dv2);
2373                 swap(llss->check_dv1, llss->check_dv2);
2374         }
2375
2376         gid = lsl->sl_gid;
2377         if (gid != 0) { /* application asks to flush dirty cache */
2378                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2379                 if (rc < 0)
2380                         GOTO(free, rc);
2381
2382                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2383                 if (rc < 0) {
2384                         ll_put_grouplock(llss->inode1, file1, gid);
2385                         GOTO(free, rc);
2386                 }
2387         }
2388
2389         /* ultimate check, before swaping the layouts we check if
2390          * dataversion has changed (if requested) */
2391         if (llss->check_dv1) {
2392                 rc = ll_data_version(llss->inode1, &dv, 0);
2393                 if (rc)
2394                         GOTO(putgl, rc);
2395                 if (dv != llss->dv1)
2396                         GOTO(putgl, rc = -EAGAIN);
2397         }
2398
2399         if (llss->check_dv2) {
2400                 rc = ll_data_version(llss->inode2, &dv, 0);
2401                 if (rc)
2402                         GOTO(putgl, rc);
2403                 if (dv != llss->dv2)
2404                         GOTO(putgl, rc = -EAGAIN);
2405         }
2406
2407         /* struct md_op_data is used to send the swap args to the mdt
2408          * only flags is missing, so we use struct mdc_swap_layouts
2409          * through the md_op_data->op_data */
2410         /* flags from user space have to be converted before they are send to
2411          * server, no flag is sent today, they are only used on the client */
2412         msl.msl_flags = 0;
2413         rc = -ENOMEM;
2414         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2415                                      0, LUSTRE_OPC_ANY, &msl);
2416         if (IS_ERR(op_data))
2417                 GOTO(free, rc = PTR_ERR(op_data));
2418
2419         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2420                            sizeof(*op_data), op_data, NULL);
2421         ll_finish_md_op_data(op_data);
2422
2423         if (rc < 0)
2424                 GOTO(putgl, rc);
2425
2426 putgl:
2427         if (gid != 0) {
2428                 ll_put_grouplock(llss->inode2, file2, gid);
2429                 ll_put_grouplock(llss->inode1, file1, gid);
2430         }
2431
2432 free:
2433         if (llss != NULL)
2434                 OBD_FREE_PTR(llss);
2435
2436         RETURN(rc);
2437 }
2438
2439 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2440 {
2441         struct md_op_data       *op_data;
2442         int                      rc;
2443         ENTRY;
2444
2445         /* Detect out-of range masks */
2446         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2447                 RETURN(-EINVAL);
2448
2449         /* Non-root users are forbidden to set or clear flags which are
2450          * NOT defined in HSM_USER_MASK. */
2451         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2452             !cfs_capable(CFS_CAP_SYS_ADMIN))
2453                 RETURN(-EPERM);
2454
2455         /* Detect out-of range archive id */
2456         if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2457             (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2458                 RETURN(-EINVAL);
2459
2460         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2461                                      LUSTRE_OPC_ANY, hss);
2462         if (IS_ERR(op_data))
2463                 RETURN(PTR_ERR(op_data));
2464
2465         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2466                            sizeof(*op_data), op_data, NULL);
2467
2468         ll_finish_md_op_data(op_data);
2469
2470         RETURN(rc);
2471 }
2472
2473 static int ll_hsm_import(struct inode *inode, struct file *file,
2474                          struct hsm_user_import *hui)
2475 {
2476         struct hsm_state_set    *hss = NULL;
2477         struct iattr            *attr = NULL;
2478         int                      rc;
2479         ENTRY;
2480
2481         if (!S_ISREG(inode->i_mode))
2482                 RETURN(-EINVAL);
2483
2484         /* set HSM flags */
2485         OBD_ALLOC_PTR(hss);
2486         if (hss == NULL)
2487                 GOTO(out, rc = -ENOMEM);
2488
2489         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2490         hss->hss_archive_id = hui->hui_archive_id;
2491         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2492         rc = ll_hsm_state_set(inode, hss);
2493         if (rc != 0)
2494                 GOTO(out, rc);
2495
2496         OBD_ALLOC_PTR(attr);
2497         if (attr == NULL)
2498                 GOTO(out, rc = -ENOMEM);
2499
2500         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2501         attr->ia_mode |= S_IFREG;
2502         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2503         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2504         attr->ia_size = hui->hui_size;
2505         attr->ia_mtime.tv_sec = hui->hui_mtime;
2506         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2507         attr->ia_atime.tv_sec = hui->hui_atime;
2508         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2509
2510         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2511                          ATTR_UID | ATTR_GID |
2512                          ATTR_MTIME | ATTR_MTIME_SET |
2513                          ATTR_ATIME | ATTR_ATIME_SET;
2514
2515         inode_lock(inode);
2516
2517         rc = ll_setattr_raw(file_dentry(file), attr, true);
2518         if (rc == -ENODATA)
2519                 rc = 0;
2520
2521         inode_unlock(inode);
2522
2523 out:
2524         if (hss != NULL)
2525                 OBD_FREE_PTR(hss);
2526
2527         if (attr != NULL)
2528                 OBD_FREE_PTR(attr);
2529
2530         RETURN(rc);
2531 }
2532
2533 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2534 {
2535         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2536                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2537 }
2538
2539 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2540 {
2541         struct inode *inode = file_inode(file);
2542         struct iattr ia = {
2543                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2544                             ATTR_MTIME | ATTR_MTIME_SET |
2545                             ATTR_CTIME | ATTR_CTIME_SET,
2546                 .ia_atime = {
2547                         .tv_sec = lfu->lfu_atime_sec,
2548                         .tv_nsec = lfu->lfu_atime_nsec,
2549                 },
2550                 .ia_mtime = {
2551                         .tv_sec = lfu->lfu_mtime_sec,
2552                         .tv_nsec = lfu->lfu_mtime_nsec,
2553                 },
2554                 .ia_ctime = {
2555                         .tv_sec = lfu->lfu_ctime_sec,
2556                         .tv_nsec = lfu->lfu_ctime_nsec,
2557                 },
2558         };
2559         int rc;
2560         ENTRY;
2561
2562         if (!capable(CAP_SYS_ADMIN))
2563                 RETURN(-EPERM);
2564
2565         if (!S_ISREG(inode->i_mode))
2566                 RETURN(-EINVAL);
2567
2568         inode_lock(inode);
2569         rc = ll_setattr_raw(file_dentry(file), &ia, false);
2570         inode_unlock(inode);
2571
2572         RETURN(rc);
2573 }
2574
2575 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2576 {
2577         switch (mode) {
2578         case MODE_READ_USER:
2579                 return CLM_READ;
2580         case MODE_WRITE_USER:
2581                 return CLM_WRITE;
2582         default:
2583                 return -EINVAL;
2584         }
2585 }
2586
/* Printable names of the userspace lock modes, taken from LOCK_MODE_NAMES.
 * Indexed by the lla_lockahead_mode value when building debug messages. */
static const char *const user_lockname[] = LOCK_MODE_NAMES;
2588
2589 /* Used to allow the upper layers of the client to request an LDLM lock
2590  * without doing an actual read or write.
2591  *
2592  * Used for ladvise lockahead to manually request specific locks.
2593  *
2594  * \param[in] file      file this ladvise lock request is on
2595  * \param[in] ladvise   ladvise struct describing this lock request
2596  *
2597  * \retval 0            success, no detailed result available (sync requests
2598  *                      and requests sent to the server [not handled locally]
2599  *                      cannot return detailed results)
2600  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2601  *                                       see definitions for details.
2602  * \retval negative     negative errno on error
2603  */
2604 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2605 {
2606         struct lu_env *env = NULL;
2607         struct cl_io *io  = NULL;
2608         struct cl_lock *lock = NULL;
2609         struct cl_lock_descr *descr = NULL;
2610         struct dentry *dentry = file->f_path.dentry;
2611         struct inode *inode = dentry->d_inode;
2612         enum cl_lock_mode cl_mode;
2613         off_t start = ladvise->lla_start;
2614         off_t end = ladvise->lla_end;
2615         int result;
2616         __u16 refcheck;
2617
2618         ENTRY;
2619
2620         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2621                "start=%llu, end=%llu\n", dentry->d_name.len,
2622                dentry->d_name.name, dentry->d_inode,
2623                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2624                (__u64) end);
2625
2626         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2627         if (cl_mode < 0)
2628                 GOTO(out, result = cl_mode);
2629
2630         /* Get IO environment */
2631         result = cl_io_get(inode, &env, &io, &refcheck);
2632         if (result <= 0)
2633                 GOTO(out, result);
2634
2635         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2636         if (result > 0) {
2637                 /*
2638                  * nothing to do for this io. This currently happens when
2639                  * stripe sub-object's are not yet created.
2640                  */
2641                 result = io->ci_result;
2642         } else if (result == 0) {
2643                 lock = vvp_env_lock(env);
2644                 descr = &lock->cll_descr;
2645
2646                 descr->cld_obj   = io->ci_obj;
2647                 /* Convert byte offsets to pages */
2648                 descr->cld_start = cl_index(io->ci_obj, start);
2649                 descr->cld_end   = cl_index(io->ci_obj, end);
2650                 descr->cld_mode  = cl_mode;
2651                 /* CEF_MUST is used because we do not want to convert a
2652                  * lockahead request to a lockless lock */
2653                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2654                                        CEF_NONBLOCK;
2655
2656                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2657                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2658
2659                 result = cl_lock_request(env, io, lock);
2660
2661                 /* On success, we need to release the lock */
2662                 if (result >= 0)
2663                         cl_lock_release(env, lock);
2664         }
2665         cl_io_fini(env, io);
2666         cl_env_put(env, &refcheck);
2667
2668         /* -ECANCELED indicates a matching lock with a different extent
2669          * was already present, and -EEXIST indicates a matching lock
2670          * on exactly the same extent was already present.
2671          * We convert them to positive values for userspace to make
2672          * recognizing true errors easier.
2673          * Note we can only return these detailed results on async requests,
2674          * as sync requests look the same as i/o requests for locking. */
2675         if (result == -ECANCELED)
2676                 result = LLA_RESULT_DIFFERENT;
2677         else if (result == -EEXIST)
2678                 result = LLA_RESULT_SAME;
2679
2680 out:
2681         RETURN(result);
2682 }
/* Printable names of the ladvise advice types, taken from LU_LADVISE_NAMES.
 * Indexed by the advice value when building debug messages. */
static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2684
2685 static int ll_ladvise_sanity(struct inode *inode,
2686                              struct llapi_lu_ladvise *ladvise)
2687 {
2688         enum lu_ladvise_type advice = ladvise->lla_advice;
2689         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2690          * be in the first 32 bits of enum ladvise_flags */
2691         __u32 flags = ladvise->lla_peradvice_flags;
2692         /* 3 lines at 80 characters per line, should be plenty */
2693         int rc = 0;
2694
2695         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2696                 rc = -EINVAL;
2697                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2698                        "last supported advice is %s (value '%d'): rc = %d\n",
2699                        ll_get_fsname(inode->i_sb, NULL, 0), advice,
2700                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2701                 GOTO(out, rc);
2702         }
2703
2704         /* Per-advice checks */
2705         switch (advice) {
2706         case LU_LADVISE_LOCKNOEXPAND:
2707                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2708                         rc = -EINVAL;
2709                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2710                                "rc = %d\n",
2711                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2712                                ladvise_names[advice], rc);
2713                         GOTO(out, rc);
2714                 }
2715                 break;
2716         case LU_LADVISE_LOCKAHEAD:
2717                 /* Currently only READ and WRITE modes can be requested */
2718                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2719                     ladvise->lla_lockahead_mode == 0) {
2720                         rc = -EINVAL;
2721                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2722                                "rc = %d\n",
2723                                ll_get_fsname(inode->i_sb, NULL, 0),
2724                                ladvise->lla_lockahead_mode,
2725                                ladvise_names[advice], rc);
2726                         GOTO(out, rc);
2727                 }
2728         case LU_LADVISE_WILLREAD:
2729         case LU_LADVISE_DONTNEED:
2730         default:
2731                 /* Note fall through above - These checks apply to all advices
2732                  * except LOCKNOEXPAND */
2733                 if (flags & ~LF_DEFAULT_MASK) {
2734                         rc = -EINVAL;
2735                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2736                                "rc = %d\n",
2737                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2738                                ladvise_names[advice], rc);
2739                         GOTO(out, rc);
2740                 }
2741                 if (ladvise->lla_start >= ladvise->lla_end) {
2742                         rc = -EINVAL;
2743                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2744                                "for %s: rc = %d\n",
2745                                ll_get_fsname(inode->i_sb, NULL, 0),
2746                                ladvise->lla_start, ladvise->lla_end,
2747                                ladvise_names[advice], rc);
2748                         GOTO(out, rc);
2749                 }
2750                 break;
2751         }
2752
2753 out:
2754         return rc;
2755 }
2756 #undef ERRSIZE
2757
2758 /*
2759  * Give file access advices
2760  *
2761  * The ladvise interface is similar to Linux fadvise() system call, except it
2762  * forwards the advices directly from Lustre client to server. The server side
2763  * codes will apply appropriate read-ahead and caching techniques for the
2764  * corresponding files.
2765  *
2766  * A typical workload for ladvise is e.g. a bunch of different clients are
2767  * doing small random reads of a file, so prefetching pages into OSS cache
2768  * with big linear reads before the random IO is a net benefit. Fetching
2769  * all that data into each client cache with fadvise() may not be, due to
2770  * much more data being sent to the client.
2771  */
2772 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2773                       struct llapi_lu_ladvise *ladvise)
2774 {
2775         struct lu_env *env;
2776         struct cl_io *io;
2777         struct cl_ladvise_io *lio;
2778         int rc;
2779         __u16 refcheck;
2780         ENTRY;
2781
2782         env = cl_env_get(&refcheck);
2783         if (IS_ERR(env))
2784                 RETURN(PTR_ERR(env));
2785
2786         io = vvp_env_thread_io(env);
2787         io->ci_obj = ll_i2info(inode)->lli_clob;
2788
2789         /* initialize parameters for ladvise */
2790         lio = &io->u.ci_ladvise;
2791         lio->li_start = ladvise->lla_start;
2792         lio->li_end = ladvise->lla_end;
2793         lio->li_fid = ll_inode2fid(inode);
2794         lio->li_advice = ladvise->lla_advice;
2795         lio->li_flags = flags;
2796
2797         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2798                 rc = cl_io_loop(env, io);
2799         else
2800                 rc = io->ci_result;
2801
2802         cl_io_fini(env, io);
2803         cl_env_put(env, &refcheck);
2804         RETURN(rc);
2805 }
2806
2807 static int ll_lock_noexpand(struct file *file, int flags)
2808 {
2809         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2810
2811         fd->ll_lock_no_expand = !(flags & LF_UNSET);
2812
2813         return 0;
2814 }
2815
2816 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2817                         unsigned long arg)
2818 {
2819         struct fsxattr fsxattr;
2820
2821         if (copy_from_user(&fsxattr,
2822                            (const struct fsxattr __user *)arg,
2823                            sizeof(fsxattr)))
2824                 RETURN(-EFAULT);
2825
2826         fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2827         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2828         if (copy_to_user((struct fsxattr __user *)arg,
2829                          &fsxattr, sizeof(fsxattr)))
2830                 RETURN(-EFAULT);
2831
2832         RETURN(0);
2833 }
2834
2835 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2836                         unsigned long arg)
2837 {
2838
2839         struct md_op_data *op_data;
2840         struct ptlrpc_request *req = NULL;
2841         int rc = 0;
2842         struct fsxattr fsxattr;
2843         struct cl_object *obj;
2844
2845         /* only root could change project ID */
2846         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2847                 RETURN(-EPERM);
2848
2849         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2850                                      LUSTRE_OPC_ANY, NULL);
2851         if (IS_ERR(op_data))
2852                 RETURN(PTR_ERR(op_data));
2853
2854         if (copy_from_user(&fsxattr,
2855                            (const struct fsxattr __user *)arg,
2856                            sizeof(fsxattr)))
2857                 GOTO(out_fsxattr1, rc = -EFAULT);
2858
2859         op_data->op_attr_flags = fsxattr.fsx_xflags;
2860         op_data->op_projid = fsxattr.fsx_projid;
2861         op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2862         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2863                         0, &req);
2864         ptlrpc_req_finished(req);
2865
2866         obj = ll_i2info(inode)->lli_clob;
2867         if (obj) {
2868                 struct iattr *attr;
2869
2870                 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2871                 OBD_ALLOC_PTR(attr);
2872                 if (attr == NULL)
2873                         GOTO(out_fsxattr1, rc = -ENOMEM);
2874                 attr->ia_valid = ATTR_ATTR_FLAG;
2875                 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2876
2877                 OBD_FREE_PTR(attr);
2878         }
2879 out_fsxattr1:
2880         ll_finish_md_op_data(op_data);
2881         RETURN(rc);
2882 }
2883
2884 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
2885                                  unsigned long arg)
2886 {
2887         struct inode            *inode = file_inode(file);
2888         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
2889         struct ll_inode_info    *lli = ll_i2info(inode);
2890         struct obd_client_handle *och = NULL;
2891         bool lease_broken;
2892         fmode_t fmode = 0;
2893         enum mds_op_bias bias = 0;
2894         void *data = NULL;
2895         size_t data_size = 0;
2896         long rc;
2897         ENTRY;
2898
2899         mutex_lock(&lli->lli_och_mutex);
2900         if (fd->fd_lease_och != NULL) {
2901                 och = fd->fd_lease_och;
2902                 fd->fd_lease_och = NULL;
2903         }
2904         mutex_unlock(&lli->lli_och_mutex);
2905
2906         if (och == NULL)
2907                 GOTO(out, rc = -ENOLCK);
2908
2909         fmode = och->och_flags;
2910
2911         if (ioc->lil_flags & LL_LEASE_RESYNC_DONE) {
2912                 if (ioc->lil_count > IOC_IDS_MAX)
2913                         GOTO(out, rc = -EINVAL);
2914
2915                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
2916                 OBD_ALLOC(data, data_size);
2917                 if (!data)
2918                         GOTO(out, rc = -ENOMEM);
2919
2920                 if (copy_from_user(data, (void __user *)arg, data_size))
2921                         GOTO(out, rc = -EFAULT);
2922
2923                 bias = MDS_CLOSE_RESYNC_DONE;
2924         }
2925
2926         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
2927         if (rc < 0)
2928                 GOTO(out, rc);
2929
2930         rc = ll_lease_och_release(inode, file);
2931         if (rc < 0)
2932                 GOTO(out, rc);
2933
2934         if (lease_broken)
2935                 fmode = 0;
2936         EXIT;
2937
2938 out:
2939         if (data)
2940                 OBD_FREE(data, data_size);
2941         if (!rc)
2942                 rc = ll_lease_type_from_fmode(fmode);
2943         RETURN(rc);
2944 }
2945
2946 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
2947                               unsigned long arg)
2948 {
2949         struct inode *inode = file_inode(file);
2950         struct ll_inode_info *lli = ll_i2info(inode);
2951         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2952         struct obd_client_handle *och = NULL;
2953         __u64 open_flags = 0;
2954         bool lease_broken;
2955         fmode_t fmode;
2956         long rc;
2957         ENTRY;
2958
2959         switch (ioc->lil_mode) {
2960         case LL_LEASE_WRLCK:
2961                 if (!(file->f_mode & FMODE_WRITE))
2962                         RETURN(-EPERM);
2963                 fmode = FMODE_WRITE;
2964                 break;
2965         case LL_LEASE_RDLCK:
2966                 if (!(file->f_mode & FMODE_READ))
2967                         RETURN(-EPERM);
2968                 fmode = FMODE_READ;
2969                 break;
2970         case LL_LEASE_UNLCK:
2971                 RETURN(ll_file_unlock_lease(file, ioc, arg));
2972         default:
2973                 RETURN(-EINVAL);
2974         }
2975
2976         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2977
2978         /* apply for lease */
2979         if (ioc->lil_flags & LL_LEASE_RESYNC)
2980                 open_flags = MDS_OPEN_RESYNC;
2981         och = ll_lease_open(inode, file, fmode, open_flags);
2982         if (IS_ERR(och))
2983                 RETURN(PTR_ERR(och));
2984
2985         if (ioc->lil_flags & LL_LEASE_RESYNC) {
2986                 rc = ll_lease_file_resync(och, inode);
2987                 if (rc) {
2988                         ll_lease_close(och, inode, NULL);
2989                         RETURN(rc);
2990                 }
2991                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
2992                 if (rc) {
2993                         ll_lease_close(och, inode, NULL);
2994                         RETURN(rc);
2995                 }
2996         }
2997
2998         rc = 0;
2999         mutex_lock(&lli->lli_och_mutex);
3000         if (fd->fd_lease_och == NULL) {
3001                 fd->fd_lease_och = och;
3002                 och = NULL;
3003         }
3004         mutex_unlock(&lli->lli_och_mutex);
3005         if (och != NULL) {
3006                 /* impossible now that only excl is supported for now */
3007                 ll_lease_close(och, inode, &lease_broken);
3008                 rc = -EBUSY;
3009         }
3010         RETURN(rc);
3011 }
3012
3013 static long
3014 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3015 {
3016         struct inode            *inode = file_inode(file);
3017         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3018         int                      flags, rc;
3019         ENTRY;
3020
3021         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3022                PFID(ll_inode2fid(inode)), inode, cmd);
3023         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3024
3025         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3026         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3027                 RETURN(-ENOTTY);
3028
3029         switch(cmd) {
3030         case LL_IOC_GETFLAGS:
3031                 /* Get the current value of the file flags */
3032                 return put_user(fd->fd_flags, (int __user *)arg);
3033         case LL_IOC_SETFLAGS:
3034         case LL_IOC_CLRFLAGS:
3035                 /* Set or clear specific file flags */
3036                 /* XXX This probably needs checks to ensure the flags are
3037                  *     not abused, and to handle any flag side effects.
3038                  */
3039                 if (get_user(flags, (int __user *) arg))
3040                         RETURN(-EFAULT);
3041
3042                 if (cmd == LL_IOC_SETFLAGS) {
3043                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3044                             !(file->f_flags & O_DIRECT)) {
3045                                 CERROR("%s: unable to disable locking on "
3046                                        "non-O_DIRECT file\n", current->comm);
3047                                 RETURN(-EINVAL);
3048                         }
3049
3050                         fd->fd_flags |= flags;
3051                 } else {
3052                         fd->fd_flags &= ~flags;
3053                 }
3054                 RETURN(0);
3055         case LL_IOC_LOV_SETSTRIPE:
3056         case LL_IOC_LOV_SETSTRIPE_NEW:
3057                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3058         case LL_IOC_LOV_SETEA:
3059                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3060         case LL_IOC_LOV_SWAP_LAYOUTS: {
3061                 struct file *file2;
3062                 struct lustre_swap_layouts lsl;
3063                 __u64 intent;
3064
3065                 if (copy_from_user(&lsl, (char __user *)arg,
3066                                    sizeof(struct lustre_swap_layouts)))
3067                         RETURN(-EFAULT);
3068
3069                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3070                         RETURN(-EPERM);
3071
3072                 file2 = fget(lsl.sl_fd);
3073                 if (file2 == NULL)
3074                         RETURN(-EBADF);
3075
3076                 /* O_WRONLY or O_RDWR */
3077                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3078                         GOTO(out, rc = -EPERM);
3079
3080                 intent = lsl.sl_flags & INTENT_LAYOUTS_CLOSE;
3081                 if (intent) {
3082                         struct inode                    *inode2;
3083                         struct ll_inode_info            *lli;
3084                         struct obd_client_handle        *och = NULL;
3085
3086                         lli = ll_i2info(inode);
3087                         mutex_lock(&lli->lli_och_mutex);
3088                         if (fd->fd_lease_och != NULL) {
3089                                 och = fd->fd_lease_och;
3090                                 fd->fd_lease_och = NULL;
3091                         }
3092                         mutex_unlock(&lli->lli_och_mutex);
3093                         if (och == NULL)
3094                                 GOTO(out, rc = -ENOLCK);
3095                         inode2 = file_inode(file2);
3096                         rc = ll_swap_layouts_close(och, inode, inode2, intent);
3097                 } else {
3098                         rc = ll_swap_layouts(file, file2, &lsl);
3099                 }
3100 out:
3101                 fput(file2);
3102                 RETURN(rc);
3103         }
3104         case LL_IOC_LOV_GETSTRIPE:
3105         case LL_IOC_LOV_GETSTRIPE_NEW:
3106                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3107         case FSFILT_IOC_GETFLAGS:
3108         case FSFILT_IOC_SETFLAGS:
3109                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3110         case FSFILT_IOC_GETVERSION_OLD:
3111         case FSFILT_IOC_GETVERSION:
3112                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3113         case LL_IOC_GROUP_LOCK:
3114                 RETURN(ll_get_grouplock(inode, file, arg));
3115         case LL_IOC_GROUP_UNLOCK:
3116                 RETURN(ll_put_grouplock(inode, file, arg));
3117         case IOC_OBD_STATFS:
3118                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3119
3120         /* We need to special case any other ioctls we want to handle,
3121          * to send them to the MDS/OST as appropriate and to properly
3122          * network encode the arg field.
3123         case FSFILT_IOC_SETVERSION_OLD:
3124         case FSFILT_IOC_SETVERSION:
3125         */
3126         case LL_IOC_FLUSHCTX:
3127                 RETURN(ll_flush_ctx(inode));
3128         case LL_IOC_PATH2FID: {
3129                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3130                                  sizeof(struct lu_fid)))
3131                         RETURN(-EFAULT);
3132
3133                 RETURN(0);
3134         }
3135         case LL_IOC_GETPARENT:
3136                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3137
3138         case OBD_IOC_FID2PATH:
3139                 RETURN(ll_fid2path(inode, (void __user *)arg));
3140         case LL_IOC_DATA_VERSION: {
3141                 struct ioc_data_version idv;
3142                 int rc;
3143
3144                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3145                         RETURN(-EFAULT);
3146
3147                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3148                 rc = ll_ioc_data_version(inode, &idv);
3149
3150                 if (rc == 0 &&
3151                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3152                         RETURN(-EFAULT);
3153
3154                 RETURN(rc);
3155         }
3156
3157         case LL_IOC_GET_MDTIDX: {
3158                 int mdtidx;
3159
3160                 mdtidx = ll_get_mdt_idx(inode);
3161                 if (mdtidx < 0)
3162                         RETURN(mdtidx);
3163
3164                 if (put_user((int)mdtidx, (int __user *)arg))
3165                         RETURN(-EFAULT);
3166
3167                 RETURN(0);
3168         }
3169         case OBD_IOC_GETDTNAME:
3170         case OBD_IOC_GETMDNAME:
3171                 RETURN(ll_get_obd_name(inode, cmd, arg));
3172         case LL_IOC_HSM_STATE_GET: {
3173                 struct md_op_data       *op_data;
3174                 struct hsm_user_state   *hus;
3175                 int                      rc;
3176
3177                 OBD_ALLOC_PTR(hus);
3178                 if (hus == NULL)
3179                         RETURN(-ENOMEM);
3180
3181                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3182                                              LUSTRE_OPC_ANY, hus);
3183                 if (IS_ERR(op_data)) {
3184                         OBD_FREE_PTR(hus);
3185                         RETURN(PTR_ERR(op_data));
3186                 }
3187
3188                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3189                                    op_data, NULL);
3190
3191                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3192                         rc = -EFAULT;
3193
3194                 ll_finish_md_op_data(op_data);
3195                 OBD_FREE_PTR(hus);
3196                 RETURN(rc);
3197         }
3198         case LL_IOC_HSM_STATE_SET: {
3199                 struct hsm_state_set    *hss;
3200                 int                      rc;
3201
3202                 OBD_ALLOC_PTR(hss);
3203                 if (hss == NULL)
3204                         RETURN(-ENOMEM);
3205
3206                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3207                         OBD_FREE_PTR(hss);
3208                         RETURN(-EFAULT);
3209                 }
3210
3211                 rc = ll_hsm_state_set(inode, hss);
3212
3213                 OBD_FREE_PTR(hss);
3214                 RETURN(rc);
3215         }
3216         case LL_IOC_HSM_ACTION: {
3217                 struct md_op_data               *op_data;
3218                 struct hsm_current_action       *hca;
3219                 int                              rc;
3220
3221                 OBD_ALLOC_PTR(hca);
3222                 if (hca == NULL)
3223                         RETURN(-ENOMEM);
3224
3225                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3226                                              LUSTRE_OPC_ANY, hca);
3227                 if (IS_ERR(op_data)) {
3228                         OBD_FREE_PTR(hca);
3229                         RETURN(PTR_ERR(op_data));
3230                 }
3231
3232                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3233                                    op_data, NULL);
3234
3235                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3236                         rc = -EFAULT;
3237
3238                 ll_finish_md_op_data(op_data);
3239                 OBD_FREE_PTR(hca);
3240                 RETURN(rc);
3241         }
3242         case LL_IOC_SET_LEASE_OLD: {
3243                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3244
3245                 RETURN(ll_file_set_lease(file, &ioc, 0));
3246         }
3247         case LL_IOC_SET_LEASE: {
3248                 struct ll_ioc_lease ioc;
3249
3250                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3251                         RETURN(-EFAULT);
3252
3253                 RETURN(ll_file_set_lease(file, &ioc, arg));
3254         }
3255         case LL_IOC_GET_LEASE: {
3256                 struct ll_inode_info *lli = ll_i2info(inode);
3257                 struct ldlm_lock *lock = NULL;
3258                 fmode_t fmode = 0;
3259
3260                 mutex_lock(&lli->lli_och_mutex);
3261                 if (fd->fd_lease_och != NULL) {
3262                         struct obd_client_handle *och = fd->fd_lease_och;
3263
3264                         lock = ldlm_handle2lock(&och->och_lease_handle);
3265                         if (lock != NULL) {
3266                                 lock_res_and_lock(lock);
3267                                 if (!ldlm_is_cancel(lock))
3268                                         fmode = och->och_flags;
3269
3270                                 unlock_res_and_lock(lock);
3271                                 LDLM_LOCK_PUT(lock);
3272                         }
3273                 }
3274                 mutex_unlock(&lli->lli_och_mutex);
3275
3276                 RETURN(ll_lease_type_from_fmode(fmode));
3277         }
3278         case LL_IOC_HSM_IMPORT: {
3279                 struct hsm_user_import *hui;
3280
3281                 OBD_ALLOC_PTR(hui);
3282                 if (hui == NULL)
3283                         RETURN(-ENOMEM);
3284
3285                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3286                         OBD_FREE_PTR(hui);
3287                         RETURN(-EFAULT);
3288                 }
3289
3290                 rc = ll_hsm_import(inode, file, hui);
3291
3292                 OBD_FREE_PTR(hui);
3293                 RETURN(rc);
3294         }
3295         case LL_IOC_FUTIMES_3: {
3296                 struct ll_futimes_3 lfu;
3297
3298                 if (copy_from_user(&lfu,
3299                                    (const struct ll_futimes_3 __user *)arg,
3300                                    sizeof(lfu)))
3301                         RETURN(-EFAULT);
3302
3303                 RETURN(ll_file_futimes_3(file, &lfu));
3304         }
3305         case LL_IOC_LADVISE: {
3306                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3307                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3308                 int i;
3309                 int num_advise;
3310                 int alloc_size = sizeof(*k_ladvise_hdr);
3311
3312                 rc = 0;
3313                 u_ladvise_hdr = (void __user *)arg;
3314                 OBD_ALLOC_PTR(k_ladvise_hdr);
3315                 if (k_ladvise_hdr == NULL)
3316                         RETURN(-ENOMEM);
3317
3318                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3319                         GOTO(out_ladvise, rc = -EFAULT);
3320
3321                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3322                     k_ladvise_hdr->lah_count < 1)
3323                         GOTO(out_ladvise, rc = -EINVAL);
3324
3325                 num_advise = k_ladvise_hdr->lah_count;
3326                 if (num_advise >= LAH_COUNT_MAX)
3327                         GOTO(out_ladvise, rc = -EFBIG);
3328
3329                 OBD_FREE_PTR(k_ladvise_hdr);
3330                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3331                                       lah_advise[num_advise]);
3332                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3333                 if (k_ladvise_hdr == NULL)
3334                         RETURN(-ENOMEM);
3335
3336                 /*
3337                  * TODO: submit multiple advices to one server in a single RPC
3338                  */
3339                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3340                         GOTO(out_ladvise, rc = -EFAULT);
3341
3342                 for (i = 0; i < num_advise; i++) {
3343                         struct llapi_lu_ladvise *k_ladvise =
3344                                         &k_ladvise_hdr->lah_advise[i];
3345                         struct llapi_lu_ladvise __user *u_ladvise =
3346                                         &u_ladvise_hdr->lah_advise[i];
3347
3348                         rc = ll_ladvise_sanity(inode, k_ladvise);
3349                         if (rc)
3350                                 GOTO(out_ladvise, rc);
3351
3352                         switch (k_ladvise->lla_advice) {
3353                         case LU_LADVISE_LOCKNOEXPAND:
3354                                 rc = ll_lock_noexpand(file,
3355                                                k_ladvise->lla_peradvice_flags);
3356                                 GOTO(out_ladvise, rc);
3357                         case LU_LADVISE_LOCKAHEAD:
3358
3359                                 rc = ll_file_lock_ahead(file, k_ladvise);
3360
3361                                 if (rc < 0)
3362                                         GOTO(out_ladvise, rc);
3363
3364                                 if (put_user(rc,
3365                                              &u_ladvise->lla_lockahead_result))
3366                                         GOTO(out_ladvise, rc = -EFAULT);
3367                                 break;
3368                         default:
3369                                 rc = ll_ladvise(inode, file,
3370                                                 k_ladvise_hdr->lah_flags,
3371                                                 k_ladvise);
3372                                 if (rc)
3373                                         GOTO(out_ladvise, rc);
3374                                 break;
3375                         }
3376
3377                 }
3378
3379 out_ladvise:
3380                 OBD_FREE(k_ladvise_hdr, alloc_size);
3381                 RETURN(rc);
3382         }
3383         case LL_IOC_FLR_SET_MIRROR: {
3384                 /* mirror I/O must be direct to avoid polluting page cache
3385                  * by stale data. */
3386                 if (!(file->f_flags & O_DIRECT))
3387                         RETURN(-EINVAL);
3388
3389                 fd->fd_designated_mirror = (__u32)arg;
3390                 RETURN(0);
3391         }
3392         case LL_IOC_FSGETXATTR:
3393                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3394         case LL_IOC_FSSETXATTR:
3395                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3396         case BLKSSZGET:
3397                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3398         default:
3399                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3400                                      (void __user *)arg));
3401         }
3402 }
3403
3404 #ifndef HAVE_FILE_LLSEEK_SIZE
3405 static inline loff_t
3406 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3407 {
3408         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3409                 return -EINVAL;
3410         if (offset > maxsize)
3411                 return -EINVAL;
3412
3413         if (offset != file->f_pos) {
3414                 file->f_pos = offset;
3415                 file->f_version = 0;
3416         }
3417         return offset;
3418 }
3419
3420 static loff_t
3421 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3422                 loff_t maxsize, loff_t eof)
3423 {
3424         struct inode *inode = file_inode(file);
3425
3426         switch (origin) {
3427         case SEEK_END:
3428                 offset += eof;
3429                 break;
3430         case SEEK_CUR:
3431                 /*
3432                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3433                  * position-querying operation.  Avoid rewriting the "same"
3434                  * f_pos value back to the file because a concurrent read(),
3435                  * write() or lseek() might have altered it
3436                  */
3437                 if (offset == 0)
3438                         return file->f_pos;
3439                 /*
3440                  * f_lock protects against read/modify/write race with other
3441                  * SEEK_CURs. Note that parallel writes and reads behave
3442                  * like SEEK_SET.
3443                  */
3444                 inode_lock(inode);
3445                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3446                 inode_unlock(inode);
3447                 return offset;
3448         case SEEK_DATA:
3449                 /*
3450                  * In the generic case the entire file is data, so as long as
3451                  * offset isn't at the end of the file then the offset is data.
3452                  */
3453                 if (offset >= eof)
3454                         return -ENXIO;
3455                 break;
3456         case SEEK_HOLE:
3457                 /*
3458                  * There is a virtual hole at the end of the file, so as long as
3459                  * offset isn't i_size or larger, return i_size.
3460                  */
3461                 if (offset >= eof)
3462                         return -ENXIO;
3463                 offset = eof;
3464                 break;
3465         }
3466
3467         return llseek_execute(file, offset, maxsize);
3468 }
3469 #endif
3470
3471 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3472 {
3473         struct inode *inode = file_inode(file);
3474         loff_t retval, eof = 0;
3475
3476         ENTRY;
3477         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3478                            (origin == SEEK_CUR) ? file->f_pos : 0);
3479         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3480                PFID(ll_inode2fid(inode)), inode, retval, retval,
3481                origin);
3482         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3483
3484         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3485                 retval = ll_glimpse_size(inode);
3486                 if (retval != 0)
3487                         RETURN(retval);
3488                 eof = i_size_read(inode);
3489         }
3490
3491         retval = ll_generic_file_llseek_size(file, offset, origin,
3492                                           ll_file_maxbytes(inode), eof);
3493         RETURN(retval);
3494 }
3495
3496 static int ll_flush(struct file *file, fl_owner_t id)
3497 {
3498         struct inode *inode = file_inode(file);
3499         struct ll_inode_info *lli = ll_i2info(inode);
3500         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3501         int rc, err;
3502
3503         LASSERT(!S_ISDIR(inode->i_mode));
3504
3505         /* catch async errors that were recorded back when async writeback
3506          * failed for pages in this mapping. */
3507         rc = lli->lli_async_rc;
3508         lli->lli_async_rc = 0;
3509         if (lli->lli_clob != NULL) {
3510                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3511                 if (rc == 0)
3512                         rc = err;
3513         }
3514
3515         /* The application has been told write failure already.
3516          * Do not report failure again. */
3517         if (fd->fd_write_failed)
3518                 return 0;
3519         return rc ? -EIO : 0;
3520 }
3521
3522 /**
3523  * Called to make sure a portion of file has been written out.
3524  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3525  *
3526  * Return how many pages have been written.
3527  */
3528 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3529                        enum cl_fsync_mode mode, int ignore_layout)
3530 {
3531         struct lu_env *env;
3532         struct cl_io *io;
3533         struct cl_fsync_io *fio;
3534         int result;
3535         __u16 refcheck;
3536         ENTRY;
3537
3538         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3539             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3540                 RETURN(-EINVAL);
3541
3542         env = cl_env_get(&refcheck);
3543         if (IS_ERR(env))
3544                 RETURN(PTR_ERR(env));
3545
3546         io = vvp_env_thread_io(env);
3547         io->ci_obj = ll_i2info(inode)->lli_clob;
3548         io->ci_ignore_layout = ignore_layout;
3549
3550         /* initialize parameters for sync */
3551         fio = &io->u.ci_fsync;
3552         fio->fi_start = start;
3553         fio->fi_end = end;
3554         fio->fi_fid = ll_inode2fid(inode);
3555         fio->fi_mode = mode;
3556         fio->fi_nr_written = 0;
3557
3558         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3559                 result = cl_io_loop(env, io);
3560         else
3561                 result = io->ci_result;
3562         if (result == 0)
3563                 result = fio->fi_nr_written;
3564         cl_io_fini(env, io);
3565         cl_env_put(env, &refcheck);
3566
3567         RETURN(result);
3568 }
3569
3570 /*
3571  * When dentry is provided (the 'else' case), file_dentry() may be
3572  * null and dentry must be used directly rather than pulled from
3573  * file_dentry() as is done otherwise.
3574  */
3575
3576 #ifdef HAVE_FILE_FSYNC_4ARGS
3577 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3578 {
3579         struct dentry *dentry = file_dentry(file);
3580         bool lock_inode;
3581 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3582 int ll_fsync(struct file *file, int datasync)
3583 {
3584         struct dentry *dentry = file_dentry(file);
3585         loff_t start = 0;
3586         loff_t end = LLONG_MAX;
3587 #else
3588 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3589 {
3590         loff_t start = 0;
3591         loff_t end = LLONG_MAX;
3592 #endif
3593         struct inode *inode = dentry->d_inode;
3594         struct ll_inode_info *lli = ll_i2info(inode);
3595         struct ptlrpc_request *req;
3596         int rc, err;
3597         ENTRY;
3598
3599         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3600                PFID(ll_inode2fid(inode)), inode);
3601         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3602
3603 #ifdef HAVE_FILE_FSYNC_4ARGS
3604         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3605         lock_inode = !lli->lli_inode_locked;
3606         if (lock_inode)
3607                 inode_lock(inode);
3608 #else
3609         /* fsync's caller has already called _fdata{sync,write}, we want
3610          * that IO to finish before calling the osc and mdc sync methods */
3611         rc = filemap_fdatawait(inode->i_mapping);
3612 #endif
3613
3614         /* catch async errors that were recorded back when async writeback
3615          * failed for pages in this mapping. */
3616         if (!S_ISDIR(inode->i_mode)) {
3617                 err = lli->lli_async_rc;
3618                 lli->lli_async_rc = 0;
3619                 if (rc == 0)
3620                         rc = err;
3621                 if (lli->lli_clob != NULL) {
3622                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3623                         if (rc == 0)
3624                                 rc = err;
3625                 }
3626         }
3627
3628         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3629         if (!rc)
3630                 rc = err;
3631         if (!err)
3632                 ptlrpc_req_finished(req);
3633
3634         if (S_ISREG(inode->i_mode)) {
3635                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3636
3637                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3638                 if (rc == 0 && err < 0)
3639                         rc = err;
3640                 if (rc < 0)
3641                         fd->fd_write_failed = true;
3642                 else
3643                         fd->fd_write_failed = false;
3644         }
3645
3646 #ifdef HAVE_FILE_FSYNC_4ARGS
3647         if (lock_inode)
3648                 inode_unlock(inode);
3649 #endif
3650         RETURN(rc);
3651 }
3652
3653 static int
3654 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3655 {
3656         struct inode *inode = file_inode(file);
3657         struct ll_sb_info *sbi = ll_i2sbi(inode);
3658         struct ldlm_enqueue_info einfo = {
3659                 .ei_type        = LDLM_FLOCK,
3660                 .ei_cb_cp       = ldlm_flock_completion_ast,
3661                 .ei_cbdata      = file_lock,
3662         };
3663         struct md_op_data *op_data;
3664         struct lustre_handle lockh = { 0 };
3665         union ldlm_policy_data flock = { { 0 } };
3666         int fl_type = file_lock->fl_type;
3667         __u64 flags = 0;
3668         int rc;
3669         int rc2 = 0;
3670         ENTRY;
3671
3672         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3673                PFID(ll_inode2fid(inode)), file_lock);
3674
3675         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3676
3677         if (file_lock->fl_flags & FL_FLOCK) {
3678                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3679                 /* flocks are whole-file locks */
3680                 flock.l_flock.end = OFFSET_MAX;
3681                 /* For flocks owner is determined by the local file desctiptor*/
3682                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3683         } else if (file_lock->fl_flags & FL_POSIX) {
3684                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3685                 flock.l_flock.start = file_lock->fl_start;
3686                 flock.l_flock.end = file_lock->fl_end;
3687         } else {
3688                 RETURN(-EINVAL);
3689         }
3690         flock.l_flock.pid = file_lock->fl_pid;
3691
3692         /* Somewhat ugly workaround for svc lockd.
3693          * lockd installs custom fl_lmops->lm_compare_owner that checks
3694          * for the fl_owner to be the same (which it always is on local node
3695          * I guess between lockd processes) and then compares pid.
3696          * As such we assign pid to the owner field to make it all work,
3697          * conflict with normal locks is unlikely since pid space and
3698          * pointer space for current->files are not intersecting */
3699         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3700                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3701
3702         switch (fl_type) {
3703         case F_RDLCK:
3704                 einfo.ei_mode = LCK_PR;
3705                 break;
3706         case F_UNLCK:
3707                 /* An unlock request may or may not have any relation to
3708                  * existing locks so we may not be able to pass a lock handle
3709                  * via a normal ldlm_lock_cancel() request. The request may even
3710                  * unlock a byte range in the middle of an existing lock. In
3711                  * order to process an unlock request we need all of the same
3712                  * information that is given with a normal read or write record
3713                  * lock request. To avoid creating another ldlm unlock (cancel)
3714                  * message we'll treat a LCK_NL flock request as an unlock. */
3715                 einfo.ei_mode = LCK_NL;
3716                 break;
3717         case F_WRLCK:
3718                 einfo.ei_mode = LCK_PW;
3719                 break;
3720         default:
3721                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3722                 RETURN (-ENOTSUPP);
3723         }
3724
3725         switch (cmd) {
3726         case F_SETLKW:
3727 #ifdef F_SETLKW64
3728         case F_SETLKW64:
3729 #endif
3730                 flags = 0;
3731                 break;
3732         case F_SETLK:
3733 #ifdef F_SETLK64
3734         case F_SETLK64:
3735 #endif
3736                 flags = LDLM_FL_BLOCK_NOWAIT;
3737                 break;
3738         case F_GETLK:
3739 #ifdef F_GETLK64
3740         case F_GETLK64:
3741 #endif
3742                 flags = LDLM_FL_TEST_LOCK;
3743                 break;
3744         default:
3745                 CERROR("unknown fcntl lock command: %d\n", cmd);
3746                 RETURN (-EINVAL);
3747         }
3748
3749         /* Save the old mode so that if the mode in the lock changes we
3750          * can decrement the appropriate reader or writer refcount. */
3751         file_lock->fl_type = einfo.ei_mode;
3752
3753         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3754                                      LUSTRE_OPC_ANY, NULL);
3755         if (IS_ERR(op_data))
3756                 RETURN(PTR_ERR(op_data));
3757
3758         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3759                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3760                flock.l_flock.pid, flags, einfo.ei_mode,
3761                flock.l_flock.start, flock.l_flock.end);
3762
3763         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3764                         flags);
3765
3766         /* Restore the file lock type if not TEST lock. */
3767         if (!(flags & LDLM_FL_TEST_LOCK))
3768                 file_lock->fl_type = fl_type;
3769
3770 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3771         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3772             !(flags & LDLM_FL_TEST_LOCK))
3773                 rc2  = locks_lock_file_wait(file, file_lock);
3774 #else
3775         if ((file_lock->fl_flags & FL_FLOCK) &&
3776             (rc == 0 || file_lock->fl_type == F_UNLCK))
3777                 rc2  = flock_lock_file_wait(file, file_lock);
3778         if ((file_lock->fl_flags & FL_POSIX) &&
3779             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3780             !(flags & LDLM_FL_TEST_LOCK))
3781                 rc2  = posix_lock_file_wait(file, file_lock);
3782 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3783
3784         if (rc2 && file_lock->fl_type != F_UNLCK) {
3785                 einfo.ei_mode = LCK_NL;
3786                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3787                            &lockh, flags);
3788                 rc = rc2;
3789         }
3790
3791         ll_finish_md_op_data(op_data);
3792
3793         RETURN(rc);
3794 }
3795
3796 int ll_get_fid_by_name(struct inode *parent, const char *name,
3797                        int namelen, struct lu_fid *fid,
3798                        struct inode **inode)
3799 {
3800         struct md_op_data       *op_data = NULL;
3801         struct mdt_body         *body;
3802         struct ptlrpc_request   *req;
3803         int                     rc;
3804         ENTRY;
3805
3806         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3807                                      LUSTRE_OPC_ANY, NULL);
3808         if (IS_ERR(op_data))
3809                 RETURN(PTR_ERR(op_data));
3810
3811         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3812         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3813         ll_finish_md_op_data(op_data);
3814         if (rc < 0)
3815                 RETURN(rc);
3816
3817         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3818         if (body == NULL)
3819                 GOTO(out_req, rc = -EFAULT);
3820         if (fid != NULL)
3821                 *fid = body->mbo_fid1;
3822
3823         if (inode != NULL)
3824                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3825 out_req:
3826         ptlrpc_req_finished(req);
3827         RETURN(rc);
3828 }
3829
3830 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3831                const char *name, int namelen)
3832 {
3833         struct dentry         *dchild = NULL;
3834         struct inode          *child_inode = NULL;
3835         struct md_op_data     *op_data;
3836         struct ptlrpc_request *request = NULL;
3837         struct obd_client_handle *och = NULL;
3838         struct qstr           qstr;
3839         struct mdt_body         *body;
3840         int                    rc;
3841         __u64                   data_version = 0;
3842         ENTRY;
3843
3844         CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3845                name, PFID(ll_inode2fid(parent)), mdtidx);
3846
3847         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3848                                      0, LUSTRE_OPC_ANY, NULL);
3849         if (IS_ERR(op_data))
3850                 RETURN(PTR_ERR(op_data));
3851
3852         /* Get child FID first */
3853         qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3854         qstr.name = name;
3855         qstr.len = namelen;
3856         dchild = d_lookup(file_dentry(file), &qstr);
3857         if (dchild != NULL) {
3858                 if (dchild->d_inode != NULL)
3859                         child_inode = igrab(dchild->d_inode);
3860                 dput(dchild);
3861         }
3862
3863         if (child_inode == NULL) {
3864                 rc = ll_get_fid_by_name(parent, name, namelen,
3865                                         &op_data->op_fid3, &child_inode);
3866                 if (rc != 0)
3867                         GOTO(out_free, rc);
3868         }
3869
3870         if (child_inode == NULL)
3871                 GOTO(out_free, rc = -EINVAL);
3872
3873         /*
3874          * lfs migrate command needs to be blocked on the client
3875          * by checking the migrate FID against the FID of the
3876          * filesystem root.
3877          */
3878         if (child_inode == parent->i_sb->s_root->d_inode)
3879                 GOTO(out_iput, rc = -EINVAL);
3880
3881         inode_lock(child_inode);
3882         op_data->op_fid3 = *ll_inode2fid(child_inode);
3883         if (!fid_is_sane(&op_data->op_fid3)) {
3884                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3885                        ll_get_fsname(parent->i_sb, NULL, 0), name,
3886                        PFID(&op_data->op_fid3));
3887                 GOTO(out_unlock, rc = -EINVAL);
3888         }
3889
3890         rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3891         if (rc < 0)
3892                 GOTO(out_unlock, rc);
3893
3894         if (rc == mdtidx) {
3895                 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3896                        PFID(&op_data->op_fid3), mdtidx);
3897                 GOTO(out_unlock, rc = 0);
3898         }
3899 again:
3900         if (S_ISREG(child_inode->i_mode)) {
3901                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3902                 if (IS_ERR(och)) {
3903                         rc = PTR_ERR(och);
3904                         och = NULL;
3905                         GOTO(out_unlock, rc);
3906                 }
3907
3908                 rc = ll_data_version(child_inode, &data_version,
3909                                      LL_DV_WR_FLUSH);
3910                 if (rc != 0)
3911                         GOTO(out_close, rc);
3912
3913                 op_data->op_handle = och->och_fh;
3914                 op_data->op_data = och->och_mod;
3915                 op_data->op_data_version = data_version;
3916                 op_data->op_lease_handle = och->och_lease_handle;
3917                 op_data->op_bias |= MDS_RENAME_MIGRATE;
3918         }
3919
3920         op_data->op_mds = mdtidx;
3921         op_data->op_cli_flags = CLI_MIGRATE;
3922         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3923                        namelen, name, namelen, &request);
3924         if (rc == 0) {
3925                 LASSERT(request != NULL);
3926                 ll_update_times(request, parent);
3927
3928                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3929                 LASSERT(body != NULL);
3930
3931                 /* If the server does release layout lock, then we cleanup
3932                  * the client och here, otherwise release it in out_close: */
3933                 if (och != NULL &&
3934                     body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3935                         obd_mod_put(och->och_mod);
3936                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3937                                                   och);
3938                         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3939                         OBD_FREE_PTR(och);
3940                         och = NULL;
3941                 }
3942         }
3943
3944         if (request != NULL) {
3945                 ptlrpc_req_finished(request);
3946                 request = NULL;
3947         }
3948
3949         /* Try again if the file layout has changed. */
3950         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3951                 goto again;
3952
3953 out_close:
3954         if (och != NULL) /* close the file */
3955                 ll_lease_close(och, child_inode, NULL);
3956         if (rc == 0)
3957                 clear_nlink(child_inode);
3958 out_unlock:
3959         inode_unlock(child_inode);
3960 out_iput:
3961         iput(child_inode);
3962 out_free:
3963         ll_finish_md_op_data(op_data);
3964         RETURN(rc);
3965 }
3966
3967 static int
3968 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3969 {
3970         ENTRY;
3971
3972         RETURN(-ENOSYS);
3973 }
3974
3975 /**
3976  * test if some locks matching bits and l_req_mode are acquired
3977  * - bits can be in different locks
3978  * - if found clear the common lock bits in *bits
3979  * - the bits not found, are kept in *bits
3980  * \param inode [IN]
3981  * \param bits [IN] searched lock bits [IN]
3982  * \param l_req_mode [IN] searched lock mode
3983  * \retval boolean, true iff all bits are found
3984  */
3985 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3986 {
3987         struct lustre_handle lockh;
3988         union ldlm_policy_data policy;
3989         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3990                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3991         struct lu_fid *fid;
3992         __u64 flags;
3993         int i;
3994         ENTRY;
3995
3996         if (!inode)
3997                RETURN(0);
3998
3999         fid = &ll_i2info(inode)->lli_fid;
4000         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4001                ldlm_lockname[mode]);
4002
4003         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4004         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4005                 policy.l_inodebits.bits = *bits & (1 << i);
4006                 if (policy.l_inodebits.bits == 0)
4007                         continue;
4008
4009                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4010                                   &policy, mode, &lockh)) {
4011                         struct ldlm_lock *lock;
4012
4013                         lock = ldlm_handle2lock(&lockh);
4014                         if (lock) {
4015                                 *bits &=
4016                                       ~(lock->l_policy_data.l_inodebits.bits);
4017                                 LDLM_LOCK_PUT(lock);
4018                         } else {
4019                                 *bits &= ~policy.l_inodebits.bits;
4020                         }
4021                 }
4022         }
4023         RETURN(*bits == 0);
4024 }
4025
4026 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4027                                struct lustre_handle *lockh, __u64 flags,
4028                                enum ldlm_mode mode)
4029 {
4030         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4031         struct lu_fid *fid;
4032         enum ldlm_mode rc;
4033         ENTRY;
4034
4035         fid = &ll_i2info(inode)->lli_fid;
4036         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4037
4038         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4039                            fid, LDLM_IBITS, &policy, mode, lockh);
4040
4041         RETURN(rc);
4042 }
4043
4044 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4045 {
4046         /* Already unlinked. Just update nlink and return success */
4047         if (rc == -ENOENT) {
4048                 clear_nlink(inode);
4049                 /* If it is striped directory, and there is bad stripe
4050                  * Let's revalidate the dentry again, instead of returning
4051                  * error */
4052                 if (S_ISDIR(inode->i_mode) &&
4053                     ll_i2info(inode)->lli_lsm_md != NULL)
4054                         return 0;
4055
4056                 /* This path cannot be hit for regular files unless in
4057                  * case of obscure races, so no need to to validate
4058                  * size. */
4059                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4060                         return 0;
4061         } else if (rc != 0) {
4062                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4063                              "%s: revalidate FID "DFID" error: rc = %d\n",
4064                              ll_get_fsname(inode->i_sb, NULL, 0),
4065                              PFID(ll_inode2fid(inode)), rc);
4066         }
4067
4068         return rc;
4069 }
4070
4071 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
4072 {
4073         struct inode *inode = dentry->d_inode;
4074         struct ptlrpc_request *req = NULL;
4075         struct obd_export *exp;
4076         int rc = 0;
4077         ENTRY;
4078
4079         LASSERT(inode != NULL);
4080
4081         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4082                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4083
4084         exp = ll_i2mdexp(inode);
4085
4086         /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
4087          *      But under CMD case, it caused some lock issues, should be fixed
4088          *      with new CMD ibits lock. See bug 12718 */
4089         if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
4090                 struct lookup_intent oit = { .it_op = IT_GETATTR };
4091                 struct md_op_data *op_data;
4092
4093                 if (ibits == MDS_INODELOCK_LOOKUP)
4094                         oit.it_op = IT_LOOKUP;
4095
4096                 /* Call getattr by fid, so do not provide name at all. */
4097                 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
4098                                              dentry->d_inode, NULL, 0, 0,
4099                                              LUSTRE_OPC_ANY, NULL);
4100                 if (IS_ERR(op_data))
4101                         RETURN(PTR_ERR(op_data));
4102
4103                 rc = md_intent_lock(exp, op_data, &oit, &req,
4104                                     &ll_md_blocking_ast, 0);
4105                 ll_finish_md_op_data(op_data);
4106                 if (rc < 0) {
4107                         rc = ll_inode_revalidate_fini(inode, rc);
4108                         GOTO (out, rc);
4109                 }
4110
4111                 rc = ll_revalidate_it_finish(req, &oit, dentry);
4112                 if (rc != 0) {
4113                         ll_intent_release(&oit);
4114                         GOTO(out, rc);
4115                 }
4116
4117                 /* Unlinked? Unhash dentry, so it is not picked up later by
4118                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4119                    here to preserve get_cwd functionality on 2.6.
4120                    Bug 10503 */
4121                 if (!dentry->d_inode->i_nlink) {
4122                         ll_lock_dcache(inode);
4123                         d_lustre_invalidate(dentry, 0);
4124                         ll_unlock_dcache(inode);
4125                 }
4126
4127                 ll_lookup_finish_locks(&oit, dentry);
4128         } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
4129                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
4130                 u64 valid = OBD_MD_FLGETATTR;
4131                 struct md_op_data *op_data;
4132                 int ealen = 0;
4133
4134                 if (S_ISREG(inode->i_mode)) {
4135                         rc = ll_get_default_mdsize(sbi, &ealen);
4136                         if (rc)
4137                                 RETURN(rc);
4138                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
4139                 }
4140
4141                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
4142                                              0, ealen, LUSTRE_OPC_ANY,
4143                                              NULL);
4144                 if (IS_ERR(op_data))
4145                         RETURN(PTR_ERR(op_data));
4146
4147                 op_data->op_valid = valid;
4148                 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
4149                 ll_finish_md_op_data(op_data);
4150                 if (rc) {
4151                         rc = ll_inode_revalidate_fini(inode, rc);
4152                         RETURN(rc);
4153                 }
4154
4155                 rc = ll_prep_inode(&inode, req, NULL, NULL);
4156         }
4157 out:
4158         ptlrpc_req_finished(req);
4159         return rc;
4160 }
4161
4162 static int ll_merge_md_attr(struct inode *inode)
4163 {
4164         struct cl_attr attr = { 0 };
4165         int rc;
4166
4167         LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4168         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4169                            &attr, ll_md_blocking_ast);
4170         if (rc != 0)
4171                 RETURN(rc);
4172
4173         set_nlink(inode, attr.cat_nlink);
4174         inode->i_blocks = attr.cat_blocks;
4175         i_size_write(inode, attr.cat_size);
4176
4177         ll_i2info(inode)->lli_atime = attr.cat_atime;
4178         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4179         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4180
4181         RETURN(0);
4182 }
4183
4184 static int
4185 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
4186 {
4187         struct inode    *inode = dentry->d_inode;
4188         int              rc;
4189         ENTRY;
4190
4191         rc = __ll_inode_revalidate(dentry, ibits);
4192         if (rc != 0)
4193                 RETURN(rc);
4194
4195         /* if object isn't regular file, don't validate size */
4196         if (!S_ISREG(inode->i_mode)) {
4197                 if (S_ISDIR(inode->i_mode) &&
4198                     ll_i2info(inode)->lli_lsm_md != NULL) {
4199                         rc = ll_merge_md_attr(inode);
4200                         if (rc != 0)
4201                                 RETURN(rc);
4202                 }
4203
4204                 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
4205                 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
4206                 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
4207         } else {
4208                 /* In case of restore, the MDT has the right size and has
4209                  * already send it back without granting the layout lock,
4210                  * inode is up-to-date so glimpse is useless.
4211                  * Also to glimpse we need the layout, in case of a running
4212                  * restore the MDT holds the layout lock so the glimpse will
4213                  * block up to the end of restore (getattr will block)
4214                  */
4215                 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
4216                         rc = ll_glimpse_size(inode);
4217         }
4218         RETURN(rc);
4219 }
4220
4221 static inline dev_t ll_compat_encode_dev(dev_t dev)
4222 {
4223         /* The compat_sys_*stat*() syscalls will fail unless the
4224          * device majors and minors are both less than 256. Note that
4225          * the value returned here will be passed through
4226          * old_encode_dev() in cp_compat_stat(). And so we are not
4227          * trying to return a valid compat (u16) device number, just
4228          * one that will pass the old_valid_dev() check. */
4229
4230         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4231 }
4232
4233 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4234 int ll_getattr(const struct path *path, struct kstat *stat,
4235                u32 request_mask, unsigned int flags)
4236
4237 {
4238         struct dentry *de = path->dentry;
4239 #else
4240 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4241 {
4242 #endif
4243         struct inode *inode = de->d_inode;
4244         struct ll_sb_info *sbi = ll_i2sbi(inode);
4245         struct ll_inode_info *lli = ll_i2info(inode);
4246         int res = 0;
4247
4248         res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
4249                                       MDS_INODELOCK_LOOKUP);
4250         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4251
4252         if (res)
4253                 return res;
4254
4255         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4256
4257         if (ll_need_32bit_api(sbi)) {
4258                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4259                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4260                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4261         } else {
4262                 stat->ino = inode->i_ino;
4263                 stat->dev = inode->i_sb->s_dev;
4264                 stat->rdev = inode->i_rdev;
4265         }
4266
4267         stat->mode = inode->i_mode;
4268         stat->uid = inode->i_uid;
4269         stat->gid = inode->i_gid;
4270         stat->atime = inode->i_atime;
4271         stat->mtime = inode->i_mtime;
4272         stat->ctime = inode->i_ctime;
4273         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4274
4275         stat->nlink = inode->i_nlink;
4276         stat->size = i_size_read(inode);
4277         stat->blocks = inode->i_blocks;
4278
4279         return 0;
4280 }
4281
4282 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4283                      __u64 start, __u64 len)
4284 {
4285         int             rc;
4286         size_t          num_bytes;
4287         struct fiemap   *fiemap;
4288         unsigned int    extent_count = fieinfo->fi_extents_max;
4289
4290         num_bytes = sizeof(*fiemap) + (extent_count *
4291                                        sizeof(struct fiemap_extent));
4292         OBD_ALLOC_LARGE(fiemap, num_bytes);
4293
4294         if (fiemap == NULL)
4295                 RETURN(-ENOMEM);
4296
4297         fiemap->fm_flags = fieinfo->fi_flags;
4298         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4299         fiemap->fm_start = start;
4300         fiemap->fm_length = len;
4301         if (extent_count > 0 &&
4302             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4303                            sizeof(struct fiemap_extent)) != 0)
4304                 GOTO(out, rc = -EFAULT);
4305
4306         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4307
4308         fieinfo->fi_flags = fiemap->fm_flags;
4309         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4310         if (extent_count > 0 &&
4311             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4312                          fiemap->fm_mapped_extents *
4313                          sizeof(struct fiemap_extent)) != 0)
4314                 GOTO(out, rc = -EFAULT);
4315 out:
4316         OBD_FREE_LARGE(fiemap, num_bytes);
4317         return rc;
4318 }
4319
4320 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4321 {
4322         struct ll_inode_info *lli = ll_i2info(inode);
4323         struct posix_acl *acl = NULL;
4324         ENTRY;
4325
4326         spin_lock(&lli->lli_lock);
4327         /* VFS' acl_permission_check->check_acl will release the refcount */
4328         acl = posix_acl_dup(lli->lli_posix_acl);
4329         spin_unlock(&lli->lli_lock);
4330
4331         RETURN(acl);
4332 }
4333
4334 #ifdef HAVE_IOP_SET_ACL
4335 #ifdef CONFIG_FS_POSIX_ACL
4336 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4337 {
4338         const char *name = NULL;
4339         char *value = NULL;
4340         size_t size = 0;
4341         int rc = 0;
4342         ENTRY;
4343
4344         switch (type) {
4345         case ACL_TYPE_ACCESS:
4346                 if (acl) {
4347                         rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4348                         if (rc)
4349                                 GOTO(out, rc);
4350                 }
4351                 name = XATTR_NAME_POSIX_ACL_ACCESS;
4352                 break;
4353         case ACL_TYPE_DEFAULT:
4354                 if (!S_ISDIR(inode->i_mode))
4355                         GOTO(out, rc = acl ? -EACCES : 0);
4356                 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4357                 break;
4358         default:
4359                 GOTO(out, rc = -EINVAL);
4360         }
4361
4362         if (acl) {
4363                 size = posix_acl_xattr_size(acl->a_count);
4364                 value = kmalloc(size, GFP_NOFS);
4365                 if (value == NULL)
4366                         GOTO(out, rc = -ENOMEM);
4367
4368                 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4369                 if (rc < 0)
4370                         GOTO(out_free, rc);
4371         }
4372
4373         /* dentry is only used for *.lov attributes so it's safe to be NULL */
4374         rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
4375 out_free:
4376         kfree(value);
4377 out:
4378         if (!rc)
4379                 set_cached_acl(inode, type, acl);
4380         else
4381                 forget_cached_acl(inode, type);
4382         RETURN(rc);
4383 }
4384 #endif /* CONFIG_FS_POSIX_ACL */
4385 #endif /* HAVE_IOP_SET_ACL */
4386
4387 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4388 static int
4389 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4390 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4391 # else
4392 ll_check_acl(struct inode *inode, int mask)
4393 # endif
4394 {
4395 # ifdef CONFIG_FS_POSIX_ACL
4396         struct posix_acl *acl;
4397         int rc;
4398         ENTRY;
4399
4400 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4401         if (flags & IPERM_FLAG_RCU)
4402                 return -ECHILD;
4403 #  endif
4404         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4405
4406         if (!acl)
4407                 RETURN(-EAGAIN);
4408
4409         rc = posix_acl_permission(inode, acl, mask);
4410         posix_acl_release(acl);
4411
4412         RETURN(rc);
4413 # else /* !CONFIG_FS_POSIX_ACL */
4414         return -EAGAIN;
4415 # endif /* CONFIG_FS_POSIX_ACL */
4416 }
4417 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4418
4419 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4420 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4421 #else
4422 # ifdef HAVE_INODE_PERMISION_2ARGS
4423 int ll_inode_permission(struct inode *inode, int mask)
4424 # else
4425 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4426 # endif
4427 #endif
4428 {
4429         int rc = 0;
4430         struct ll_sb_info *sbi;
4431         struct root_squash_info *squash;
4432         struct cred *cred = NULL;
4433         const struct cred *old_cred = NULL;
4434         cfs_cap_t cap;
4435         bool squash_id = false;
4436         ENTRY;
4437
4438 #ifdef MAY_NOT_BLOCK
4439         if (mask & MAY_NOT_BLOCK)
4440                 return -ECHILD;
4441 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4442         if (flags & IPERM_FLAG_RCU)
4443                 return -ECHILD;
4444 #endif
4445
4446        /* as root inode are NOT getting validated in lookup operation,
4447         * need to do it before permission check. */
4448
4449         if (inode == inode->i_sb->s_root->d_inode) {
4450                 rc = __ll_inode_revalidate(inode->i_sb->s_root,
4451                                            MDS_INODELOCK_LOOKUP);
4452                 if (rc)
4453                         RETURN(rc);
4454         }
4455
4456         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4457                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4458
4459         /* squash fsuid/fsgid if needed */
4460         sbi = ll_i2sbi(inode);
4461         squash = &sbi->ll_squash;
4462         if (unlikely(squash->rsi_uid != 0 &&
4463                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4464                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4465                         squash_id = true;
4466         }
4467         if (squash_id) {
4468                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4469                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4470                        squash->rsi_uid, squash->rsi_gid);
4471
4472                 /* update current process's credentials
4473                  * and FS capability */
4474                 cred = prepare_creds();
4475                 if (cred == NULL)
4476                         RETURN(-ENOMEM);
4477
4478                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4479                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4480                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4481                         if ((1 << cap) & CFS_CAP_FS_MASK)
4482                                 cap_lower(cred->cap_effective, cap);
4483                 }
4484                 old_cred = override_creds(cred);
4485         }
4486
4487         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4488         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4489         /* restore current process's credentials and FS capability */
4490         if (squash_id) {
4491                 revert_creds(old_cred);
4492                 put_cred(cred);
4493         }
4494
4495         RETURN(rc);
4496 }
4497
4498 /* -o localflock - only provides locally consistent flock locks */
4499 struct file_operations ll_file_operations = {
4500 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4501 # ifdef HAVE_SYNC_READ_WRITE
4502         .read           = new_sync_read,
4503         .write          = new_sync_write,
4504 # endif
4505         .read_iter      = ll_file_read_iter,
4506         .write_iter     = ll_file_write_iter,
4507 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4508         .read           = ll_file_read,
4509         .aio_read       = ll_file_aio_read,
4510         .write          = ll_file_write,
4511         .aio_write      = ll_file_aio_write,
4512 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4513         .unlocked_ioctl = ll_file_ioctl,
4514         .open           = ll_file_open,
4515         .release        = ll_file_release,
4516         .mmap           = ll_file_mmap,
4517         .llseek         = ll_file_seek,
4518         .splice_read    = ll_file_splice_read,
4519         .fsync          = ll_fsync,
4520         .flush          = ll_flush
4521 };
4522
4523 struct file_operations ll_file_operations_flock = {
4524 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4525 # ifdef HAVE_SYNC_READ_WRITE
4526         .read           = new_sync_read,
4527         .write          = new_sync_write,
4528 # endif /* HAVE_SYNC_READ_WRITE */
4529         .read_iter      = ll_file_read_iter,
4530         .write_iter     = ll_file_write_iter,
4531 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4532         .read           = ll_file_read,
4533         .aio_read       = ll_file_aio_read,
4534         .write          = ll_file_write,
4535         .aio_write      = ll_file_aio_write,
4536 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4537         .unlocked_ioctl = ll_file_ioctl,
4538         .open           = ll_file_open,
4539         .release        = ll_file_release,
4540         .mmap           = ll_file_mmap,
4541         .llseek         = ll_file_seek,
4542         .splice_read    = ll_file_splice_read,
4543         .fsync          = ll_fsync,
4544         .flush          = ll_flush,
4545         .flock          = ll_file_flock,
4546         .lock           = ll_file_flock
4547 };
4548
4549 /* These are for -o noflock - to return ENOSYS on flock calls */
4550 struct file_operations ll_file_operations_noflock = {
4551 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4552 # ifdef HAVE_SYNC_READ_WRITE
4553         .read           = new_sync_read,
4554         .write          = new_sync_write,
4555 # endif /* HAVE_SYNC_READ_WRITE */
4556         .read_iter      = ll_file_read_iter,
4557         .write_iter     = ll_file_write_iter,
4558 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4559         .read           = ll_file_read,
4560         .aio_read       = ll_file_aio_read,
4561         .write          = ll_file_write,
4562         .aio_write      = ll_file_aio_write,
4563 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4564         .unlocked_ioctl = ll_file_ioctl,
4565         .open           = ll_file_open,
4566         .release        = ll_file_release,
4567         .mmap           = ll_file_mmap,
4568         .llseek         = ll_file_seek,
4569         .splice_read    = ll_file_splice_read,
4570         .fsync          = ll_fsync,
4571         .flush          = ll_flush,
4572         .flock          = ll_file_noflock,
4573         .lock           = ll_file_noflock
4574 };
4575
4576 struct inode_operations ll_file_inode_operations = {
4577         .setattr        = ll_setattr,
4578         .getattr        = ll_getattr,
4579         .permission     = ll_inode_permission,
4580 #ifdef HAVE_IOP_XATTR
4581         .setxattr       = ll_setxattr,
4582         .getxattr       = ll_getxattr,
4583         .removexattr    = ll_removexattr,
4584 #endif
4585         .listxattr      = ll_listxattr,
4586         .fiemap         = ll_fiemap,
4587 #ifdef HAVE_IOP_GET_ACL
4588         .get_acl        = ll_get_acl,
4589 #endif
4590 #ifdef HAVE_IOP_SET_ACL
4591         .set_acl        = ll_set_acl,
4592 #endif
4593 };
4594
4595 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4596 {
4597         struct ll_inode_info *lli = ll_i2info(inode);
4598         struct cl_object *obj = lli->lli_clob;
4599         struct lu_env *env;
4600         int rc;
4601         __u16 refcheck;
4602         ENTRY;
4603
4604         if (obj == NULL)
4605                 RETURN(0);
4606
4607         env = cl_env_get(&refcheck);
4608         if (IS_ERR(env))
4609                 RETURN(PTR_ERR(env));
4610
4611         rc = cl_conf_set(env, lli->lli_clob, conf);
4612         if (rc < 0)
4613                 GOTO(out, rc);
4614
4615         if (conf->coc_opc == OBJECT_CONF_SET) {
4616                 struct ldlm_lock *lock = conf->coc_lock;
4617                 struct cl_layout cl = {
4618                         .cl_layout_gen = 0,
4619                 };
4620
4621                 LASSERT(lock != NULL);
4622                 LASSERT(ldlm_has_layout(lock));
4623
4624                 /* it can only be allowed to match after layout is
4625                  * applied to inode otherwise false layout would be
4626                  * seen. Applying layout shoud happen before dropping
4627                  * the intent lock. */
4628                 ldlm_lock_allow_match(lock);
4629
4630                 rc = cl_object_layout_get(env, obj, &cl);
4631                 if (rc < 0)
4632                         GOTO(out, rc);
4633
4634                 CDEBUG(D_VFSTRACE,
4635                        DFID": layout version change: %u -> %u\n",
4636                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4637                        cl.cl_layout_gen);
4638                 ll_layout_version_set(lli, cl.cl_layout_gen);
4639         }
4640
4641 out:
4642         cl_env_put(env, &refcheck);
4643
4644         RETURN(rc);
4645 }
4646
4647 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4648 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4649
4650 {
4651         struct ll_sb_info *sbi = ll_i2sbi(inode);
4652         struct ptlrpc_request *req;
4653         struct mdt_body *body;
4654         void *lvbdata;
4655         void *lmm;
4656         int lmmsize;
4657         int rc;
4658         ENTRY;
4659
4660         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4661                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4662                lock->l_lvb_data, lock->l_lvb_len);
4663
4664         if (lock->l_lvb_data != NULL)
4665                 RETURN(0);
4666
4667         /* if layout lock was granted right away, the layout is returned
4668          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4669          * blocked and then granted via completion ast, we have to fetch
4670          * layout here. Please note that we can't use the LVB buffer in
4671          * completion AST because it doesn't have a large enough buffer */
4672         rc = ll_get_default_mdsize(sbi, &lmmsize);
4673         if (rc == 0)
4674                 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4675                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4676                                 lmmsize, 0, &req);
4677         if (rc < 0)
4678                 RETURN(rc);
4679
4680         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4681         if (body == NULL)
4682                 GOTO(out, rc = -EPROTO);
4683
4684         lmmsize = body->mbo_eadatasize;
4685         if (lmmsize == 0) /* empty layout */
4686                 GOTO(out, rc = 0);
4687
4688         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4689         if (lmm == NULL)
4690                 GOTO(out, rc = -EFAULT);
4691
4692         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4693         if (lvbdata == NULL)
4694                 GOTO(out, rc = -ENOMEM);
4695
4696         memcpy(lvbdata, lmm, lmmsize);
4697         lock_res_and_lock(lock);
4698         if (unlikely(lock->l_lvb_data == NULL)) {
4699                 lock->l_lvb_type = LVB_T_LAYOUT;
4700                 lock->l_lvb_data = lvbdata;
4701                 lock->l_lvb_len = lmmsize;
4702                 lvbdata = NULL;
4703         }
4704         unlock_res_and_lock(lock);
4705
4706         if (lvbdata)
4707                 OBD_FREE_LARGE(lvbdata, lmmsize);
4708
4709         EXIT;
4710
4711 out:
4712         ptlrpc_req_finished(req);
4713         return rc;
4714 }
4715
4716 /**
4717  * Apply the layout to the inode. Layout lock is held and will be released
4718  * in this function.
4719  */
4720 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4721                               struct inode *inode)
4722 {
4723         struct ll_inode_info *lli = ll_i2info(inode);
4724         struct ll_sb_info    *sbi = ll_i2sbi(inode);
4725         struct ldlm_lock *lock;
4726         struct cl_object_conf conf;
4727         int rc = 0;
4728         bool lvb_ready;
4729         bool wait_layout = false;
4730         ENTRY;
4731
4732         LASSERT(lustre_handle_is_used(lockh));
4733
4734         lock = ldlm_handle2lock(lockh);
4735         LASSERT(lock != NULL);
4736         LASSERT(ldlm_has_layout(lock));
4737
4738         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4739                    PFID(&lli->lli_fid), inode);
4740
4741         /* in case this is a caching lock and reinstate with new inode */
4742         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4743
4744         lock_res_and_lock(lock);
4745         lvb_ready = ldlm_is_lvb_ready(lock);
4746         unlock_res_and_lock(lock);
4747
4748         /* checking lvb_ready is racy but this is okay. The worst case is
4749          * that multi processes may configure the file on the same time. */
4750         if (lvb_ready)
4751                 GOTO(out, rc = 0);
4752
4753         rc = ll_layout_fetch(inode, lock);
4754         if (rc < 0)
4755                 GOTO(out, rc);
4756
4757         /* for layout lock, lmm is stored in lock's lvb.
4758          * lvb_data is immutable if the lock is held so it's safe to access it
4759          * without res lock.
4760          *
4761          * set layout to file. Unlikely this will fail as old layout was
4762          * surely eliminated */
4763         memset(&conf, 0, sizeof conf);
4764         conf.coc_opc = OBJECT_CONF_SET;
4765         conf.coc_inode = inode;
4766         conf.coc_lock = lock;
4767         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4768         conf.u.coc_layout.lb_len = lock->l_lvb_len;
4769         rc = ll_layout_conf(inode, &conf);
4770
4771         /* refresh layout failed, need to wait */
4772         wait_layout = rc == -EBUSY;
4773         EXIT;
4774 out:
4775         LDLM_LOCK_PUT(lock);
4776         ldlm_lock_decref(lockh, mode);
4777
4778         /* wait for IO to complete if it's still being used. */
4779         if (wait_layout) {
4780                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4781                        ll_get_fsname(inode->i_sb, NULL, 0),
4782                        PFID(&lli->lli_fid), inode);
4783
4784                 memset(&conf, 0, sizeof conf);
4785                 conf.coc_opc = OBJECT_CONF_WAIT;
4786                 conf.coc_inode = inode;
4787                 rc = ll_layout_conf(inode, &conf);
4788                 if (rc == 0)
4789                         rc = -EAGAIN;
4790
4791                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4792                        ll_get_fsname(inode->i_sb, NULL, 0),
4793                        PFID(&lli->lli_fid), rc);
4794         }
4795         RETURN(rc);
4796 }
4797
4798 /**
4799  * Issue layout intent RPC to MDS.
4800  * \param inode [in]    file inode
4801  * \param intent [in]   layout intent
4802  *
4803  * \retval 0    on success
4804  * \retval < 0  error code
4805  */
4806 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4807 {
4808         struct ll_inode_info  *lli = ll_i2info(inode);
4809         struct ll_sb_info     *sbi = ll_i2sbi(inode);
4810         struct md_op_data     *op_data;
4811         struct lookup_intent it;
4812         struct ptlrpc_request *req;
4813         int rc;
4814         ENTRY;
4815
4816         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4817                                      0, 0, LUSTRE_OPC_ANY, NULL);
4818         if (IS_ERR(op_data))
4819                 RETURN(PTR_ERR(op_data));
4820
4821         op_data->op_data = intent;
4822         op_data->op_data_size = sizeof(*intent);
4823
4824         memset(&it, 0, sizeof(it));
4825         it.it_op = IT_LAYOUT;
4826         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4827             intent->li_opc == LAYOUT_INTENT_TRUNC)
4828                 it.it_flags = FMODE_WRITE;
4829
4830         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4831                           ll_get_fsname(inode->i_sb, NULL, 0),
4832                           PFID(&lli->lli_fid), inode);
4833
4834         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4835                             &ll_md_blocking_ast, 0);
4836         if (it.it_request != NULL)
4837                 ptlrpc_req_finished(it.it_request);
4838         it.it_request = NULL;
4839
4840         ll_finish_md_op_data(op_data);
4841
4842         /* set lock data in case this is a new lock */
4843         if (!rc)
4844                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4845
4846         ll_intent_drop_lock(&it);
4847
4848         RETURN(rc);
4849 }
4850
4851 /**
4852  * This function checks if there exists a LAYOUT lock on the client side,
4853  * or enqueues it if it doesn't have one in cache.
4854  *
4855  * This function will not hold layout lock so it may be revoked any time after
4856  * this function returns. Any operations depend on layout should be redone
4857  * in that case.
4858  *
4859  * This function should be called before lov_io_init() to get an uptodate
4860  * layout version, the caller should save the version number and after IO
4861  * is finished, this function should be called again to verify that layout
4862  * is not changed during IO time.
4863  */
4864 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4865 {
4866         struct ll_inode_info    *lli = ll_i2info(inode);
4867         struct ll_sb_info       *sbi = ll_i2sbi(inode);
4868         struct lustre_handle lockh;
4869         struct layout_intent intent = {
4870                 .li_opc = LAYOUT_INTENT_ACCESS,
4871         };
4872         enum ldlm_mode mode;
4873         int rc;
4874         ENTRY;
4875
4876         *gen = ll_layout_version_get(lli);
4877         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4878                 RETURN(0);
4879
4880         /* sanity checks */
4881         LASSERT(fid_is_sane(ll_inode2fid(inode)));
4882         LASSERT(S_ISREG(inode->i_mode));
4883
4884         /* take layout lock mutex to enqueue layout lock exclusively. */
4885         mutex_lock(&lli->lli_layout_mutex);
4886
4887         while (1) {
4888                 /* mostly layout lock is caching on the local side, so try to
4889                  * match it before grabbing layout lock mutex. */
4890                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4891                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4892                 if (mode != 0) { /* hit cached lock */
4893                         rc = ll_layout_lock_set(&lockh, mode, inode);
4894                         if (rc == -EAGAIN)
4895                                 continue;
4896                         break;
4897                 }
4898
4899                 rc = ll_layout_intent(inode, &intent);
4900                 if (rc != 0)
4901                         break;
4902         }
4903
4904         if (rc == 0)
4905                 *gen = ll_layout_version_get(lli);
4906         mutex_unlock(&lli->lli_layout_mutex);
4907
4908         RETURN(rc);
4909 }
4910
4911 /**
4912  * Issue layout intent RPC indicating where in a file an IO is about to write.
4913  *
4914  * \param[in] inode     file inode.
4915  * \param[in] ext       write range with start offset of fille in bytes where
4916  *                      an IO is about to write, and exclusive end offset in
4917  *                      bytes.
4918  *
4919  * \retval 0    on success
4920  * \retval < 0  error code
4921  */
4922 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
4923                            struct lu_extent *ext)
4924 {
4925         struct layout_intent intent = {
4926                 .li_opc = opc,
4927                 .li_extent.e_start = ext->e_start,
4928                 .li_extent.e_end = ext->e_end,
4929         };
4930         int rc;
4931         ENTRY;
4932
4933         rc = ll_layout_intent(inode, &intent);
4934
4935         RETURN(rc);
4936 }
4937
4938 /**
4939  *  This function send a restore request to the MDT
4940  */
4941 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4942 {
4943         struct hsm_user_request *hur;
4944         int                      len, rc;
4945         ENTRY;
4946
4947         len = sizeof(struct hsm_user_request) +
4948               sizeof(struct hsm_user_item);
4949         OBD_ALLOC(hur, len);
4950         if (hur == NULL)
4951                 RETURN(-ENOMEM);
4952
4953         hur->hur_request.hr_action = HUA_RESTORE;
4954         hur->hur_request.hr_archive_id = 0;
4955         hur->hur_request.hr_flags = 0;
4956         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4957                sizeof(hur->hur_user_item[0].hui_fid));
4958         hur->hur_user_item[0].hui_extent.offset = offset;
4959         hur->hur_user_item[0].hui_extent.length = length;
4960         hur->hur_request.hr_itemcount = 1;
4961         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
4962                            len, hur, NULL);
4963         OBD_FREE(hur, len);
4964         RETURN(rc);
4965 }