lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {
  57         struct inode    *sp_inode;
  58         __u16           sp_mirror_id;
  59 };
  60
  61 static int
  62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  63
  64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  65                           bool *lease_broken);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                      ATTR_MTIME | ATTR_MTIME_SET |
 104                                      ATTR_CTIME | ATTR_CTIME_SET;
 105         op_data->op_attr_blocks = inode->i_blocks;
 106         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 107         op_data->op_handle = och->och_fh;
 108
 109         if (och->och_flags & FMODE_WRITE &&
 110             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 111                 /* For HSM: if inode data has been modified, pack it so that
 112                  * MDT can set data dirty flag in the archive. */
 113                 op_data->op_bias |= MDS_DATA_MODIFIED;
 114
 115         EXIT;
 116 }
 117
 118 /**
 119  * Perform a close, possibly with a bias.
 120  * The meaning of "data" depends on the value of "bias".
 121  *
 122  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 123  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 124  * swap layouts with.
 125  */
 126 static int ll_close_inode_openhandle(struct inode *inode,
 127                                      struct obd_client_handle *och,
 128                                      enum mds_op_bias bias, void *data)
 129 {
 130         struct obd_export *md_exp = ll_i2mdexp(inode);
 131         const struct ll_inode_info *lli = ll_i2info(inode);
 132         struct md_op_data *op_data;
 133         struct ptlrpc_request *req = NULL;
 134         int rc;
 135         ENTRY;
 136
 137         if (class_exp2obd(md_exp) == NULL) {
 138                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 139                        ll_get_fsname(inode->i_sb, NULL, 0),
 140                        PFID(&lli->lli_fid));
 141                 GOTO(out, rc = 0);
 142         }
 143
 144         OBD_ALLOC_PTR(op_data);
 145         /* We leak openhandle and request here on error, but not much to be
 146          * done in OOM case since app won't retry close on error either. */
 147         if (op_data == NULL)
 148                 GOTO(out, rc = -ENOMEM);
 149
 150         ll_prepare_close(inode, op_data, och);
 151         switch (bias) {
 152         case MDS_CLOSE_LAYOUT_MERGE:
 153                 /* merge blocks from the victim inode */
 154                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 155                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 156         case MDS_CLOSE_LAYOUT_SPLIT:
 157         case MDS_CLOSE_LAYOUT_SWAP: {
 158                 struct split_param *sp = data;
 159
 160                 LASSERT(data != NULL);
 161                 op_data->op_bias |= bias;
 162                 op_data->op_data_version = 0;
 163                 op_data->op_lease_handle = och->och_lease_handle;
 164                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 165                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 166                         op_data->op_mirror_id = sp->sp_mirror_id;
 167                 } else {
 168                         op_data->op_fid2 = *ll_inode2fid(data);
 169                 }
 170                 break;
 171         }
 172
 173         case MDS_CLOSE_RESYNC_DONE: {
 174                 struct ll_ioc_lease *ioc = data;
 175
 176                 LASSERT(data != NULL);
 177                 op_data->op_attr_blocks +=
 178                         ioc->lil_count * op_data->op_attr_blocks;
 179                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 180                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 181
 182                 op_data->op_lease_handle = och->och_lease_handle;
 183                 op_data->op_data = &ioc->lil_ids[0];
 184                 op_data->op_data_size =
 185                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 186                 break;
 187         }
 188
 189         case MDS_HSM_RELEASE:
 190                 LASSERT(data != NULL);
 191                 op_data->op_bias |= MDS_HSM_RELEASE;
 192                 op_data->op_data_version = *(__u64 *)data;
 193                 op_data->op_lease_handle = och->och_lease_handle;
 194                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 195                 break;
 196
 197         default:
 198                 LASSERT(data == NULL);
 199                 break;
 200         }
 201
 202         if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
 203                 op_data->op_attr.ia_valid |= MDS_ATTR_LSIZE;
 204         if (!(op_data->op_attr.ia_valid & ATTR_BLOCKS))
 205                 op_data->op_attr.ia_valid |= MDS_ATTR_LBLOCKS;
 206
 207         rc = md_close(md_exp, op_data, och->och_mod, &req);
 208         if (rc != 0 && rc != -EINTR)
 209                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 210                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 211
 212         if (rc == 0 && op_data->op_bias & bias) {
 213                 struct mdt_body *body;
 214
 215                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 216                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 217                         rc = -EBUSY;
 218         }
 219
 220         ll_finish_md_op_data(op_data);
 221         EXIT;
 222 out:
 223
 224         md_clear_open_replay_data(md_exp, och);
 225         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 226         OBD_FREE_PTR(och);
 227
 228         ptlrpc_req_finished(req);       /* This is close request */
 229         return rc;
 230 }
 231
 232 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 233 {
 234         struct ll_inode_info *lli = ll_i2info(inode);
 235         struct obd_client_handle **och_p;
 236         struct obd_client_handle *och;
 237         __u64 *och_usecount;
 238         int rc = 0;
 239         ENTRY;
 240
 241         if (fmode & FMODE_WRITE) {
 242                 och_p = &lli->lli_mds_write_och;
 243                 och_usecount = &lli->lli_open_fd_write_count;
 244         } else if (fmode & FMODE_EXEC) {
 245                 och_p = &lli->lli_mds_exec_och;
 246                 och_usecount = &lli->lli_open_fd_exec_count;
 247         } else {
 248                 LASSERT(fmode & FMODE_READ);
 249                 och_p = &lli->lli_mds_read_och;
 250                 och_usecount = &lli->lli_open_fd_read_count;
 251         }
 252
 253         mutex_lock(&lli->lli_och_mutex);
 254         if (*och_usecount > 0) {
 255                 /* There are still users of this handle, so skip
 256                  * freeing it. */
 257                 mutex_unlock(&lli->lli_och_mutex);
 258                 RETURN(0);
 259         }
 260
 261         och = *och_p;
 262         *och_p = NULL;
 263         mutex_unlock(&lli->lli_och_mutex);
 264
 265         if (och != NULL) {
 266                 /* There might be a race and this handle may already
 267                  * be closed. */
 268                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 269         }
 270
 271         RETURN(rc);
 272 }
 273
 274 static int ll_md_close(struct inode *inode, struct file *file)
 275 {
 276         union ldlm_policy_data policy = {
 277                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 278         };
 279         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 280         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 281         struct ll_inode_info *lli = ll_i2info(inode);
 282         struct lustre_handle lockh;
 283         enum ldlm_mode lockmode;
 284         int rc = 0;
 285         ENTRY;
 286
 287         /* clear group lock, if present */
 288         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 289                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 290
 291         if (fd->fd_lease_och != NULL) {
 292                 bool lease_broken;
 293
 294                 /* Usually the lease is not released when the
 295                  * application crashed, we need to release here. */
 296                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 297                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 298                         PFID(&lli->lli_fid), rc, lease_broken);
 299
 300                 fd->fd_lease_och = NULL;
 301         }
 302
 303         if (fd->fd_och != NULL) {
 304                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 305                 fd->fd_och = NULL;
 306                 GOTO(out, rc);
 307         }
 308
 309         /* Let's see if we have good enough OPEN lock on the file and if
 310            we can skip talking to MDS */
 311         mutex_lock(&lli->lli_och_mutex);
 312         if (fd->fd_omode & FMODE_WRITE) {
 313                 lockmode = LCK_CW;
 314                 LASSERT(lli->lli_open_fd_write_count);
 315                 lli->lli_open_fd_write_count--;
 316         } else if (fd->fd_omode & FMODE_EXEC) {
 317                 lockmode = LCK_PR;
 318                 LASSERT(lli->lli_open_fd_exec_count);
 319                 lli->lli_open_fd_exec_count--;
 320         } else {
 321                 lockmode = LCK_CR;
 322                 LASSERT(lli->lli_open_fd_read_count);
 323                 lli->lli_open_fd_read_count--;
 324         }
 325         mutex_unlock(&lli->lli_och_mutex);
 326
 327         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 328                            LDLM_IBITS, &policy, lockmode, &lockh))
 329                 rc = ll_md_real_close(inode, fd->fd_omode);
 330
 331 out:
 332         LUSTRE_FPRIVATE(file) = NULL;
 333         ll_file_data_put(fd);
 334
 335         RETURN(rc);
 336 }
 337
 338 /* While this returns an error code, fput() the caller does not, so we need
 339  * to make every effort to clean up all of our state here.  Also, applications
 340  * rarely check close errors and even if an error is returned they will not
 341  * re-try the close call.
 342  */
 343 int ll_file_release(struct inode *inode, struct file *file)
 344 {
 345         struct ll_file_data *fd;
 346         struct ll_sb_info *sbi = ll_i2sbi(inode);
 347         struct ll_inode_info *lli = ll_i2info(inode);
 348         int rc;
 349         ENTRY;
 350
 351         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 352                PFID(ll_inode2fid(inode)), inode);
 353
 354         if (inode->i_sb->s_root != file_dentry(file))
 355                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 356         fd = LUSTRE_FPRIVATE(file);
 357         LASSERT(fd != NULL);
 358
 359         /* The last ref on @file, maybe not the the owner pid of statahead,
 360          * because parent and child process can share the same file handle. */
 361         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 362                 ll_deauthorize_statahead(inode, fd);
 363
 364         if (inode->i_sb->s_root == file_dentry(file)) {
 365                 LUSTRE_FPRIVATE(file) = NULL;
 366                 ll_file_data_put(fd);
 367                 RETURN(0);
 368         }
 369
 370         if (!S_ISDIR(inode->i_mode)) {
 371                 if (lli->lli_clob != NULL)
 372                         lov_read_and_clear_async_rc(lli->lli_clob);
 373                 lli->lli_async_rc = 0;
 374         }
 375
 376         rc = ll_md_close(inode, file);
 377
 378         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 379                 libcfs_debug_dumplog();
 380
 381         RETURN(rc);
 382 }
 383
 384 static inline int ll_dom_readpage(void *data, struct page *page)
 385 {
 386         struct niobuf_local *lnb = data;
 387         void *kaddr;
 388
 389         kaddr = ll_kmap_atomic(page, KM_USER0);
 390         memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
 391         if (lnb->lnb_len < PAGE_SIZE)
 392                 memset(kaddr + lnb->lnb_len, 0,
 393                        PAGE_SIZE - lnb->lnb_len);
 394         flush_dcache_page(page);
 395         SetPageUptodate(page);
 396         ll_kunmap_atomic(kaddr, KM_USER0);
 397         unlock_page(page);
 398
 399         return 0;
 400 }
 401
 402 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 403                         struct lookup_intent *it)
 404 {
 405         struct ll_inode_info *lli = ll_i2info(inode);
 406         struct cl_object *obj = lli->lli_clob;
 407         struct address_space *mapping = inode->i_mapping;
 408         struct page *vmpage;
 409         struct niobuf_remote *rnb;
 410         char *data;
 411         struct lu_env *env;
 412         struct cl_io *io;
 413         __u16 refcheck;
 414         struct lustre_handle lockh;
 415         struct ldlm_lock *lock;
 416         unsigned long index, start;
 417         struct niobuf_local lnb;
 418         int rc;
 419         bool dom_lock = false;
 420
 421         ENTRY;
 422
 423         if (obj == NULL)
 424                 RETURN_EXIT;
 425
 426         if (it->it_lock_mode != 0) {
 427                 lockh.cookie = it->it_lock_handle;
 428                 lock = ldlm_handle2lock(&lockh);
 429                 if (lock != NULL)
 430                         dom_lock = ldlm_has_dom(lock);
 431                 LDLM_LOCK_PUT(lock);
 432         }
 433
 434         if (!dom_lock)
 435                 RETURN_EXIT;
 436
 437         env = cl_env_get(&refcheck);
 438         if (IS_ERR(env))
 439                 RETURN_EXIT;
 440
 441         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 442                                    RCL_SERVER))
 443                 GOTO(out_env, rc = -ENODATA);
 444
 445         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 446         data = (char *)rnb + sizeof(*rnb);
 447
 448         if (rnb == NULL || rnb->rnb_len == 0)
 449                 GOTO(out_env, rc = 0);
 450
 451         CDEBUG(D_INFO, "Get data buffer along with open, len %i, i_size %llu\n",
 452                rnb->rnb_len, i_size_read(inode));
 453
 454         io = vvp_env_thread_io(env);
 455         io->ci_obj = obj;
 456         io->ci_ignore_layout = 1;
 457         rc = cl_io_init(env, io, CIT_MISC, obj);
 458         if (rc)
 459                 GOTO(out_io, rc);
 460
 461         lnb.lnb_file_offset = rnb->rnb_offset;
 462         start = lnb.lnb_file_offset / PAGE_SIZE;
 463         index = 0;
 464         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 465         lnb.lnb_page_offset = 0;
 466         do {
 467                 struct cl_page *clp;
 468
 469                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 470                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 471                 if (lnb.lnb_len > PAGE_SIZE)
 472                         lnb.lnb_len = PAGE_SIZE;
 473
 474                 vmpage = read_cache_page(mapping, index + start,
 475                                          ll_dom_readpage, &lnb);
 476                 if (IS_ERR(vmpage)) {
 477                         CWARN("%s: cannot fill page %lu for "DFID
 478                               " with data: rc = %li\n",
 479                               ll_get_fsname(inode->i_sb, NULL, 0),
 480                               index + start, PFID(lu_object_fid(&obj->co_lu)),
 481                               PTR_ERR(vmpage));
 482                         break;
 483                 }
 484                 lock_page(vmpage);
 485                 clp = cl_page_find(env, obj, vmpage->index, vmpage,
 486                                    CPT_CACHEABLE);
 487                 if (IS_ERR(clp)) {
 488                         unlock_page(vmpage);
 489                         put_page(vmpage);
 490                         GOTO(out_io, rc = PTR_ERR(clp));
 491                 }
 492
 493                 /* export page */
 494                 cl_page_export(env, clp, 1);
 495                 cl_page_put(env, clp);
 496                 unlock_page(vmpage);
 497                 put_page(vmpage);
 498                 index++;
 499         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 500         rc = 0;
 501         EXIT;
 502 out_io:
 503         cl_io_fini(env, io);
 504 out_env:
 505         cl_env_put(env, &refcheck);
 506 }
 507
 508 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 509                                 struct lookup_intent *itp)
 510 {
 511         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 512         struct dentry *parent = de->d_parent;
 513         const char *name = NULL;
 514         int len = 0;
 515         struct md_op_data *op_data;
 516         struct ptlrpc_request *req = NULL;
 517         int rc;
 518         ENTRY;
 519
 520         LASSERT(parent != NULL);
 521         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 522
 523         /* if server supports open-by-fid, or file name is invalid, don't pack
 524          * name in open request */
 525         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
 526             lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
 527                 name = de->d_name.name;
 528                 len = de->d_name.len;
 529         }
 530
 531         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 532                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 533         if (IS_ERR(op_data))
 534                 RETURN(PTR_ERR(op_data));
 535         op_data->op_data = lmm;
 536         op_data->op_data_size = lmmsize;
 537
 538         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 539                             &ll_md_blocking_ast, 0);
 540         ll_finish_md_op_data(op_data);
 541         if (rc == -ESTALE) {
 542                 /* reason for keep own exit path - don`t flood log
 543                  * with messages with -ESTALE errors.
 544                  */
 545                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 546                      it_open_error(DISP_OPEN_OPEN, itp))
 547                         GOTO(out, rc);
 548                 ll_release_openhandle(de, itp);
 549                 GOTO(out, rc);
 550         }
 551
 552         if (it_disposition(itp, DISP_LOOKUP_NEG))
 553                 GOTO(out, rc = -ENOENT);
 554
 555         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 556                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 557                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 558                 GOTO(out, rc);
 559         }
 560
 561         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 562
 563         if (!rc && itp->it_lock_mode) {
 564                 ll_dom_finish_open(de->d_inode, req, itp);
 565                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 566         }
 567
 568 out:
 569         ptlrpc_req_finished(req);
 570         ll_intent_drop_lock(itp);
 571
 572         /* We did open by fid, but by the time we got to the server,
 573          * the object disappeared. If this is a create, we cannot really
 574          * tell the userspace that the file it was trying to create
 575          * does not exist. Instead let's return -ESTALE, and the VFS will
 576          * retry the create with LOOKUP_REVAL that we are going to catch
 577          * in ll_revalidate_dentry() and use lookup then.
 578          */
 579         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 580                 rc = -ESTALE;
 581
 582         RETURN(rc);
 583 }
 584
 585 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 586                        struct obd_client_handle *och)
 587 {
 588         struct mdt_body *body;
 589
 590         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 591         och->och_fh = body->mbo_handle;
 592         och->och_fid = body->mbo_fid1;
 593         och->och_lease_handle.cookie = it->it_lock_handle;
 594         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 595         och->och_flags = it->it_flags;
 596
 597         return md_set_open_replay_data(md_exp, och, it);
 598 }
 599
 600 static int ll_local_open(struct file *file, struct lookup_intent *it,
 601                          struct ll_file_data *fd, struct obd_client_handle *och)
 602 {
 603         struct inode *inode = file_inode(file);
 604         ENTRY;
 605
 606         LASSERT(!LUSTRE_FPRIVATE(file));
 607
 608         LASSERT(fd != NULL);
 609
 610         if (och) {
 611                 int rc;
 612
 613                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 614                 if (rc != 0)
 615                         RETURN(rc);
 616         }
 617
 618         LUSTRE_FPRIVATE(file) = fd;
 619         ll_readahead_init(inode, &fd->fd_ras);
 620         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 621
 622         /* ll_cl_context initialize */
 623         rwlock_init(&fd->fd_lock);
 624         INIT_LIST_HEAD(&fd->fd_lccs);
 625
 626         RETURN(0);
 627 }
 628
 629 /* Open a file, and (for the very first open) create objects on the OSTs at
 630  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 631  * creation or open until ll_lov_setstripe() ioctl is called.
 632  *
 633  * If we already have the stripe MD locally then we don't request it in
 634  * md_open(), by passing a lmm_size = 0.
 635  *
 636  * It is up to the application to ensure no other processes open this file
 637  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 638  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 639  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 640  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 641  */
 642 int ll_file_open(struct inode *inode, struct file *file)
 643 {
 644         struct ll_inode_info *lli = ll_i2info(inode);
 645         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 646                                           .it_flags = file->f_flags };
 647         struct obd_client_handle **och_p = NULL;
 648         __u64 *och_usecount = NULL;
 649         struct ll_file_data *fd;
 650         int rc = 0;
 651         ENTRY;
 652
 653         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 654                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 655
 656         it = file->private_data; /* XXX: compat macro */
 657         file->private_data = NULL; /* prevent ll_local_open assertion */
 658
 659         fd = ll_file_data_get();
 660         if (fd == NULL)
 661                 GOTO(out_nofiledata, rc = -ENOMEM);
 662
 663         fd->fd_file = file;
 664         if (S_ISDIR(inode->i_mode))
 665                 ll_authorize_statahead(inode, fd);
 666
 667         if (inode->i_sb->s_root == file_dentry(file)) {
 668                 LUSTRE_FPRIVATE(file) = fd;
 669                 RETURN(0);
 670         }
 671
 672         if (!it || !it->it_disposition) {
 673                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 674                  * because everything but O_ACCMODE mask was stripped from
 675                  * there */
 676                 if ((oit.it_flags + 1) & O_ACCMODE)
 677                         oit.it_flags++;
 678                 if (file->f_flags & O_TRUNC)
 679                         oit.it_flags |= FMODE_WRITE;
 680
 681                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 682                  * dentry_open after call to open_namei that checks permissions.
 683                  * Only nfsd_open call dentry_open directly without checking
 684                  * permissions and because of that this code below is safe. */
 685                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 686                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 687
 688                 /* We do not want O_EXCL here, presumably we opened the file
 689                  * already? XXX - NFS implications? */
 690                 oit.it_flags &= ~O_EXCL;
 691
 692                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 693                  * created if necessary, then "IT_CREAT" should be set to keep
 694                  * consistent with it */
 695                 if (oit.it_flags & O_CREAT)
 696                         oit.it_op |= IT_CREAT;
 697
 698                 it = &oit;
 699         }
 700
 701 restart:
 702         /* Let's see if we have file open on MDS already. */
 703         if (it->it_flags & FMODE_WRITE) {
 704                 och_p = &lli->lli_mds_write_och;
 705                 och_usecount = &lli->lli_open_fd_write_count;
 706         } else if (it->it_flags & FMODE_EXEC) {
 707                 och_p = &lli->lli_mds_exec_och;
 708                 och_usecount = &lli->lli_open_fd_exec_count;
 709          } else {
 710                 och_p = &lli->lli_mds_read_och;
 711                 och_usecount = &lli->lli_open_fd_read_count;
 712         }
 713
 714         mutex_lock(&lli->lli_och_mutex);
 715         if (*och_p) { /* Open handle is present */
 716                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 717                         /* Well, there's extra open request that we do not need,
 718                            let's close it somehow. This will decref request. */
 719                         rc = it_open_error(DISP_OPEN_OPEN, it);
 720                         if (rc) {
 721                                 mutex_unlock(&lli->lli_och_mutex);
 722                                 GOTO(out_openerr, rc);
 723                         }
 724
 725                         ll_release_openhandle(file_dentry(file), it);
 726                 }
 727                 (*och_usecount)++;
 728
 729                 rc = ll_local_open(file, it, fd, NULL);
 730                 if (rc) {
 731                         (*och_usecount)--;
 732                         mutex_unlock(&lli->lli_och_mutex);
 733                         GOTO(out_openerr, rc);
 734                 }
 735         } else {
 736                 LASSERT(*och_usecount == 0);
 737                 if (!it->it_disposition) {
 738                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 739                         /* We cannot just request lock handle now, new ELC code
 740                            means that one of other OPEN locks for this file
 741                            could be cancelled, and since blocking ast handler
 742                            would attempt to grab och_mutex as well, that would
 743                            result in a deadlock */
 744                         mutex_unlock(&lli->lli_och_mutex);
 745                         /*
 746                          * Normally called under two situations:
 747                          * 1. NFS export.
 748                          * 2. A race/condition on MDS resulting in no open
 749                          *    handle to be returned from LOOKUP|OPEN request,
 750                          *    for example if the target entry was a symlink.
 751                          *
 752                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 753                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 754                          *  bit so that it's not confusing later callers.
 755                          *
 756                          *  NB; when ldd is NULL, it must have come via normal
 757                          *  lookup path only, since ll_iget_for_nfs always calls
 758                          *  ll_d_init().
 759                          */
 760                         if (ldd && ldd->lld_nfs_dentry) {
 761                                 ldd->lld_nfs_dentry = 0;
 762                                 it->it_flags |= MDS_OPEN_LOCK;
 763                         }
 764
 765                          /*
 766                          * Always specify MDS_OPEN_BY_FID because we don't want
 767                          * to get file with different fid.
 768                          */
 769                         it->it_flags |= MDS_OPEN_BY_FID;
 770                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 771                                                  it);
 772                         if (rc)
 773                                 GOTO(out_openerr, rc);
 774
 775                         goto restart;
 776                 }
 777                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 778                 if (!*och_p)
 779                         GOTO(out_och_free, rc = -ENOMEM);
 780
 781                 (*och_usecount)++;
 782
 783                 /* md_intent_lock() didn't get a request ref if there was an
 784                  * open error, so don't do cleanup on the request here
 785                  * (bug 3430) */
 786                 /* XXX (green): Should not we bail out on any error here, not
 787                  * just open error? */
 788                 rc = it_open_error(DISP_OPEN_OPEN, it);
 789                 if (rc != 0)
 790                         GOTO(out_och_free, rc);
 791
 792                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 793                          "inode %p: disposition %x, status %d\n", inode,
 794                          it_disposition(it, ~0), it->it_status);
 795
 796                 rc = ll_local_open(file, it, fd, *och_p);
 797                 if (rc)
 798                         GOTO(out_och_free, rc);
 799         }
 800         mutex_unlock(&lli->lli_och_mutex);
 801         fd = NULL;
 802
 803         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 804            different kind of OPEN lock for this same inode gets cancelled
 805            by ldlm_cancel_lru */
 806         if (!S_ISREG(inode->i_mode))
 807                 GOTO(out_och_free, rc);
 808
 809         cl_lov_delay_create_clear(&file->f_flags);
 810         GOTO(out_och_free, rc);
 811
 812 out_och_free:
 813         if (rc) {
 814                 if (och_p && *och_p) {
 815                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 816                         *och_p = NULL; /* OBD_FREE writes some magic there */
 817                         (*och_usecount)--;
 818                 }
 819                 mutex_unlock(&lli->lli_och_mutex);
 820
 821 out_openerr:
 822                 if (lli->lli_opendir_key == fd)
 823                         ll_deauthorize_statahead(inode, fd);
 824                 if (fd != NULL)
 825                         ll_file_data_put(fd);
 826         } else {
 827                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 828         }
 829
 830 out_nofiledata:
 831         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 832                 ptlrpc_req_finished(it->it_request);
 833                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 834         }
 835
 836         return rc;
 837 }
 838
 839 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 840                         struct ldlm_lock_desc *desc, void *data, int flag)
 841 {
 842         int rc;
 843         struct lustre_handle lockh;
 844         ENTRY;
 845
 846         switch (flag) {
 847         case LDLM_CB_BLOCKING:
 848                 ldlm_lock2handle(lock, &lockh);
 849                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 850                 if (rc < 0) {
 851                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 852                         RETURN(rc);
 853                 }
 854                 break;
 855         case LDLM_CB_CANCELING:
 856                 /* do nothing */
 857                 break;
 858         }
 859         RETURN(0);
 860 }
 861
 862 /**
 863  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 864  * and save it as fd->fd_och so as to force client to reopen the file even
 865  * if it has an open lock in cache already.
 866  */
 867 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 868                                 struct lustre_handle *old_handle)
 869 {
 870         struct ll_inode_info *lli = ll_i2info(inode);
 871         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 872         struct obd_client_handle **och_p;
 873         __u64 *och_usecount;
 874         int rc = 0;
 875         ENTRY;
 876
 877         /* Get the openhandle of the file */
 878         mutex_lock(&lli->lli_och_mutex);
 879         if (fd->fd_lease_och != NULL)
 880                 GOTO(out_unlock, rc = -EBUSY);
 881
 882         if (fd->fd_och == NULL) {
 883                 if (file->f_mode & FMODE_WRITE) {
 884                         LASSERT(lli->lli_mds_write_och != NULL);
 885                         och_p = &lli->lli_mds_write_och;
 886                         och_usecount = &lli->lli_open_fd_write_count;
 887                 } else {
 888                         LASSERT(lli->lli_mds_read_och != NULL);
 889                         och_p = &lli->lli_mds_read_och;
 890                         och_usecount = &lli->lli_open_fd_read_count;
 891                 }
 892
 893                 if (*och_usecount > 1)
 894                         GOTO(out_unlock, rc = -EBUSY);
 895
 896                 fd->fd_och = *och_p;
 897                 *och_usecount = 0;
 898                 *och_p = NULL;
 899         }
 900
 901         *old_handle = fd->fd_och->och_fh;
 902
 903         EXIT;
 904 out_unlock:
 905         mutex_unlock(&lli->lli_och_mutex);
 906         return rc;
 907 }
 908
 909 /**
 910  * Release ownership on lli_mds_*_och when putting back a file lease.
 911  */
 912 static int ll_lease_och_release(struct inode *inode, struct file *file)
 913 {
 914         struct ll_inode_info *lli = ll_i2info(inode);
 915         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 916         struct obd_client_handle **och_p;
 917         struct obd_client_handle *old_och = NULL;
 918         __u64 *och_usecount;
 919         int rc = 0;
 920         ENTRY;
 921
 922         mutex_lock(&lli->lli_och_mutex);
 923         if (file->f_mode & FMODE_WRITE) {
 924                 och_p = &lli->lli_mds_write_och;
 925                 och_usecount = &lli->lli_open_fd_write_count;
 926         } else {
 927                 och_p = &lli->lli_mds_read_och;
 928                 och_usecount = &lli->lli_open_fd_read_count;
 929         }
 930
 931         /* The file may have been open by another process (broken lease) so
 932          * *och_p is not NULL. In this case we should simply increase usecount
 933          * and close fd_och.
 934          */
 935         if (*och_p != NULL) {
 936                 old_och = fd->fd_och;
 937                 (*och_usecount)++;
 938         } else {
 939                 *och_p = fd->fd_och;
 940                 *och_usecount = 1;
 941         }
 942         fd->fd_och = NULL;
 943         mutex_unlock(&lli->lli_och_mutex);
 944
 945         if (old_och != NULL)
 946                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 947
 948         RETURN(rc);
 949 }
 950
 951 /**
 952  * Acquire a lease and open the file.
 953  */
 954 static struct obd_client_handle *
 955 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 956               __u64 open_flags)
 957 {
 958         struct lookup_intent it = { .it_op = IT_OPEN };
 959         struct ll_sb_info *sbi = ll_i2sbi(inode);
 960         struct md_op_data *op_data;
 961         struct ptlrpc_request *req = NULL;
 962         struct lustre_handle old_handle = { 0 };
 963         struct obd_client_handle *och = NULL;
 964         int rc;
 965         int rc2;
 966         ENTRY;
 967
 968         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 969                 RETURN(ERR_PTR(-EINVAL));
 970
 971         if (file != NULL) {
 972                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 973                         RETURN(ERR_PTR(-EPERM));
 974
 975                 rc = ll_lease_och_acquire(inode, file, &old_handle);
 976                 if (rc)
 977                         RETURN(ERR_PTR(rc));
 978         }
 979
 980         OBD_ALLOC_PTR(och);
 981         if (och == NULL)
 982                 RETURN(ERR_PTR(-ENOMEM));
 983
 984         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 985                                         LUSTRE_OPC_ANY, NULL);
 986         if (IS_ERR(op_data))
 987                 GOTO(out, rc = PTR_ERR(op_data));
 988
 989         /* To tell the MDT this openhandle is from the same owner */
 990         op_data->op_handle = old_handle;
 991
 992         it.it_flags = fmode | open_flags;
 993         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 994         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
 995                             &ll_md_blocking_lease_ast,
 996         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
 997          * it can be cancelled which may mislead applications that the lease is
 998          * broken;
 999          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1000          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1001          * doesn't deal with openhandle, so normal openhandle will be leaked. */
1002                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1003         ll_finish_md_op_data(op_data);
1004         ptlrpc_req_finished(req);
1005         if (rc < 0)
1006                 GOTO(out_release_it, rc);
1007
1008         if (it_disposition(&it, DISP_LOOKUP_NEG))
1009                 GOTO(out_release_it, rc = -ENOENT);
1010
1011         rc = it_open_error(DISP_OPEN_OPEN, &it);
1012         if (rc)
1013                 GOTO(out_release_it, rc);
1014
1015         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1016         ll_och_fill(sbi->ll_md_exp, &it, och);
1017
1018         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1019                 GOTO(out_close, rc = -EOPNOTSUPP);
1020
1021         /* already get lease, handle lease lock */
1022         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1023         if (it.it_lock_mode == 0 ||
1024             it.it_lock_bits != MDS_INODELOCK_OPEN) {
1025                 /* open lock must return for lease */
1026                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1027                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
1028                         it.it_lock_bits);
1029                 GOTO(out_close, rc = -EPROTO);
1030         }
1031
1032         ll_intent_release(&it);
1033         RETURN(och);
1034
1035 out_close:
1036         /* Cancel open lock */
1037         if (it.it_lock_mode != 0) {
1038                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1039                                             it.it_lock_mode);
1040                 it.it_lock_mode = 0;
1041                 och->och_lease_handle.cookie = 0ULL;
1042         }
1043         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1044         if (rc2 < 0)
1045                 CERROR("%s: error closing file "DFID": %d\n",
1046                        ll_get_fsname(inode->i_sb, NULL, 0),
1047                        PFID(&ll_i2info(inode)->lli_fid), rc2);
1048         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1049 out_release_it:
1050         ll_intent_release(&it);
1051 out:
1052         if (och != NULL)
1053                 OBD_FREE_PTR(och);
1054         RETURN(ERR_PTR(rc));
1055 }
1056
1057 /**
1058  * Check whether a layout swap can be done between two inodes.
1059  *
1060  * \param[in] inode1  First inode to check
1061  * \param[in] inode2  Second inode to check
1062  *
1063  * \retval 0 on success, layout swap can be performed between both inodes
1064  * \retval negative error code if requirements are not met
1065  */
1066 static int ll_check_swap_layouts_validity(struct inode *inode1,
1067                                           struct inode *inode2)
1068 {
1069         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1070                 return -EINVAL;
1071
1072         if (inode_permission(inode1, MAY_WRITE) ||
1073             inode_permission(inode2, MAY_WRITE))
1074                 return -EPERM;
1075
1076         if (inode1->i_sb != inode2->i_sb)
1077                 return -EXDEV;
1078
1079         return 0;
1080 }
1081
1082 static int ll_swap_layouts_close(struct obd_client_handle *och,
1083                                  struct inode *inode, struct inode *inode2)
1084 {
1085         const struct lu_fid     *fid1 = ll_inode2fid(inode);
1086         const struct lu_fid     *fid2;
1087         int                      rc;
1088         ENTRY;
1089
1090         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1091                ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1092
1093         rc = ll_check_swap_layouts_validity(inode, inode2);
1094         if (rc < 0)
1095                 GOTO(out_free_och, rc);
1096
1097         /* We now know that inode2 is a lustre inode */
1098         fid2 = ll_inode2fid(inode2);
1099
1100         rc = lu_fid_cmp(fid1, fid2);
1101         if (rc == 0)
1102                 GOTO(out_free_och, rc = -EINVAL);
1103
1104         /* Close the file and {swap,merge} layouts between inode & inode2.
1105          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1106          * because we still need it to pack l_remote_handle to MDT. */
1107         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1108                                        inode2);
1109
1110         och = NULL; /* freed in ll_close_inode_openhandle() */
1111
1112 out_free_och:
1113         if (och != NULL)
1114                 OBD_FREE_PTR(och);
1115
1116         RETURN(rc);
1117 }
1118
1119 /**
1120  * Release lease and close the file.
1121  * It will check if the lease has ever broken.
1122  */
1123 static int ll_lease_close_intent(struct obd_client_handle *och,
1124                                  struct inode *inode,
1125                                  bool *lease_broken, enum mds_op_bias bias,
1126                                  void *data)
1127 {
1128         struct ldlm_lock *lock;
1129         bool cancelled = true;
1130         int rc;
1131         ENTRY;
1132
1133         lock = ldlm_handle2lock(&och->och_lease_handle);
1134         if (lock != NULL) {
1135                 lock_res_and_lock(lock);
1136                 cancelled = ldlm_is_cancel(lock);
1137                 unlock_res_and_lock(lock);
1138                 LDLM_LOCK_PUT(lock);
1139         }
1140
1141         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1142                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1143
1144         if (lease_broken != NULL)
1145                 *lease_broken = cancelled;
1146
1147         if (!cancelled && !bias)
1148                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1149
1150         if (cancelled) { /* no need to excute intent */
1151                 bias = 0;
1152                 data = NULL;
1153         }
1154
1155         rc = ll_close_inode_openhandle(inode, och, bias, data);
1156         RETURN(rc);
1157 }
1158
1159 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1160                           bool *lease_broken)
1161 {
1162         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1163 }
1164
1165 /**
1166  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1167  */
1168 static int ll_lease_file_resync(struct obd_client_handle *och,
1169                                 struct inode *inode)
1170 {
1171         struct ll_sb_info *sbi = ll_i2sbi(inode);
1172         struct md_op_data *op_data;
1173         __u64 data_version_unused;
1174         int rc;
1175         ENTRY;
1176
1177         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1178                                      LUSTRE_OPC_ANY, NULL);
1179         if (IS_ERR(op_data))
1180                 RETURN(PTR_ERR(op_data));
1181
1182         /* before starting file resync, it's necessary to clean up page cache
1183          * in client memory, otherwise once the layout version is increased,
1184          * writing back cached data will be denied the OSTs. */
1185         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1186         if (rc)
1187                 GOTO(out, rc);
1188
1189         op_data->op_handle = och->och_lease_handle;
1190         rc = md_file_resync(sbi->ll_md_exp, op_data);
1191         if (rc)
1192                 GOTO(out, rc);
1193
1194         EXIT;
1195 out:
1196         ll_finish_md_op_data(op_data);
1197         return rc;
1198 }
1199
1200 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1201 {
1202         struct ll_inode_info *lli = ll_i2info(inode);
1203         struct cl_object *obj = lli->lli_clob;
1204         struct cl_attr *attr = vvp_env_thread_attr(env);
1205         s64 atime;
1206         s64 mtime;
1207         s64 ctime;
1208         int rc = 0;
1209
1210         ENTRY;
1211
1212         ll_inode_size_lock(inode);
1213
1214         /* Merge timestamps the most recently obtained from MDS with
1215          * timestamps obtained from OSTs.
1216          *
1217          * Do not overwrite atime of inode because it may be refreshed
1218          * by file_accessed() function. If the read was served by cache
1219          * data, there is no RPC to be sent so that atime may not be
1220          * transferred to OSTs at all. MDT only updates atime at close time
1221          * if it's at least 'mdd.*.atime_diff' older.
1222          * All in all, the atime in Lustre does not strictly comply with
1223          * POSIX. Solving this problem needs to send an RPC to MDT for each
1224          * read, this will hurt performance. */
1225         if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1226                 LTIME_S(inode->i_atime) = lli->lli_atime;
1227                 lli->lli_update_atime = 0;
1228         }
1229         LTIME_S(inode->i_mtime) = lli->lli_mtime;
1230         LTIME_S(inode->i_ctime) = lli->lli_ctime;
1231
1232         atime = LTIME_S(inode->i_atime);
1233         mtime = LTIME_S(inode->i_mtime);
1234         ctime = LTIME_S(inode->i_ctime);
1235
1236         cl_object_attr_lock(obj);
1237         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1238                 rc = -EINVAL;
1239         else
1240                 rc = cl_object_attr_get(env, obj, attr);
1241         cl_object_attr_unlock(obj);
1242
1243         if (rc != 0)
1244                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1245
1246         if (atime < attr->cat_atime)
1247                 atime = attr->cat_atime;
1248
1249         if (ctime < attr->cat_ctime)
1250                 ctime = attr->cat_ctime;
1251
1252         if (mtime < attr->cat_mtime)
1253                 mtime = attr->cat_mtime;
1254
1255         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1256                PFID(&lli->lli_fid), attr->cat_size);
1257
1258         i_size_write(inode, attr->cat_size);
1259         inode->i_blocks = attr->cat_blocks;
1260
1261         LTIME_S(inode->i_atime) = atime;
1262         LTIME_S(inode->i_mtime) = mtime;
1263         LTIME_S(inode->i_ctime) = ctime;
1264
1265 out_size_unlock:
1266         ll_inode_size_unlock(inode);
1267
1268         RETURN(rc);
1269 }
1270
1271 /**
1272  * Set designated mirror for I/O.
1273  *
1274  * So far only read, write, and truncated can support to issue I/O to
1275  * designated mirror.
1276  */
1277 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1278 {
1279         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1280
1281         /* clear layout version for generic(non-resync) I/O in case it carries
1282          * stale layout version due to I/O restart */
1283         io->ci_layout_version = 0;
1284
1285         /* FLR: disable non-delay for designated mirror I/O because obviously
1286          * only one mirror is available */
1287         if (fd->fd_designated_mirror > 0) {
1288                 io->ci_ndelay = 0;
1289                 io->ci_designated_mirror = fd->fd_designated_mirror;
1290                 io->ci_layout_version = fd->fd_layout_version;
1291                 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1292                                  * io to ptasks */
1293         }
1294
1295         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1296                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1297 }
1298
1299 static bool file_is_noatime(const struct file *file)
1300 {
1301         const struct vfsmount *mnt = file->f_path.mnt;
1302         const struct inode *inode = file_inode((struct file *)file);
1303
1304         /* Adapted from file_accessed() and touch_atime().*/
1305         if (file->f_flags & O_NOATIME)
1306                 return true;
1307
1308         if (inode->i_flags & S_NOATIME)
1309                 return true;
1310
1311         if (IS_NOATIME(inode))
1312                 return true;
1313
1314         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1315                 return true;
1316
1317         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1318                 return true;
1319
1320         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1321                 return true;
1322
1323         return false;
1324 }
1325
1326 static int ll_file_io_ptask(struct cfs_ptask *ptask);
1327
1328 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1329 {
1330         struct inode *inode = file_inode(file);
1331         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1332
1333         memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1334         init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1335         io->u.ci_rw.rw_file = file;
1336         io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1337         io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1338         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1339
1340         if (iot == CIT_WRITE) {
1341                 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1342                 io->u.ci_rw.rw_sync   = !!(file->f_flags & O_SYNC ||
1343                                            file->f_flags & O_DIRECT ||
1344                                            IS_SYNC(inode));
1345         }
1346         io->ci_obj = ll_i2info(inode)->lli_clob;
1347         io->ci_lockreq = CILR_MAYBE;
1348         if (ll_file_nolock(file)) {
1349                 io->ci_lockreq = CILR_NEVER;
1350                 io->ci_no_srvlock = 1;
1351         } else if (file->f_flags & O_APPEND) {
1352                 io->ci_lockreq = CILR_MANDATORY;
1353         }
1354         io->ci_noatime = file_is_noatime(file);
1355         if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1356                 io->ci_pio = !io->u.ci_rw.rw_append;
1357         else
1358                 io->ci_pio = 0;
1359
1360         /* FLR: only use non-delay I/O for read as there is only one
1361          * avaliable mirror for write. */
1362         io->ci_ndelay = !(iot == CIT_WRITE);
1363
1364         ll_io_set_mirror(io, file);
1365 }
1366
1367 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1368 {
1369         struct cl_io_pt *pt = ptask->pt_cbdata;
1370         struct file *file = pt->cip_file;
1371         struct lu_env *env;
1372         struct cl_io *io;
1373         loff_t pos = pt->cip_pos;
1374         int rc;
1375         __u16 refcheck;
1376         ENTRY;
1377
1378         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1379                 file_dentry(file)->d_name.name,
1380                 pt->cip_iot == CIT_READ ? "read" : "write",
1381                 pos, pos + pt->cip_count);
1382
1383         env = cl_env_get(&refcheck);
1384         if (IS_ERR(env))
1385                 RETURN(PTR_ERR(env));
1386
1387         io = vvp_env_thread_io(env);
1388         ll_io_init(io, file, pt->cip_iot);
1389         io->u.ci_rw.rw_iter = pt->cip_iter;
1390         io->u.ci_rw.rw_iocb = pt->cip_iocb;
1391         io->ci_pio = 0; /* It's already in parallel task */
1392
1393         rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1394                            pt->cip_count - pt->cip_result);
1395         if (!rc) {
1396                 struct vvp_io *vio = vvp_env_io(env);
1397
1398                 vio->vui_io_subtype = IO_NORMAL;
1399                 vio->vui_fd = LUSTRE_FPRIVATE(file);
1400
1401                 ll_cl_add(file, env, io, LCC_RW);
1402                 rc = cl_io_loop(env, io);
1403                 ll_cl_remove(file, env);
1404         } else {
1405                 /* cl_io_rw_init() handled IO */
1406                 rc = io->ci_result;
1407         }
1408
1409         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1410                 if (io->ci_nob > 0)
1411                         io->ci_nob /= 2;
1412                 rc = -EIO;
1413         }
1414
1415         if (io->ci_nob > 0) {
1416                 pt->cip_result += io->ci_nob;
1417                 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1418                 pos += io->ci_nob;
1419                 pt->cip_iocb.ki_pos = pos;
1420 #ifdef HAVE_KIOCB_KI_LEFT
1421                 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1422 #elif defined(HAVE_KI_NBYTES)
1423                 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1424 #endif
1425         }
1426
1427         cl_io_fini(env, io);
1428         cl_env_put(env, &refcheck);
1429
1430         pt->cip_need_restart = io->ci_need_restart;
1431
1432         CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1433                 file_dentry(file)->d_name.name,
1434                 pt->cip_iot == CIT_READ ? "read" : "write",
1435                 pt->cip_result, rc);
1436
1437         RETURN(pt->cip_result > 0 ? 0 : rc);
1438 }
1439
1440 static ssize_t
1441 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1442                    struct file *file, enum cl_io_type iot,
1443                    loff_t *ppos, size_t count)
1444 {
1445         struct range_lock       range;
1446         struct vvp_io           *vio = vvp_env_io(env);
1447         struct inode            *inode = file_inode(file);
1448         struct ll_inode_info    *lli = ll_i2info(inode);
1449         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1450         struct cl_io            *io;
1451         loff_t                  pos = *ppos;
1452         ssize_t                 result = 0;
1453         int                     rc = 0;
1454         unsigned                retried = 0;
1455         bool                    restarted = false;
1456
1457         ENTRY;
1458
1459         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1460                 file_dentry(file)->d_name.name,
1461                 iot == CIT_READ ? "read" : "write", pos, pos + count);
1462
1463 restart:
1464         io = vvp_env_thread_io(env);
1465         ll_io_init(io, file, iot);
1466         if (args->via_io_subtype == IO_NORMAL) {
1467                 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1468                 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1469         }
1470         if (args->via_io_subtype != IO_NORMAL || restarted)
1471                 io->ci_pio = 0;
1472         io->ci_ndelay_tried = retried;
1473
1474         if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1475                 bool range_locked = false;
1476
1477                 if (file->f_flags & O_APPEND)
1478                         range_lock_init(&range, 0, LUSTRE_EOF);
1479                 else
1480                         range_lock_init(&range, pos, pos + count - 1);
1481
1482                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1483                 vio->vui_io_subtype = args->via_io_subtype;
1484
1485                 switch (vio->vui_io_subtype) {
1486                 case IO_NORMAL:
1487                         /* Direct IO reads must also take range lock,
1488                          * or multiple reads will try to work on the same pages
1489                          * See LU-6227 for details. */
1490                         if (((iot == CIT_WRITE) ||
1491                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1492                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1493                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1494                                        RL_PARA(&range));
1495                                 rc = range_lock(&lli->lli_write_tree, &range);
1496                                 if (rc < 0)
1497                                         GOTO(out, rc);
1498
1499                                 range_locked = true;
1500                         }
1501                         break;
1502                 case IO_SPLICE:
1503                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1504                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1505                         break;
1506                 default:
1507                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1508                         LBUG();
1509                 }
1510
1511                 ll_cl_add(file, env, io, LCC_RW);
1512                 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1513                     !lli->lli_inode_locked) {
1514                         inode_lock(inode);
1515                         lli->lli_inode_locked = 1;
1516                 }
1517                 rc = cl_io_loop(env, io);
1518                 if (lli->lli_inode_locked) {
1519                         lli->lli_inode_locked = 0;
1520                         inode_unlock(inode);
1521                 }
1522                 ll_cl_remove(file, env);
1523
1524                 if (range_locked) {
1525                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1526                                RL_PARA(&range));
1527                         range_unlock(&lli->lli_write_tree, &range);
1528                 }
1529         } else {
1530                 /* cl_io_rw_init() handled IO */
1531                 rc = io->ci_result;
1532         }
1533
1534         if (io->ci_nob > 0) {
1535                 result += io->ci_nob;
1536                 count  -= io->ci_nob;
1537
1538                 if (args->via_io_subtype == IO_NORMAL) {
1539                         iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1540
1541                         /* CLIO is too complicated. See LU-11069. */
1542                         if (cl_io_is_append(io))
1543                                 pos = io->u.ci_rw.rw_iocb.ki_pos;
1544                         else
1545                                 pos += io->ci_nob;
1546
1547                         args->u.normal.via_iocb->ki_pos = pos;
1548 #ifdef HAVE_KIOCB_KI_LEFT
1549                         args->u.normal.via_iocb->ki_left = count;
1550 #elif defined(HAVE_KI_NBYTES)
1551                         args->u.normal.via_iocb->ki_nbytes = count;
1552 #endif
1553                 } else {
1554                         /* for splice */
1555                         pos = io->u.ci_rw.rw_range.cir_pos;
1556                 }
1557         }
1558 out:
1559         cl_io_fini(env, io);
1560
1561         CDEBUG(D_VFSTRACE,
1562                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1563                file->f_path.dentry->d_name.name,
1564                iot, rc, result, io->ci_need_restart);
1565
1566         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1567                 CDEBUG(D_VFSTRACE,
1568                         "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1569                         file_dentry(file)->d_name.name,
1570                         iot == CIT_READ ? "read" : "write",
1571                         pos, pos + count, result, rc);
1572                 /* preserve the tried count for FLR */
1573                 retried = io->ci_ndelay_tried;
1574                 restarted = true;
1575                 goto restart;
1576         }
1577
1578         if (iot == CIT_READ) {
1579                 if (result > 0)
1580                         ll_stats_ops_tally(ll_i2sbi(inode),
1581                                            LPROC_LL_READ_BYTES, result);
1582         } else if (iot == CIT_WRITE) {
1583                 if (result > 0) {
1584                         ll_stats_ops_tally(ll_i2sbi(inode),
1585                                            LPROC_LL_WRITE_BYTES, result);
1586                         fd->fd_write_failed = false;
1587                 } else if (result == 0 && rc == 0) {
1588                         rc = io->ci_result;
1589                         if (rc < 0)
1590                                 fd->fd_write_failed = true;
1591                         else
1592                                 fd->fd_write_failed = false;
1593                 } else if (rc != -ERESTARTSYS) {
1594                         fd->fd_write_failed = true;
1595                 }
1596         }
1597
1598         CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1599                 file_dentry(file)->d_name.name,
1600                 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1601
1602         *ppos = pos;
1603
1604         RETURN(result > 0 ? result : rc);
1605 }
1606
1607 /**
1608  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1609  * especially for small I/O.
1610  *
1611  * To serve a read request, CLIO has to create and initialize a cl_io and
1612  * then request DLM lock. This has turned out to have siginificant overhead
1613  * and affects the performance of small I/O dramatically.
1614  *
1615  * It's not necessary to create a cl_io for each I/O. Under the help of read
1616  * ahead, most of the pages being read are already in memory cache and we can
1617  * read those pages directly because if the pages exist, the corresponding DLM
1618  * lock must exist so that page content must be valid.
1619  *
1620  * In fast read implementation, the llite speculatively finds and reads pages
1621  * in memory cache. There are three scenarios for fast read:
1622  *   - If the page exists and is uptodate, kernel VM will provide the data and
1623  *     CLIO won't be intervened;
1624  *   - If the page was brought into memory by read ahead, it will be exported
1625  *     and read ahead parameters will be updated;
1626  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1627  *     it will go back and invoke normal read, i.e., a cl_io will be created
1628  *     and DLM lock will be requested.
1629  *
1630  * POSIX compliance: posix standard states that read is intended to be atomic.
1631  * Lustre read implementation is in line with Linux kernel read implementation
1632  * and neither of them complies with POSIX standard in this matter. Fast read
1633  * doesn't make the situation worse on single node but it may interleave write
1634  * results from multiple nodes due to short read handling in ll_file_aio_read().
1635  *
1636  * \param env - lu_env
1637  * \param iocb - kiocb from kernel
1638  * \param iter - user space buffers where the data will be copied
1639  *
1640  * \retval - number of bytes have been read, or error code if error occurred.
1641  */
1642 static ssize_t
1643 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1644 {
1645         ssize_t result;
1646
1647         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1648                 return 0;
1649
1650         /* NB: we can't do direct IO for fast read because it will need a lock
1651          * to make IO engine happy. */
1652         if (iocb->ki_filp->f_flags & O_DIRECT)
1653                 return 0;
1654
1655         result = generic_file_read_iter(iocb, iter);
1656
1657         /* If the first page is not in cache, generic_file_aio_read() will be
1658          * returned with -ENODATA.
1659          * See corresponding code in ll_readpage(). */
1660         if (result == -ENODATA)
1661                 result = 0;
1662
1663         if (result > 0)
1664                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1665                                 LPROC_LL_READ_BYTES, result);
1666
1667         return result;
1668 }
1669
1670 /*
1671  * Read from a file (through the page cache).
1672  */
1673 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1674 {
1675         struct lu_env *env;
1676         struct vvp_io_args *args;
1677         ssize_t result;
1678         ssize_t rc2;
1679         __u16 refcheck;
1680
1681         result = ll_do_fast_read(iocb, to);
1682         if (result < 0 || iov_iter_count(to) == 0)
1683                 GOTO(out, result);
1684
1685         env = cl_env_get(&refcheck);
1686         if (IS_ERR(env))
1687                 return PTR_ERR(env);
1688
1689         args = ll_env_args(env, IO_NORMAL);
1690         args->u.normal.via_iter = to;
1691         args->u.normal.via_iocb = iocb;
1692
1693         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1694                                  &iocb->ki_pos, iov_iter_count(to));
1695         if (rc2 > 0)
1696                 result += rc2;
1697         else if (result == 0)
1698                 result = rc2;
1699
1700         cl_env_put(env, &refcheck);
1701 out:
1702         return result;
1703 }
1704
1705 /**
1706  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1707  * If a page is already in the page cache and dirty (and some other things -
1708  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1709  * write to it without doing a full I/O, because Lustre already knows about it
1710  * and will write it out.  This saves a lot of processing time.
1711  *
1712  * All writes here are within one page, so exclusion is handled by the page
1713  * lock on the vm page.  We do not do tiny writes for writes which touch
1714  * multiple pages because it's very unlikely multiple sequential pages are
1715  * are already dirty.
1716  *
1717  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1718  * and are unlikely to be to already dirty pages.
1719  *
1720  * Attribute updates are important here, we do them in ll_tiny_write_end.
1721  */
1722 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1723 {
1724         ssize_t count = iov_iter_count(iter);
1725         struct file *file = iocb->ki_filp;
1726         struct inode *inode = file_inode(file);
1727         ssize_t result = 0;
1728
1729         ENTRY;
1730
1731         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1732          * of function for why.
1733          */
1734         if (count >= PAGE_SIZE ||
1735             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1736                 RETURN(0);
1737
1738         result = __generic_file_write_iter(iocb, iter);
1739
1740         /* If the page is not already dirty, ll_tiny_write_begin returns
1741          * -ENODATA.  We continue on to normal write.
1742          */
1743         if (result == -ENODATA)
1744                 result = 0;
1745
1746         if (result > 0) {
1747                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1748                                    result);
1749                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1750         }
1751
1752         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1753
1754         RETURN(result);
1755 }
1756
1757 /*
1758  * Write to a file (through the page cache).
1759  */
1760 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1761 {
1762         struct vvp_io_args *args;
1763         struct lu_env *env;
1764         ssize_t rc_tiny = 0, rc_normal;
1765         __u16 refcheck;
1766
1767         ENTRY;
1768
1769         /* NB: we can't do direct IO for tiny writes because they use the page
1770          * cache, we can't do sync writes because tiny writes can't flush
1771          * pages, and we can't do append writes because we can't guarantee the
1772          * required DLM locks are held to protect file size.
1773          */
1774         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1775             !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1776                 rc_tiny = ll_do_tiny_write(iocb, from);
1777
1778         /* In case of error, go on and try normal write - Only stop if tiny
1779          * write completed I/O.
1780          */
1781         if (iov_iter_count(from) == 0)
1782                 GOTO(out, rc_normal = rc_tiny);
1783
1784         env = cl_env_get(&refcheck);
1785         if (IS_ERR(env))
1786                 return PTR_ERR(env);
1787
1788         args = ll_env_args(env, IO_NORMAL);
1789         args->u.normal.via_iter = from;
1790         args->u.normal.via_iocb = iocb;
1791
1792         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1793                                     &iocb->ki_pos, iov_iter_count(from));
1794
1795         /* On success, combine bytes written. */
1796         if (rc_tiny >= 0 && rc_normal > 0)
1797                 rc_normal += rc_tiny;
1798         /* On error, only return error from normal write if tiny write did not
1799          * write any bytes.  Otherwise return bytes written by tiny write.
1800          */
1801         else if (rc_tiny > 0)
1802                 rc_normal = rc_tiny;
1803
1804         cl_env_put(env, &refcheck);
1805 out:
1806         RETURN(rc_normal);
1807 }
1808
1809 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1810 /*
1811  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1812  */
1813 static int ll_file_get_iov_count(const struct iovec *iov,
1814                                  unsigned long *nr_segs, size_t *count)
1815 {
1816         size_t cnt = 0;
1817         unsigned long seg;
1818
1819         for (seg = 0; seg < *nr_segs; seg++) {
1820                 const struct iovec *iv = &iov[seg];
1821
1822                 /*
1823                  * If any segment has a negative length, or the cumulative
1824                  * length ever wraps negative then return -EINVAL.
1825                  */
1826                 cnt += iv->iov_len;
1827                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1828                         return -EINVAL;
1829                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1830                         continue;
1831                 if (seg == 0)
1832                         return -EFAULT;
1833                 *nr_segs = seg;
1834                 cnt -= iv->iov_len;     /* This segment is no good */
1835                 break;
1836         }
1837         *count = cnt;
1838         return 0;
1839 }
1840
1841 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1842                                 unsigned long nr_segs, loff_t pos)
1843 {
1844         struct iov_iter to;
1845         size_t iov_count;
1846         ssize_t result;
1847         ENTRY;
1848
1849         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1850         if (result)
1851                 RETURN(result);
1852
1853 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1854         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1855 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1856         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1857 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1858
1859         result = ll_file_read_iter(iocb, &to);
1860
1861         RETURN(result);
1862 }
1863
1864 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1865                             loff_t *ppos)
1866 {
1867         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1868         struct kiocb   kiocb;
1869         ssize_t        result;
1870         ENTRY;
1871
1872         init_sync_kiocb(&kiocb, file);
1873         kiocb.ki_pos = *ppos;
1874 #ifdef HAVE_KIOCB_KI_LEFT
1875         kiocb.ki_left = count;
1876 #elif defined(HAVE_KI_NBYTES)
1877         kiocb.i_nbytes = count;
1878 #endif
1879
1880         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1881         *ppos = kiocb.ki_pos;
1882
1883         RETURN(result);
1884 }
1885
1886 /*
1887  * Write to a file (through the page cache).
1888  * AIO stuff
1889  */
1890 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1891                                  unsigned long nr_segs, loff_t pos)
1892 {
1893         struct iov_iter from;
1894         size_t iov_count;
1895         ssize_t result;
1896         ENTRY;
1897
1898         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1899         if (result)
1900                 RETURN(result);
1901
1902 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1903         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1904 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1905         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1906 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1907
1908         result = ll_file_write_iter(iocb, &from);
1909
1910         RETURN(result);
1911 }
1912
1913 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1914                              size_t count, loff_t *ppos)
1915 {
1916         struct iovec   iov = { .iov_base = (void __user *)buf,
1917                                .iov_len = count };
1918         struct kiocb   kiocb;
1919         ssize_t        result;
1920
1921         ENTRY;
1922
1923         init_sync_kiocb(&kiocb, file);
1924         kiocb.ki_pos = *ppos;
1925 #ifdef HAVE_KIOCB_KI_LEFT
1926         kiocb.ki_left = count;
1927 #elif defined(HAVE_KI_NBYTES)
1928         kiocb.ki_nbytes = count;
1929 #endif
1930
1931         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1932         *ppos = kiocb.ki_pos;
1933
1934         RETURN(result);
1935 }
1936 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1937
1938 /*
1939  * Send file content (through pagecache) somewhere with helper
1940  */
1941 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1942                                    struct pipe_inode_info *pipe, size_t count,
1943                                    unsigned int flags)
1944 {
1945         struct lu_env      *env;
1946         struct vvp_io_args *args;
1947         ssize_t             result;
1948         __u16               refcheck;
1949         ENTRY;
1950
1951         env = cl_env_get(&refcheck);
1952         if (IS_ERR(env))
1953                 RETURN(PTR_ERR(env));
1954
1955         args = ll_env_args(env, IO_SPLICE);
1956         args->u.splice.via_pipe = pipe;
1957         args->u.splice.via_flags = flags;
1958
1959         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1960         cl_env_put(env, &refcheck);
1961         RETURN(result);
1962 }
1963
1964 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1965                              __u64 flags, struct lov_user_md *lum, int lum_size)
1966 {
1967         struct lookup_intent oit = {
1968                 .it_op = IT_OPEN,
1969                 .it_flags = flags | MDS_OPEN_BY_FID,
1970         };
1971         int rc;
1972         ENTRY;
1973
1974         ll_inode_size_lock(inode);
1975         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1976         if (rc < 0)
1977                 GOTO(out_unlock, rc);
1978
1979         ll_release_openhandle(dentry, &oit);
1980
1981 out_unlock:
1982         ll_inode_size_unlock(inode);
1983         ll_intent_release(&oit);
1984
1985         RETURN(rc);
1986 }
1987
1988 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1989                              struct lov_mds_md **lmmp, int *lmm_size,
1990                              struct ptlrpc_request **request)
1991 {
1992         struct ll_sb_info *sbi = ll_i2sbi(inode);
1993         struct mdt_body  *body;
1994         struct lov_mds_md *lmm = NULL;
1995         struct ptlrpc_request *req = NULL;
1996         struct md_op_data *op_data;
1997         int rc, lmmsize;
1998
1999         rc = ll_get_default_mdsize(sbi, &lmmsize);
2000         if (rc)
2001                 RETURN(rc);
2002
2003         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2004                                      strlen(filename), lmmsize,
2005                                      LUSTRE_OPC_ANY, NULL);
2006         if (IS_ERR(op_data))
2007                 RETURN(PTR_ERR(op_data));
2008
2009         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2010         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2011         ll_finish_md_op_data(op_data);
2012         if (rc < 0) {
2013                 CDEBUG(D_INFO, "md_getattr_name failed "
2014                        "on %s: rc %d\n", filename, rc);
2015                 GOTO(out, rc);
2016         }
2017
2018         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2019         LASSERT(body != NULL); /* checked by mdc_getattr_name */
2020
2021         lmmsize = body->mbo_eadatasize;
2022
2023         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2024                         lmmsize == 0) {
2025                 GOTO(out, rc = -ENODATA);
2026         }
2027
2028         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2029         LASSERT(lmm != NULL);
2030
2031         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2032             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2033             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2034                 GOTO(out, rc = -EPROTO);
2035
2036         /*
2037          * This is coming from the MDS, so is probably in
2038          * little endian.  We convert it to host endian before
2039          * passing it to userspace.
2040          */
2041         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2042                 int stripe_count;
2043
2044                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2045                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2046                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2047                         if (le32_to_cpu(lmm->lmm_pattern) &
2048                             LOV_PATTERN_F_RELEASED)
2049                                 stripe_count = 0;
2050                 }
2051
2052                 /* if function called for directory - we should
2053                  * avoid swab not existent lsm objects */
2054                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2055                         lustre_swab_lov_user_md_v1(
2056                                         (struct lov_user_md_v1 *)lmm);
2057                         if (S_ISREG(body->mbo_mode))
2058                                 lustre_swab_lov_user_md_objects(
2059                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2060                                     stripe_count);
2061                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2062                         lustre_swab_lov_user_md_v3(
2063                                         (struct lov_user_md_v3 *)lmm);
2064                         if (S_ISREG(body->mbo_mode))
2065                                 lustre_swab_lov_user_md_objects(
2066                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2067                                     stripe_count);
2068                 } else if (lmm->lmm_magic ==
2069                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2070                         lustre_swab_lov_comp_md_v1(
2071                                         (struct lov_comp_md_v1 *)lmm);
2072                 }
2073         }
2074
2075 out:
2076         *lmmp = lmm;
2077         *lmm_size = lmmsize;
2078         *request = req;
2079         return rc;
2080 }
2081
2082 static int ll_lov_setea(struct inode *inode, struct file *file,
2083                         void __user *arg)
2084 {
2085         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2086         struct lov_user_md      *lump;
2087         int                      lum_size = sizeof(struct lov_user_md) +
2088                                             sizeof(struct lov_user_ost_data);
2089         int                      rc;
2090         ENTRY;
2091
2092         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2093                 RETURN(-EPERM);
2094
2095         OBD_ALLOC_LARGE(lump, lum_size);
2096         if (lump == NULL)
2097                 RETURN(-ENOMEM);
2098
2099         if (copy_from_user(lump, arg, lum_size))
2100                 GOTO(out_lump, rc = -EFAULT);
2101
2102         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2103                                       lum_size);
2104         cl_lov_delay_create_clear(&file->f_flags);
2105
2106 out_lump:
2107         OBD_FREE_LARGE(lump, lum_size);
2108         RETURN(rc);
2109 }
2110
2111 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2112 {
2113         struct lu_env   *env;
2114         __u16           refcheck;
2115         int             rc;
2116         ENTRY;
2117
2118         env = cl_env_get(&refcheck);
2119         if (IS_ERR(env))
2120                 RETURN(PTR_ERR(env));
2121
2122         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2123         cl_env_put(env, &refcheck);
2124         RETURN(rc);
2125 }
2126
2127 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2128                             void __user *arg)
2129 {
2130         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2131         struct lov_user_md        *klum;
2132         int                        lum_size, rc;
2133         __u64                      flags = FMODE_WRITE;
2134         ENTRY;
2135
2136         rc = ll_copy_user_md(lum, &klum);
2137         if (rc < 0)
2138                 RETURN(rc);
2139
2140         lum_size = rc;
2141         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2142                                       lum_size);
2143         if (!rc) {
2144                 __u32 gen;
2145
2146                 rc = put_user(0, &lum->lmm_stripe_count);
2147                 if (rc)
2148                         GOTO(out, rc);
2149
2150                 rc = ll_layout_refresh(inode, &gen);
2151                 if (rc)
2152                         GOTO(out, rc);
2153
2154                 rc = ll_file_getstripe(inode, arg, lum_size);
2155         }
2156         cl_lov_delay_create_clear(&file->f_flags);
2157
2158 out:
2159         OBD_FREE(klum, lum_size);
2160         RETURN(rc);
2161 }
2162
2163 static int
2164 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2165 {
2166         struct ll_inode_info *lli = ll_i2info(inode);
2167         struct cl_object *obj = lli->lli_clob;
2168         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2169         struct ll_grouplock grouplock;
2170         int rc;
2171         ENTRY;
2172
2173         if (arg == 0) {
2174                 CWARN("group id for group lock must not be 0\n");
2175                 RETURN(-EINVAL);
2176         }
2177
2178         if (ll_file_nolock(file))
2179                 RETURN(-EOPNOTSUPP);
2180
2181         spin_lock(&lli->lli_lock);
2182         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2183                 CWARN("group lock already existed with gid %lu\n",
2184                       fd->fd_grouplock.lg_gid);
2185                 spin_unlock(&lli->lli_lock);
2186                 RETURN(-EINVAL);
2187         }
2188         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2189         spin_unlock(&lli->lli_lock);
2190
2191         /**
2192          * XXX: group lock needs to protect all OST objects while PFL
2193          * can add new OST objects during the IO, so we'd instantiate
2194          * all OST objects before getting its group lock.
2195          */
2196         if (obj) {
2197                 struct lu_env *env;
2198                 __u16 refcheck;
2199                 struct cl_layout cl = {
2200                         .cl_is_composite = false,
2201                 };
2202                 struct lu_extent ext = {
2203                         .e_start = 0,
2204                         .e_end = OBD_OBJECT_EOF,
2205                 };
2206
2207                 env = cl_env_get(&refcheck);
2208                 if (IS_ERR(env))
2209                         RETURN(PTR_ERR(env));
2210
2211                 rc = cl_object_layout_get(env, obj, &cl);
2212                 if (!rc && cl.cl_is_composite)
2213                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2214                                                     &ext);
2215
2216                 cl_env_put(env, &refcheck);
2217                 if (rc)
2218                         RETURN(rc);
2219         }
2220
2221         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2222                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2223         if (rc)
2224                 RETURN(rc);
2225
2226         spin_lock(&lli->lli_lock);
2227         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2228                 spin_unlock(&lli->lli_lock);
2229                 CERROR("another thread just won the race\n");
2230                 cl_put_grouplock(&grouplock);
2231                 RETURN(-EINVAL);
2232         }
2233
2234         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2235         fd->fd_grouplock = grouplock;
2236         spin_unlock(&lli->lli_lock);
2237
2238         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2239         RETURN(0);
2240 }
2241
2242 static int ll_put_grouplock(struct inode *inode, struct file *file,
2243                             unsigned long arg)
2244 {
2245         struct ll_inode_info   *lli = ll_i2info(inode);
2246         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2247         struct ll_grouplock     grouplock;
2248         ENTRY;
2249
2250         spin_lock(&lli->lli_lock);
2251         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2252                 spin_unlock(&lli->lli_lock);
2253                 CWARN("no group lock held\n");
2254                 RETURN(-EINVAL);
2255         }
2256
2257         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2258
2259         if (fd->fd_grouplock.lg_gid != arg) {
2260                 CWARN("group lock %lu doesn't match current id %lu\n",
2261                       arg, fd->fd_grouplock.lg_gid);
2262                 spin_unlock(&lli->lli_lock);
2263                 RETURN(-EINVAL);
2264         }
2265
2266         grouplock = fd->fd_grouplock;
2267         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2268         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2269         spin_unlock(&lli->lli_lock);
2270
2271         cl_put_grouplock(&grouplock);
2272         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2273         RETURN(0);
2274 }
2275
2276 /**
2277  * Close inode open handle
2278  *
2279  * \param dentry [in]     dentry which contains the inode
2280  * \param it     [in,out] intent which contains open info and result
2281  *
2282  * \retval 0     success
2283  * \retval <0    failure
2284  */
2285 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2286 {
2287         struct inode *inode = dentry->d_inode;
2288         struct obd_client_handle *och;
2289         int rc;
2290         ENTRY;
2291
2292         LASSERT(inode);
2293
2294         /* Root ? Do nothing. */
2295         if (dentry->d_inode->i_sb->s_root == dentry)
2296                 RETURN(0);
2297
2298         /* No open handle to close? Move away */
2299         if (!it_disposition(it, DISP_OPEN_OPEN))
2300                 RETURN(0);
2301
2302         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2303
2304         OBD_ALLOC(och, sizeof(*och));
2305         if (!och)
2306                 GOTO(out, rc = -ENOMEM);
2307
2308         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2309
2310         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2311 out:
2312         /* this one is in place of ll_file_open */
2313         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2314                 ptlrpc_req_finished(it->it_request);
2315                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2316         }
2317         RETURN(rc);
2318 }
2319
2320 /**
2321  * Get size for inode for which FIEMAP mapping is requested.
2322  * Make the FIEMAP get_info call and returns the result.
2323  * \param fiemap        kernel buffer to hold extens
2324  * \param num_bytes     kernel buffer size
2325  */
2326 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2327                         size_t num_bytes)
2328 {
2329         struct lu_env                   *env;
2330         __u16                           refcheck;
2331         int                             rc = 0;
2332         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2333         ENTRY;
2334
2335         /* Checks for fiemap flags */
2336         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2337                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2338                 return -EBADR;
2339         }
2340
2341         /* Check for FIEMAP_FLAG_SYNC */
2342         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2343                 rc = filemap_fdatawrite(inode->i_mapping);
2344                 if (rc)
2345                         return rc;
2346         }
2347
2348         env = cl_env_get(&refcheck);
2349         if (IS_ERR(env))
2350                 RETURN(PTR_ERR(env));
2351
2352         if (i_size_read(inode) == 0) {
2353                 rc = ll_glimpse_size(inode);
2354                 if (rc)
2355                         GOTO(out, rc);
2356         }
2357
2358         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2359         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2360         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2361
2362         /* If filesize is 0, then there would be no objects for mapping */
2363         if (fmkey.lfik_oa.o_size == 0) {
2364                 fiemap->fm_mapped_extents = 0;
2365                 GOTO(out, rc = 0);
2366         }
2367
2368         fmkey.lfik_fiemap = *fiemap;
2369
2370         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2371                               &fmkey, fiemap, &num_bytes);
2372 out:
2373         cl_env_put(env, &refcheck);
2374         RETURN(rc);
2375 }
2376
2377 int ll_fid2path(struct inode *inode, void __user *arg)
2378 {
2379         struct obd_export       *exp = ll_i2mdexp(inode);
2380         const struct getinfo_fid2path __user *gfin = arg;
2381         __u32                    pathlen;
2382         struct getinfo_fid2path *gfout;
2383         size_t                   outsize;
2384         int                      rc;
2385
2386         ENTRY;
2387
2388         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2389             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2390                 RETURN(-EPERM);
2391
2392         /* Only need to get the buflen */
2393         if (get_user(pathlen, &gfin->gf_pathlen))
2394                 RETURN(-EFAULT);
2395
2396         if (pathlen > PATH_MAX)
2397                 RETURN(-EINVAL);
2398
2399         outsize = sizeof(*gfout) + pathlen;
2400         OBD_ALLOC(gfout, outsize);
2401         if (gfout == NULL)
2402                 RETURN(-ENOMEM);
2403
2404         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2405                 GOTO(gf_free, rc = -EFAULT);
2406         /* append root FID after gfout to let MDT know the root FID so that it
2407          * can lookup the correct path, this is mainly for fileset.
2408          * old server without fileset mount support will ignore this. */
2409         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2410
2411         /* Call mdc_iocontrol */
2412         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2413         if (rc != 0)
2414                 GOTO(gf_free, rc);
2415
2416         if (copy_to_user(arg, gfout, outsize))
2417                 rc = -EFAULT;
2418
2419 gf_free:
2420         OBD_FREE(gfout, outsize);
2421         RETURN(rc);
2422 }
2423
2424 static int
2425 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2426 {
2427         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2428         struct lu_env *env;
2429         struct cl_io *io;
2430         __u16  refcheck;
2431         int result;
2432
2433         ENTRY;
2434
2435         ioc->idv_version = 0;
2436         ioc->idv_layout_version = UINT_MAX;
2437
2438         /* If no file object initialized, we consider its version is 0. */
2439         if (obj == NULL)
2440                 RETURN(0);
2441
2442         env = cl_env_get(&refcheck);
2443         if (IS_ERR(env))
2444                 RETURN(PTR_ERR(env));
2445
2446         io = vvp_env_thread_io(env);
2447         io->ci_obj = obj;
2448         io->u.ci_data_version.dv_data_version = 0;
2449         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2450         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2451
2452 restart:
2453         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2454                 result = cl_io_loop(env, io);
2455         else
2456                 result = io->ci_result;
2457
2458         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2459         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2460
2461         cl_io_fini(env, io);
2462
2463         if (unlikely(io->ci_need_restart))
2464                 goto restart;
2465
2466         cl_env_put(env, &refcheck);
2467
2468         RETURN(result);
2469 }
2470
2471 /*
2472  * Read the data_version for inode.
2473  *
2474  * This value is computed using stripe object version on OST.
2475  * Version is computed using server side locking.
2476  *
2477  * @param flags if do sync on the OST side;
2478  *              0: no sync
2479  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2480  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2481  */
2482 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2483 {
2484         struct ioc_data_version ioc = { .idv_flags = flags };
2485         int rc;
2486
2487         rc = ll_ioc_data_version(inode, &ioc);
2488         if (!rc)
2489                 *data_version = ioc.idv_version;
2490
2491         return rc;
2492 }
2493
2494 /*
2495  * Trigger a HSM release request for the provided inode.
2496  */
2497 int ll_hsm_release(struct inode *inode)
2498 {
2499         struct lu_env *env;
2500         struct obd_client_handle *och = NULL;
2501         __u64 data_version = 0;
2502         int rc;
2503         __u16 refcheck;
2504         ENTRY;
2505
2506         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2507                ll_get_fsname(inode->i_sb, NULL, 0),
2508                PFID(&ll_i2info(inode)->lli_fid));
2509
2510         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2511         if (IS_ERR(och))
2512                 GOTO(out, rc = PTR_ERR(och));
2513
2514         /* Grab latest data_version and [am]time values */
2515         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2516         if (rc != 0)
2517                 GOTO(out, rc);
2518
2519         env = cl_env_get(&refcheck);
2520         if (IS_ERR(env))
2521                 GOTO(out, rc = PTR_ERR(env));
2522
2523         rc = ll_merge_attr(env, inode);
2524         cl_env_put(env, &refcheck);
2525
2526         /* If error happen, we have the wrong size for a file.
2527          * Don't release it.
2528          */
2529         if (rc != 0)
2530                 GOTO(out, rc);
2531
2532         /* Release the file.
2533          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2534          * we still need it to pack l_remote_handle to MDT. */
2535         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2536                                        &data_version);
2537         och = NULL;
2538
2539         EXIT;
2540 out:
2541         if (och != NULL && !IS_ERR(och)) /* close the file */
2542                 ll_lease_close(och, inode, NULL);
2543
2544         return rc;
2545 }
2546
2547 struct ll_swap_stack {
2548         __u64                    dv1;
2549         __u64                    dv2;
2550         struct inode            *inode1;
2551         struct inode            *inode2;
2552         bool                     check_dv1;
2553         bool                     check_dv2;
2554 };
2555
2556 static int ll_swap_layouts(struct file *file1, struct file *file2,
2557                            struct lustre_swap_layouts *lsl)
2558 {
2559         struct mdc_swap_layouts  msl;
2560         struct md_op_data       *op_data;
2561         __u32                    gid;
2562         __u64                    dv;
2563         struct ll_swap_stack    *llss = NULL;
2564         int                      rc;
2565
2566         OBD_ALLOC_PTR(llss);
2567         if (llss == NULL)
2568                 RETURN(-ENOMEM);
2569
2570         llss->inode1 = file_inode(file1);
2571         llss->inode2 = file_inode(file2);
2572
2573         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2574         if (rc < 0)
2575                 GOTO(free, rc);
2576
2577         /* we use 2 bool because it is easier to swap than 2 bits */
2578         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2579                 llss->check_dv1 = true;
2580
2581         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2582                 llss->check_dv2 = true;
2583
2584         /* we cannot use lsl->sl_dvX directly because we may swap them */
2585         llss->dv1 = lsl->sl_dv1;
2586         llss->dv2 = lsl->sl_dv2;
2587
2588         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2589         if (rc == 0) /* same file, done! */
2590                 GOTO(free, rc);
2591
2592         if (rc < 0) { /* sequentialize it */
2593                 swap(llss->inode1, llss->inode2);
2594                 swap(file1, file2);
2595                 swap(llss->dv1, llss->dv2);
2596                 swap(llss->check_dv1, llss->check_dv2);
2597         }
2598
2599         gid = lsl->sl_gid;
2600         if (gid != 0) { /* application asks to flush dirty cache */
2601                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2602                 if (rc < 0)
2603                         GOTO(free, rc);
2604
2605                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2606                 if (rc < 0) {
2607                         ll_put_grouplock(llss->inode1, file1, gid);
2608                         GOTO(free, rc);
2609                 }
2610         }
2611
2612         /* ultimate check, before swaping the layouts we check if
2613          * dataversion has changed (if requested) */
2614         if (llss->check_dv1) {
2615                 rc = ll_data_version(llss->inode1, &dv, 0);
2616                 if (rc)
2617                         GOTO(putgl, rc);
2618                 if (dv != llss->dv1)
2619                         GOTO(putgl, rc = -EAGAIN);
2620         }
2621
2622         if (llss->check_dv2) {
2623                 rc = ll_data_version(llss->inode2, &dv, 0);
2624                 if (rc)
2625                         GOTO(putgl, rc);
2626                 if (dv != llss->dv2)
2627                         GOTO(putgl, rc = -EAGAIN);
2628         }
2629
2630         /* struct md_op_data is used to send the swap args to the mdt
2631          * only flags is missing, so we use struct mdc_swap_layouts
2632          * through the md_op_data->op_data */
2633         /* flags from user space have to be converted before they are send to
2634          * server, no flag is sent today, they are only used on the client */
2635         msl.msl_flags = 0;
2636         rc = -ENOMEM;
2637         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2638                                      0, LUSTRE_OPC_ANY, &msl);
2639         if (IS_ERR(op_data))
2640                 GOTO(free, rc = PTR_ERR(op_data));
2641
2642         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2643                            sizeof(*op_data), op_data, NULL);
2644         ll_finish_md_op_data(op_data);
2645
2646         if (rc < 0)
2647                 GOTO(putgl, rc);
2648
2649 putgl:
2650         if (gid != 0) {
2651                 ll_put_grouplock(llss->inode2, file2, gid);
2652                 ll_put_grouplock(llss->inode1, file1, gid);
2653         }
2654
2655 free:
2656         if (llss != NULL)
2657                 OBD_FREE_PTR(llss);
2658
2659         RETURN(rc);
2660 }
2661
2662 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2663 {
2664         struct md_op_data       *op_data;
2665         int                      rc;
2666         ENTRY;
2667
2668         /* Detect out-of range masks */
2669         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2670                 RETURN(-EINVAL);
2671
2672         /* Non-root users are forbidden to set or clear flags which are
2673          * NOT defined in HSM_USER_MASK. */
2674         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2675             !cfs_capable(CFS_CAP_SYS_ADMIN))
2676                 RETURN(-EPERM);
2677
2678         /* Detect out-of range archive id */
2679         if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2680             (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2681                 RETURN(-EINVAL);
2682
2683         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2684                                      LUSTRE_OPC_ANY, hss);
2685         if (IS_ERR(op_data))
2686                 RETURN(PTR_ERR(op_data));
2687
2688         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2689                            sizeof(*op_data), op_data, NULL);
2690
2691         ll_finish_md_op_data(op_data);
2692
2693         RETURN(rc);
2694 }
2695
2696 static int ll_hsm_import(struct inode *inode, struct file *file,
2697                          struct hsm_user_import *hui)
2698 {
2699         struct hsm_state_set    *hss = NULL;
2700         struct iattr            *attr = NULL;
2701         int                      rc;
2702         ENTRY;
2703
2704         if (!S_ISREG(inode->i_mode))
2705                 RETURN(-EINVAL);
2706
2707         /* set HSM flags */
2708         OBD_ALLOC_PTR(hss);
2709         if (hss == NULL)
2710                 GOTO(out, rc = -ENOMEM);
2711
2712         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2713         hss->hss_archive_id = hui->hui_archive_id;
2714         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2715         rc = ll_hsm_state_set(inode, hss);
2716         if (rc != 0)
2717                 GOTO(out, rc);
2718
2719         OBD_ALLOC_PTR(attr);
2720         if (attr == NULL)
2721                 GOTO(out, rc = -ENOMEM);
2722
2723         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2724         attr->ia_mode |= S_IFREG;
2725         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2726         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2727         attr->ia_size = hui->hui_size;
2728         attr->ia_mtime.tv_sec = hui->hui_mtime;
2729         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2730         attr->ia_atime.tv_sec = hui->hui_atime;
2731         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2732
2733         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2734                          ATTR_UID | ATTR_GID |
2735                          ATTR_MTIME | ATTR_MTIME_SET |
2736                          ATTR_ATIME | ATTR_ATIME_SET;
2737
2738         inode_lock(inode);
2739
2740         rc = ll_setattr_raw(file_dentry(file), attr, true);
2741         if (rc == -ENODATA)
2742                 rc = 0;
2743
2744         inode_unlock(inode);
2745
2746 out:
2747         if (hss != NULL)
2748                 OBD_FREE_PTR(hss);
2749
2750         if (attr != NULL)
2751                 OBD_FREE_PTR(attr);
2752
2753         RETURN(rc);
2754 }
2755
2756 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2757 {
2758         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2759                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2760 }
2761
2762 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2763 {
2764         struct inode *inode = file_inode(file);
2765         struct iattr ia = {
2766                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2767                             ATTR_MTIME | ATTR_MTIME_SET |
2768                             ATTR_CTIME | ATTR_CTIME_SET,
2769                 .ia_atime = {
2770                         .tv_sec = lfu->lfu_atime_sec,
2771                         .tv_nsec = lfu->lfu_atime_nsec,
2772                 },
2773                 .ia_mtime = {
2774                         .tv_sec = lfu->lfu_mtime_sec,
2775                         .tv_nsec = lfu->lfu_mtime_nsec,
2776                 },
2777                 .ia_ctime = {
2778                         .tv_sec = lfu->lfu_ctime_sec,
2779                         .tv_nsec = lfu->lfu_ctime_nsec,
2780                 },
2781         };
2782         int rc;
2783         ENTRY;
2784
2785         if (!capable(CAP_SYS_ADMIN))
2786                 RETURN(-EPERM);
2787
2788         if (!S_ISREG(inode->i_mode))
2789                 RETURN(-EINVAL);
2790
2791         inode_lock(inode);
2792         rc = ll_setattr_raw(file_dentry(file), &ia, false);
2793         inode_unlock(inode);
2794
2795         RETURN(rc);
2796 }
2797
2798 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2799 {
2800         switch (mode) {
2801         case MODE_READ_USER:
2802                 return CLM_READ;
2803         case MODE_WRITE_USER:
2804                 return CLM_WRITE;
2805         default:
2806                 return -EINVAL;
2807         }
2808 }
2809
2810 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2811
2812 /* Used to allow the upper layers of the client to request an LDLM lock
2813  * without doing an actual read or write.
2814  *
2815  * Used for ladvise lockahead to manually request specific locks.
2816  *
2817  * \param[in] file      file this ladvise lock request is on
2818  * \param[in] ladvise   ladvise struct describing this lock request
2819  *
2820  * \retval 0            success, no detailed result available (sync requests
2821  *                      and requests sent to the server [not handled locally]
2822  *                      cannot return detailed results)
2823  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2824  *                                       see definitions for details.
2825  * \retval negative     negative errno on error
2826  */
2827 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2828 {
2829         struct lu_env *env = NULL;
2830         struct cl_io *io  = NULL;
2831         struct cl_lock *lock = NULL;
2832         struct cl_lock_descr *descr = NULL;
2833         struct dentry *dentry = file->f_path.dentry;
2834         struct inode *inode = dentry->d_inode;
2835         enum cl_lock_mode cl_mode;
2836         off_t start = ladvise->lla_start;
2837         off_t end = ladvise->lla_end;
2838         int result;
2839         __u16 refcheck;
2840
2841         ENTRY;
2842
2843         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2844                "start=%llu, end=%llu\n", dentry->d_name.len,
2845                dentry->d_name.name, dentry->d_inode,
2846                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2847                (__u64) end);
2848
2849         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2850         if (cl_mode < 0)
2851                 GOTO(out, result = cl_mode);
2852
2853         /* Get IO environment */
2854         result = cl_io_get(inode, &env, &io, &refcheck);
2855         if (result <= 0)
2856                 GOTO(out, result);
2857
2858         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2859         if (result > 0) {
2860                 /*
2861                  * nothing to do for this io. This currently happens when
2862                  * stripe sub-object's are not yet created.
2863                  */
2864                 result = io->ci_result;
2865         } else if (result == 0) {
2866                 lock = vvp_env_lock(env);
2867                 descr = &lock->cll_descr;
2868
2869                 descr->cld_obj   = io->ci_obj;
2870                 /* Convert byte offsets to pages */
2871                 descr->cld_start = cl_index(io->ci_obj, start);
2872                 descr->cld_end   = cl_index(io->ci_obj, end);
2873                 descr->cld_mode  = cl_mode;
2874                 /* CEF_MUST is used because we do not want to convert a
2875                  * lockahead request to a lockless lock */
2876                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2877                                        CEF_NONBLOCK;
2878
2879                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2880                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2881
2882                 result = cl_lock_request(env, io, lock);
2883
2884                 /* On success, we need to release the lock */
2885                 if (result >= 0)
2886                         cl_lock_release(env, lock);
2887         }
2888         cl_io_fini(env, io);
2889         cl_env_put(env, &refcheck);
2890
2891         /* -ECANCELED indicates a matching lock with a different extent
2892          * was already present, and -EEXIST indicates a matching lock
2893          * on exactly the same extent was already present.
2894          * We convert them to positive values for userspace to make
2895          * recognizing true errors easier.
2896          * Note we can only return these detailed results on async requests,
2897          * as sync requests look the same as i/o requests for locking. */
2898         if (result == -ECANCELED)
2899                 result = LLA_RESULT_DIFFERENT;
2900         else if (result == -EEXIST)
2901                 result = LLA_RESULT_SAME;
2902
2903 out:
2904         RETURN(result);
2905 }
2906 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2907
2908 static int ll_ladvise_sanity(struct inode *inode,
2909                              struct llapi_lu_ladvise *ladvise)
2910 {
2911         enum lu_ladvise_type advice = ladvise->lla_advice;
2912         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2913          * be in the first 32 bits of enum ladvise_flags */
2914         __u32 flags = ladvise->lla_peradvice_flags;
2915         /* 3 lines at 80 characters per line, should be plenty */
2916         int rc = 0;
2917
2918         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2919                 rc = -EINVAL;
2920                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2921                        "last supported advice is %s (value '%d'): rc = %d\n",
2922                        ll_get_fsname(inode->i_sb, NULL, 0), advice,
2923                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2924                 GOTO(out, rc);
2925         }
2926
2927         /* Per-advice checks */
2928         switch (advice) {
2929         case LU_LADVISE_LOCKNOEXPAND:
2930                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2931                         rc = -EINVAL;
2932                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2933                                "rc = %d\n",
2934                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2935                                ladvise_names[advice], rc);
2936                         GOTO(out, rc);
2937                 }
2938                 break;
2939         case LU_LADVISE_LOCKAHEAD:
2940                 /* Currently only READ and WRITE modes can be requested */
2941                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2942                     ladvise->lla_lockahead_mode == 0) {
2943                         rc = -EINVAL;
2944                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2945                                "rc = %d\n",
2946                                ll_get_fsname(inode->i_sb, NULL, 0),
2947                                ladvise->lla_lockahead_mode,
2948                                ladvise_names[advice], rc);
2949                         GOTO(out, rc);
2950                 }
2951         case LU_LADVISE_WILLREAD:
2952         case LU_LADVISE_DONTNEED:
2953         default:
2954                 /* Note fall through above - These checks apply to all advices
2955                  * except LOCKNOEXPAND */
2956                 if (flags & ~LF_DEFAULT_MASK) {
2957                         rc = -EINVAL;
2958                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2959                                "rc = %d\n",
2960                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2961                                ladvise_names[advice], rc);
2962                         GOTO(out, rc);
2963                 }
2964                 if (ladvise->lla_start >= ladvise->lla_end) {
2965                         rc = -EINVAL;
2966                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2967                                "for %s: rc = %d\n",
2968                                ll_get_fsname(inode->i_sb, NULL, 0),
2969                                ladvise->lla_start, ladvise->lla_end,
2970                                ladvise_names[advice], rc);
2971                         GOTO(out, rc);
2972                 }
2973                 break;
2974         }
2975
2976 out:
2977         return rc;
2978 }
2979 #undef ERRSIZE
2980
2981 /*
2982  * Give file access advices
2983  *
2984  * The ladvise interface is similar to Linux fadvise() system call, except it
2985  * forwards the advices directly from Lustre client to server. The server side
2986  * codes will apply appropriate read-ahead and caching techniques for the
2987  * corresponding files.
2988  *
2989  * A typical workload for ladvise is e.g. a bunch of different clients are
2990  * doing small random reads of a file, so prefetching pages into OSS cache
2991  * with big linear reads before the random IO is a net benefit. Fetching
2992  * all that data into each client cache with fadvise() may not be, due to
2993  * much more data being sent to the client.
2994  */
2995 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2996                       struct llapi_lu_ladvise *ladvise)
2997 {
2998         struct lu_env *env;
2999         struct cl_io *io;
3000         struct cl_ladvise_io *lio;
3001         int rc;
3002         __u16 refcheck;
3003         ENTRY;
3004
3005         env = cl_env_get(&refcheck);
3006         if (IS_ERR(env))
3007                 RETURN(PTR_ERR(env));
3008
3009         io = vvp_env_thread_io(env);
3010         io->ci_obj = ll_i2info(inode)->lli_clob;
3011
3012         /* initialize parameters for ladvise */
3013         lio = &io->u.ci_ladvise;
3014         lio->li_start = ladvise->lla_start;
3015         lio->li_end = ladvise->lla_end;
3016         lio->li_fid = ll_inode2fid(inode);
3017         lio->li_advice = ladvise->lla_advice;
3018         lio->li_flags = flags;
3019
3020         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3021                 rc = cl_io_loop(env, io);
3022         else
3023                 rc = io->ci_result;
3024
3025         cl_io_fini(env, io);
3026         cl_env_put(env, &refcheck);
3027         RETURN(rc);
3028 }
3029
3030 static int ll_lock_noexpand(struct file *file, int flags)
3031 {
3032         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3033
3034         fd->ll_lock_no_expand = !(flags & LF_UNSET);
3035
3036         return 0;
3037 }
3038
3039 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3040                         unsigned long arg)
3041 {
3042         struct fsxattr fsxattr;
3043
3044         if (copy_from_user(&fsxattr,
3045                            (const struct fsxattr __user *)arg,
3046                            sizeof(fsxattr)))
3047                 RETURN(-EFAULT);
3048
3049         fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
3050         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3051         if (copy_to_user((struct fsxattr __user *)arg,
3052                          &fsxattr, sizeof(fsxattr)))
3053                 RETURN(-EFAULT);
3054
3055         RETURN(0);
3056 }
3057
3058 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3059                         unsigned long arg)
3060 {
3061
3062         struct md_op_data *op_data;
3063         struct ptlrpc_request *req = NULL;
3064         int rc = 0;
3065         struct fsxattr fsxattr;
3066         struct cl_object *obj;
3067
3068         /* only root could change project ID */
3069         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
3070                 RETURN(-EPERM);
3071
3072         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3073                                      LUSTRE_OPC_ANY, NULL);
3074         if (IS_ERR(op_data))
3075                 RETURN(PTR_ERR(op_data));
3076
3077         if (copy_from_user(&fsxattr,
3078                            (const struct fsxattr __user *)arg,
3079                            sizeof(fsxattr)))
3080                 GOTO(out_fsxattr1, rc = -EFAULT);
3081
3082         op_data->op_attr_flags = fsxattr.fsx_xflags;
3083         op_data->op_projid = fsxattr.fsx_projid;
3084         op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
3085         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3086                         0, &req);
3087         ptlrpc_req_finished(req);
3088
3089         obj = ll_i2info(inode)->lli_clob;
3090         if (obj) {
3091                 struct iattr *attr;
3092
3093                 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
3094                 OBD_ALLOC_PTR(attr);
3095                 if (attr == NULL)
3096                         GOTO(out_fsxattr1, rc = -ENOMEM);
3097                 attr->ia_valid = ATTR_ATTR_FLAG;
3098                 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
3099
3100                 OBD_FREE_PTR(attr);
3101         }
3102 out_fsxattr1:
3103         ll_finish_md_op_data(op_data);
3104         RETURN(rc);
3105 }
3106
3107 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3108                                  unsigned long arg)
3109 {
3110         struct inode            *inode = file_inode(file);
3111         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3112         struct ll_inode_info    *lli = ll_i2info(inode);
3113         struct obd_client_handle *och = NULL;
3114         struct split_param sp;
3115         bool lease_broken;
3116         fmode_t fmode = 0;
3117         enum mds_op_bias bias = 0;
3118         struct file *layout_file = NULL;
3119         void *data = NULL;
3120         size_t data_size = 0;
3121         long rc;
3122         ENTRY;
3123
3124         mutex_lock(&lli->lli_och_mutex);
3125         if (fd->fd_lease_och != NULL) {
3126                 och = fd->fd_lease_och;
3127                 fd->fd_lease_och = NULL;
3128         }
3129         mutex_unlock(&lli->lli_och_mutex);
3130
3131         if (och == NULL)
3132                 GOTO(out, rc = -ENOLCK);
3133
3134         fmode = och->och_flags;
3135
3136         switch (ioc->lil_flags) {
3137         case LL_LEASE_RESYNC_DONE:
3138                 if (ioc->lil_count > IOC_IDS_MAX)
3139                         GOTO(out, rc = -EINVAL);
3140
3141                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3142                 OBD_ALLOC(data, data_size);
3143                 if (!data)
3144                         GOTO(out, rc = -ENOMEM);
3145
3146                 if (copy_from_user(data, (void __user *)arg, data_size))
3147                         GOTO(out, rc = -EFAULT);
3148
3149                 bias = MDS_CLOSE_RESYNC_DONE;
3150                 break;
3151         case LL_LEASE_LAYOUT_MERGE: {
3152                 int fd;
3153
3154                 if (ioc->lil_count != 1)
3155                         GOTO(out, rc = -EINVAL);
3156
3157                 arg += sizeof(*ioc);
3158                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3159                         GOTO(out, rc = -EFAULT);
3160
3161                 layout_file = fget(fd);
3162                 if (!layout_file)
3163                         GOTO(out, rc = -EBADF);
3164
3165                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3166                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3167                         GOTO(out, rc = -EPERM);
3168
3169                 data = file_inode(layout_file);
3170                 bias = MDS_CLOSE_LAYOUT_MERGE;
3171                 break;
3172         }
3173         case LL_LEASE_LAYOUT_SPLIT: {
3174                 int fdv;
3175                 int mirror_id;
3176
3177                 if (ioc->lil_count != 2)
3178                         GOTO(out, rc = -EINVAL);
3179
3180                 arg += sizeof(*ioc);
3181                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3182                         GOTO(out, rc = -EFAULT);
3183
3184                 arg += sizeof(__u32);
3185                 if (copy_from_user(&mirror_id, (void __user *)arg,
3186                                    sizeof(__u32)))
3187                         GOTO(out, rc = -EFAULT);
3188
3189                 layout_file = fget(fdv);
3190                 if (!layout_file)
3191                         GOTO(out, rc = -EBADF);
3192
3193                 sp.sp_inode = file_inode(layout_file);
3194                 sp.sp_mirror_id = (__u16)mirror_id;
3195                 data = &sp;
3196                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3197                 break;
3198         }
3199         default:
3200                 /* without close intent */
3201                 break;
3202         }
3203
3204         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3205         if (rc < 0)
3206                 GOTO(out, rc);
3207
3208         rc = ll_lease_och_release(inode, file);
3209         if (rc < 0)
3210                 GOTO(out, rc);
3211
3212         if (lease_broken)
3213                 fmode = 0;
3214         EXIT;
3215
3216 out:
3217         switch (ioc->lil_flags) {
3218         case LL_LEASE_RESYNC_DONE:
3219                 if (data)
3220                         OBD_FREE(data, data_size);
3221                 break;
3222         case LL_LEASE_LAYOUT_MERGE:
3223         case LL_LEASE_LAYOUT_SPLIT:
3224                 if (layout_file)
3225                         fput(layout_file);
3226                 break;
3227         }
3228
3229         if (!rc)
3230                 rc = ll_lease_type_from_fmode(fmode);
3231         RETURN(rc);
3232 }
3233
3234 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3235                               unsigned long arg)
3236 {
3237         struct inode *inode = file_inode(file);
3238         struct ll_inode_info *lli = ll_i2info(inode);
3239         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3240         struct obd_client_handle *och = NULL;
3241         __u64 open_flags = 0;
3242         bool lease_broken;
3243         fmode_t fmode;
3244         long rc;
3245         ENTRY;
3246
3247         switch (ioc->lil_mode) {
3248         case LL_LEASE_WRLCK:
3249                 if (!(file->f_mode & FMODE_WRITE))
3250                         RETURN(-EPERM);
3251                 fmode = FMODE_WRITE;
3252                 break;
3253         case LL_LEASE_RDLCK:
3254                 if (!(file->f_mode & FMODE_READ))
3255                         RETURN(-EPERM);
3256                 fmode = FMODE_READ;
3257                 break;
3258         case LL_LEASE_UNLCK:
3259                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3260         default:
3261                 RETURN(-EINVAL);
3262         }
3263
3264         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3265
3266         /* apply for lease */
3267         if (ioc->lil_flags & LL_LEASE_RESYNC)
3268                 open_flags = MDS_OPEN_RESYNC;
3269         och = ll_lease_open(inode, file, fmode, open_flags);
3270         if (IS_ERR(och))
3271                 RETURN(PTR_ERR(och));
3272
3273         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3274                 rc = ll_lease_file_resync(och, inode);
3275                 if (rc) {
3276                         ll_lease_close(och, inode, NULL);
3277                         RETURN(rc);
3278                 }
3279                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3280                 if (rc) {
3281                         ll_lease_close(och, inode, NULL);
3282                         RETURN(rc);
3283                 }
3284         }
3285
3286         rc = 0;
3287         mutex_lock(&lli->lli_och_mutex);
3288         if (fd->fd_lease_och == NULL) {
3289                 fd->fd_lease_och = och;
3290                 och = NULL;
3291         }
3292         mutex_unlock(&lli->lli_och_mutex);
3293         if (och != NULL) {
3294                 /* impossible now that only excl is supported for now */
3295                 ll_lease_close(och, inode, &lease_broken);
3296                 rc = -EBUSY;
3297         }
3298         RETURN(rc);
3299 }
3300
3301 static long
3302 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3303 {
3304         struct inode            *inode = file_inode(file);
3305         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3306         int                      flags, rc;
3307         ENTRY;
3308
3309         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3310                PFID(ll_inode2fid(inode)), inode, cmd);
3311         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3312
3313         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3314         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3315                 RETURN(-ENOTTY);
3316
3317         switch (cmd) {
3318         case LL_IOC_GETFLAGS:
3319                 /* Get the current value of the file flags */
3320                 return put_user(fd->fd_flags, (int __user *)arg);
3321         case LL_IOC_SETFLAGS:
3322         case LL_IOC_CLRFLAGS:
3323                 /* Set or clear specific file flags */
3324                 /* XXX This probably needs checks to ensure the flags are
3325                  *     not abused, and to handle any flag side effects.
3326                  */
3327                 if (get_user(flags, (int __user *) arg))
3328                         RETURN(-EFAULT);
3329
3330                 if (cmd == LL_IOC_SETFLAGS) {
3331                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3332                             !(file->f_flags & O_DIRECT)) {
3333                                 CERROR("%s: unable to disable locking on "
3334                                        "non-O_DIRECT file\n", current->comm);
3335                                 RETURN(-EINVAL);
3336                         }
3337
3338                         fd->fd_flags |= flags;
3339                 } else {
3340                         fd->fd_flags &= ~flags;
3341                 }
3342                 RETURN(0);
3343         case LL_IOC_LOV_SETSTRIPE:
3344         case LL_IOC_LOV_SETSTRIPE_NEW:
3345                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3346         case LL_IOC_LOV_SETEA:
3347                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3348         case LL_IOC_LOV_SWAP_LAYOUTS: {
3349                 struct file *file2;
3350                 struct lustre_swap_layouts lsl;
3351
3352                 if (copy_from_user(&lsl, (char __user *)arg,
3353                                    sizeof(struct lustre_swap_layouts)))
3354                         RETURN(-EFAULT);
3355
3356                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3357                         RETURN(-EPERM);
3358
3359                 file2 = fget(lsl.sl_fd);
3360                 if (file2 == NULL)
3361                         RETURN(-EBADF);
3362
3363                 /* O_WRONLY or O_RDWR */
3364                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3365                         GOTO(out, rc = -EPERM);
3366
3367                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3368                         struct inode                    *inode2;
3369                         struct ll_inode_info            *lli;
3370                         struct obd_client_handle        *och = NULL;
3371
3372                         lli = ll_i2info(inode);
3373                         mutex_lock(&lli->lli_och_mutex);
3374                         if (fd->fd_lease_och != NULL) {
3375                                 och = fd->fd_lease_och;
3376                                 fd->fd_lease_och = NULL;
3377                         }
3378                         mutex_unlock(&lli->lli_och_mutex);
3379                         if (och == NULL)
3380                                 GOTO(out, rc = -ENOLCK);
3381                         inode2 = file_inode(file2);
3382                         rc = ll_swap_layouts_close(och, inode, inode2);
3383                 } else {
3384                         rc = ll_swap_layouts(file, file2, &lsl);
3385                 }
3386 out:
3387                 fput(file2);
3388                 RETURN(rc);
3389         }
3390         case LL_IOC_LOV_GETSTRIPE:
3391         case LL_IOC_LOV_GETSTRIPE_NEW:
3392                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3393         case FS_IOC_GETFLAGS:
3394         case FS_IOC_SETFLAGS:
3395                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3396         case FSFILT_IOC_GETVERSION:
3397         case FS_IOC_GETVERSION:
3398                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3399         /* We need to special case any other ioctls we want to handle,
3400          * to send them to the MDS/OST as appropriate and to properly
3401          * network encode the arg field. */
3402         case FS_IOC_SETVERSION:
3403                 RETURN(-ENOTSUPP);
3404
3405         case LL_IOC_GROUP_LOCK:
3406                 RETURN(ll_get_grouplock(inode, file, arg));
3407         case LL_IOC_GROUP_UNLOCK:
3408                 RETURN(ll_put_grouplock(inode, file, arg));
3409         case IOC_OBD_STATFS:
3410                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3411
3412         case LL_IOC_FLUSHCTX:
3413                 RETURN(ll_flush_ctx(inode));
3414         case LL_IOC_PATH2FID: {
3415                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3416                                  sizeof(struct lu_fid)))
3417                         RETURN(-EFAULT);
3418
3419                 RETURN(0);
3420         }
3421         case LL_IOC_GETPARENT:
3422                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3423
3424         case OBD_IOC_FID2PATH:
3425                 RETURN(ll_fid2path(inode, (void __user *)arg));
3426         case LL_IOC_DATA_VERSION: {
3427                 struct ioc_data_version idv;
3428                 int rc;
3429
3430                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3431                         RETURN(-EFAULT);
3432
3433                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3434                 rc = ll_ioc_data_version(inode, &idv);
3435
3436                 if (rc == 0 &&
3437                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3438                         RETURN(-EFAULT);
3439
3440                 RETURN(rc);
3441         }
3442
3443         case LL_IOC_GET_MDTIDX: {
3444                 int mdtidx;
3445
3446                 mdtidx = ll_get_mdt_idx(inode);
3447                 if (mdtidx < 0)
3448                         RETURN(mdtidx);
3449
3450                 if (put_user((int)mdtidx, (int __user *)arg))
3451                         RETURN(-EFAULT);
3452
3453                 RETURN(0);
3454         }
3455         case OBD_IOC_GETDTNAME:
3456         case OBD_IOC_GETMDNAME:
3457                 RETURN(ll_get_obd_name(inode, cmd, arg));
3458         case LL_IOC_HSM_STATE_GET: {
3459                 struct md_op_data       *op_data;
3460                 struct hsm_user_state   *hus;
3461                 int                      rc;
3462
3463                 OBD_ALLOC_PTR(hus);
3464                 if (hus == NULL)
3465                         RETURN(-ENOMEM);
3466
3467                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3468                                              LUSTRE_OPC_ANY, hus);
3469                 if (IS_ERR(op_data)) {
3470                         OBD_FREE_PTR(hus);
3471                         RETURN(PTR_ERR(op_data));
3472                 }
3473
3474                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3475                                    op_data, NULL);
3476
3477                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3478                         rc = -EFAULT;
3479
3480                 ll_finish_md_op_data(op_data);
3481                 OBD_FREE_PTR(hus);
3482                 RETURN(rc);
3483         }
3484         case LL_IOC_HSM_STATE_SET: {
3485                 struct hsm_state_set    *hss;
3486                 int                      rc;
3487
3488                 OBD_ALLOC_PTR(hss);
3489                 if (hss == NULL)
3490                         RETURN(-ENOMEM);
3491
3492                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3493                         OBD_FREE_PTR(hss);
3494                         RETURN(-EFAULT);
3495                 }
3496
3497                 rc = ll_hsm_state_set(inode, hss);
3498
3499                 OBD_FREE_PTR(hss);
3500                 RETURN(rc);
3501         }
3502         case LL_IOC_HSM_ACTION: {
3503                 struct md_op_data               *op_data;
3504                 struct hsm_current_action       *hca;
3505                 int                              rc;
3506
3507                 OBD_ALLOC_PTR(hca);
3508                 if (hca == NULL)
3509                         RETURN(-ENOMEM);
3510
3511                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3512                                              LUSTRE_OPC_ANY, hca);
3513                 if (IS_ERR(op_data)) {
3514                         OBD_FREE_PTR(hca);
3515                         RETURN(PTR_ERR(op_data));
3516                 }
3517
3518                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3519                                    op_data, NULL);
3520
3521                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3522                         rc = -EFAULT;
3523
3524                 ll_finish_md_op_data(op_data);
3525                 OBD_FREE_PTR(hca);
3526                 RETURN(rc);
3527         }
3528         case LL_IOC_SET_LEASE_OLD: {
3529                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3530
3531                 RETURN(ll_file_set_lease(file, &ioc, 0));
3532         }
3533         case LL_IOC_SET_LEASE: {
3534                 struct ll_ioc_lease ioc;
3535
3536                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3537                         RETURN(-EFAULT);
3538
3539                 RETURN(ll_file_set_lease(file, &ioc, arg));
3540         }
3541         case LL_IOC_GET_LEASE: {
3542                 struct ll_inode_info *lli = ll_i2info(inode);
3543                 struct ldlm_lock *lock = NULL;
3544                 fmode_t fmode = 0;
3545
3546                 mutex_lock(&lli->lli_och_mutex);
3547                 if (fd->fd_lease_och != NULL) {
3548                         struct obd_client_handle *och = fd->fd_lease_och;
3549
3550                         lock = ldlm_handle2lock(&och->och_lease_handle);
3551                         if (lock != NULL) {
3552                                 lock_res_and_lock(lock);
3553                                 if (!ldlm_is_cancel(lock))
3554                                         fmode = och->och_flags;
3555
3556                                 unlock_res_and_lock(lock);
3557                                 LDLM_LOCK_PUT(lock);
3558                         }
3559                 }
3560                 mutex_unlock(&lli->lli_och_mutex);
3561
3562                 RETURN(ll_lease_type_from_fmode(fmode));
3563         }
3564         case LL_IOC_HSM_IMPORT: {
3565                 struct hsm_user_import *hui;
3566
3567                 OBD_ALLOC_PTR(hui);
3568                 if (hui == NULL)
3569                         RETURN(-ENOMEM);
3570
3571                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3572                         OBD_FREE_PTR(hui);
3573                         RETURN(-EFAULT);
3574                 }
3575
3576                 rc = ll_hsm_import(inode, file, hui);
3577
3578                 OBD_FREE_PTR(hui);
3579                 RETURN(rc);
3580         }
3581         case LL_IOC_FUTIMES_3: {
3582                 struct ll_futimes_3 lfu;
3583
3584                 if (copy_from_user(&lfu,
3585                                    (const struct ll_futimes_3 __user *)arg,
3586                                    sizeof(lfu)))
3587                         RETURN(-EFAULT);
3588
3589                 RETURN(ll_file_futimes_3(file, &lfu));
3590         }
3591         case LL_IOC_LADVISE: {
3592                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3593                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3594                 int i;
3595                 int num_advise;
3596                 int alloc_size = sizeof(*k_ladvise_hdr);
3597
3598                 rc = 0;
3599                 u_ladvise_hdr = (void __user *)arg;
3600                 OBD_ALLOC_PTR(k_ladvise_hdr);
3601                 if (k_ladvise_hdr == NULL)
3602                         RETURN(-ENOMEM);
3603
3604                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3605                         GOTO(out_ladvise, rc = -EFAULT);
3606
3607                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3608                     k_ladvise_hdr->lah_count < 1)
3609                         GOTO(out_ladvise, rc = -EINVAL);
3610
3611                 num_advise = k_ladvise_hdr->lah_count;
3612                 if (num_advise >= LAH_COUNT_MAX)
3613                         GOTO(out_ladvise, rc = -EFBIG);
3614
3615                 OBD_FREE_PTR(k_ladvise_hdr);
3616                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3617                                       lah_advise[num_advise]);
3618                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3619                 if (k_ladvise_hdr == NULL)
3620                         RETURN(-ENOMEM);
3621
3622                 /*
3623                  * TODO: submit multiple advices to one server in a single RPC
3624                  */
3625                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3626                         GOTO(out_ladvise, rc = -EFAULT);
3627
3628                 for (i = 0; i < num_advise; i++) {
3629                         struct llapi_lu_ladvise *k_ladvise =
3630                                         &k_ladvise_hdr->lah_advise[i];
3631                         struct llapi_lu_ladvise __user *u_ladvise =
3632                                         &u_ladvise_hdr->lah_advise[i];
3633
3634                         rc = ll_ladvise_sanity(inode, k_ladvise);
3635                         if (rc)
3636                                 GOTO(out_ladvise, rc);
3637
3638                         switch (k_ladvise->lla_advice) {
3639                         case LU_LADVISE_LOCKNOEXPAND:
3640                                 rc = ll_lock_noexpand(file,
3641                                                k_ladvise->lla_peradvice_flags);
3642                                 GOTO(out_ladvise, rc);
3643                         case LU_LADVISE_LOCKAHEAD:
3644
3645                                 rc = ll_file_lock_ahead(file, k_ladvise);
3646
3647                                 if (rc < 0)
3648                                         GOTO(out_ladvise, rc);
3649
3650                                 if (put_user(rc,
3651                                              &u_ladvise->lla_lockahead_result))
3652                                         GOTO(out_ladvise, rc = -EFAULT);
3653                                 break;
3654                         default:
3655                                 rc = ll_ladvise(inode, file,
3656                                                 k_ladvise_hdr->lah_flags,
3657                                                 k_ladvise);
3658                                 if (rc)
3659                                         GOTO(out_ladvise, rc);
3660                                 break;
3661                         }
3662
3663                 }
3664
3665 out_ladvise:
3666                 OBD_FREE(k_ladvise_hdr, alloc_size);
3667                 RETURN(rc);
3668         }
3669         case LL_IOC_FLR_SET_MIRROR: {
3670                 /* mirror I/O must be direct to avoid polluting page cache
3671                  * by stale data. */
3672                 if (!(file->f_flags & O_DIRECT))
3673                         RETURN(-EINVAL);
3674
3675                 fd->fd_designated_mirror = (__u32)arg;
3676                 RETURN(0);
3677         }
3678         case LL_IOC_FSGETXATTR:
3679                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3680         case LL_IOC_FSSETXATTR:
3681                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3682         case BLKSSZGET:
3683                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3684         default:
3685                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3686                                      (void __user *)arg));
3687         }
3688 }
3689
3690 #ifndef HAVE_FILE_LLSEEK_SIZE
3691 static inline loff_t
3692 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3693 {
3694         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3695                 return -EINVAL;
3696         if (offset > maxsize)
3697                 return -EINVAL;
3698
3699         if (offset != file->f_pos) {
3700                 file->f_pos = offset;
3701                 file->f_version = 0;
3702         }
3703         return offset;
3704 }
3705
3706 static loff_t
3707 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3708                 loff_t maxsize, loff_t eof)
3709 {
3710         struct inode *inode = file_inode(file);
3711
3712         switch (origin) {
3713         case SEEK_END:
3714                 offset += eof;
3715                 break;
3716         case SEEK_CUR:
3717                 /*
3718                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3719                  * position-querying operation.  Avoid rewriting the "same"
3720                  * f_pos value back to the file because a concurrent read(),
3721                  * write() or lseek() might have altered it
3722                  */
3723                 if (offset == 0)
3724                         return file->f_pos;
3725                 /*
3726                  * f_lock protects against read/modify/write race with other
3727                  * SEEK_CURs. Note that parallel writes and reads behave
3728                  * like SEEK_SET.
3729                  */
3730                 inode_lock(inode);
3731                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3732                 inode_unlock(inode);
3733                 return offset;
3734         case SEEK_DATA:
3735                 /*
3736                  * In the generic case the entire file is data, so as long as
3737                  * offset isn't at the end of the file then the offset is data.
3738                  */
3739                 if (offset >= eof)
3740                         return -ENXIO;
3741                 break;
3742         case SEEK_HOLE:
3743                 /*
3744                  * There is a virtual hole at the end of the file, so as long as
3745                  * offset isn't i_size or larger, return i_size.
3746                  */
3747                 if (offset >= eof)
3748                         return -ENXIO;
3749                 offset = eof;
3750                 break;
3751         }
3752
3753         return llseek_execute(file, offset, maxsize);
3754 }
3755 #endif
3756
3757 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3758 {
3759         struct inode *inode = file_inode(file);
3760         loff_t retval, eof = 0;
3761
3762         ENTRY;
3763         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3764                            (origin == SEEK_CUR) ? file->f_pos : 0);
3765         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3766                PFID(ll_inode2fid(inode)), inode, retval, retval,
3767                origin);
3768         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3769
3770         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3771                 retval = ll_glimpse_size(inode);
3772                 if (retval != 0)
3773                         RETURN(retval);
3774                 eof = i_size_read(inode);
3775         }
3776
3777         retval = ll_generic_file_llseek_size(file, offset, origin,
3778                                           ll_file_maxbytes(inode), eof);
3779         RETURN(retval);
3780 }
3781
3782 static int ll_flush(struct file *file, fl_owner_t id)
3783 {
3784         struct inode *inode = file_inode(file);
3785         struct ll_inode_info *lli = ll_i2info(inode);
3786         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3787         int rc, err;
3788
3789         LASSERT(!S_ISDIR(inode->i_mode));
3790
3791         /* catch async errors that were recorded back when async writeback
3792          * failed for pages in this mapping. */
3793         rc = lli->lli_async_rc;
3794         lli->lli_async_rc = 0;
3795         if (lli->lli_clob != NULL) {
3796                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3797                 if (rc == 0)
3798                         rc = err;
3799         }
3800
3801         /* The application has been told write failure already.
3802          * Do not report failure again. */
3803         if (fd->fd_write_failed)
3804                 return 0;
3805         return rc ? -EIO : 0;
3806 }
3807
3808 /**
3809  * Called to make sure a portion of file has been written out.
3810  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3811  *
3812  * Return how many pages have been written.
3813  */
3814 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3815                        enum cl_fsync_mode mode, int ignore_layout)
3816 {
3817         struct lu_env *env;
3818         struct cl_io *io;
3819         struct cl_fsync_io *fio;
3820         int result;
3821         __u16 refcheck;
3822         ENTRY;
3823
3824         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3825             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3826                 RETURN(-EINVAL);
3827
3828         env = cl_env_get(&refcheck);
3829         if (IS_ERR(env))
3830                 RETURN(PTR_ERR(env));
3831
3832         io = vvp_env_thread_io(env);
3833         io->ci_obj = ll_i2info(inode)->lli_clob;
3834         io->ci_ignore_layout = ignore_layout;
3835
3836         /* initialize parameters for sync */
3837         fio = &io->u.ci_fsync;
3838         fio->fi_start = start;
3839         fio->fi_end = end;
3840         fio->fi_fid = ll_inode2fid(inode);
3841         fio->fi_mode = mode;
3842         fio->fi_nr_written = 0;
3843
3844         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3845                 result = cl_io_loop(env, io);
3846         else
3847                 result = io->ci_result;
3848         if (result == 0)
3849                 result = fio->fi_nr_written;
3850         cl_io_fini(env, io);
3851         cl_env_put(env, &refcheck);
3852
3853         RETURN(result);
3854 }
3855
3856 /*
3857  * When dentry is provided (the 'else' case), file_dentry() may be
3858  * null and dentry must be used directly rather than pulled from
3859  * file_dentry() as is done otherwise.
3860  */
3861
3862 #ifdef HAVE_FILE_FSYNC_4ARGS
3863 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3864 {
3865         struct dentry *dentry = file_dentry(file);
3866         bool lock_inode;
3867 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3868 int ll_fsync(struct file *file, int datasync)
3869 {
3870         struct dentry *dentry = file_dentry(file);
3871         loff_t start = 0;
3872         loff_t end = LLONG_MAX;
3873 #else
3874 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3875 {
3876         loff_t start = 0;
3877         loff_t end = LLONG_MAX;
3878 #endif
3879         struct inode *inode = dentry->d_inode;
3880         struct ll_inode_info *lli = ll_i2info(inode);
3881         struct ptlrpc_request *req;
3882         int rc, err;
3883         ENTRY;
3884
3885         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3886                PFID(ll_inode2fid(inode)), inode);
3887         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3888
3889 #ifdef HAVE_FILE_FSYNC_4ARGS
3890         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3891         lock_inode = !lli->lli_inode_locked;
3892         if (lock_inode)
3893                 inode_lock(inode);
3894 #else
3895         /* fsync's caller has already called _fdata{sync,write}, we want
3896          * that IO to finish before calling the osc and mdc sync methods */
3897         rc = filemap_fdatawait(inode->i_mapping);
3898 #endif
3899
3900         /* catch async errors that were recorded back when async writeback
3901          * failed for pages in this mapping. */
3902         if (!S_ISDIR(inode->i_mode)) {
3903                 err = lli->lli_async_rc;
3904                 lli->lli_async_rc = 0;
3905                 if (rc == 0)
3906                         rc = err;
3907                 if (lli->lli_clob != NULL) {
3908                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3909                         if (rc == 0)
3910                                 rc = err;
3911                 }
3912         }
3913
3914         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3915         if (!rc)
3916                 rc = err;
3917         if (!err)
3918                 ptlrpc_req_finished(req);
3919
3920         if (S_ISREG(inode->i_mode)) {
3921                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3922
3923                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3924                 if (rc == 0 && err < 0)
3925                         rc = err;
3926                 if (rc < 0)
3927                         fd->fd_write_failed = true;
3928                 else
3929                         fd->fd_write_failed = false;
3930         }
3931
3932 #ifdef HAVE_FILE_FSYNC_4ARGS
3933         if (lock_inode)
3934                 inode_unlock(inode);
3935 #endif
3936         RETURN(rc);
3937 }
3938
3939 static int
3940 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3941 {
3942         struct inode *inode = file_inode(file);
3943         struct ll_sb_info *sbi = ll_i2sbi(inode);
3944         struct ldlm_enqueue_info einfo = {
3945                 .ei_type        = LDLM_FLOCK,
3946                 .ei_cb_cp       = ldlm_flock_completion_ast,
3947                 .ei_cbdata      = file_lock,
3948         };
3949         struct md_op_data *op_data;
3950         struct lustre_handle lockh = { 0 };
3951         union ldlm_policy_data flock = { { 0 } };
3952         int fl_type = file_lock->fl_type;
3953         __u64 flags = 0;
3954         int rc;
3955         int rc2 = 0;
3956         ENTRY;
3957
3958         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3959                PFID(ll_inode2fid(inode)), file_lock);
3960
3961         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3962
3963         if (file_lock->fl_flags & FL_FLOCK) {
3964                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3965                 /* flocks are whole-file locks */
3966                 flock.l_flock.end = OFFSET_MAX;
3967                 /* For flocks owner is determined by the local file desctiptor*/
3968                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3969         } else if (file_lock->fl_flags & FL_POSIX) {
3970                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3971                 flock.l_flock.start = file_lock->fl_start;
3972                 flock.l_flock.end = file_lock->fl_end;
3973         } else {
3974                 RETURN(-EINVAL);
3975         }
3976         flock.l_flock.pid = file_lock->fl_pid;
3977
3978         /* Somewhat ugly workaround for svc lockd.
3979          * lockd installs custom fl_lmops->lm_compare_owner that checks
3980          * for the fl_owner to be the same (which it always is on local node
3981          * I guess between lockd processes) and then compares pid.
3982          * As such we assign pid to the owner field to make it all work,
3983          * conflict with normal locks is unlikely since pid space and
3984          * pointer space for current->files are not intersecting */
3985         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3986                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3987
3988         switch (fl_type) {
3989         case F_RDLCK:
3990                 einfo.ei_mode = LCK_PR;
3991                 break;
3992         case F_UNLCK:
3993                 /* An unlock request may or may not have any relation to
3994                  * existing locks so we may not be able to pass a lock handle
3995                  * via a normal ldlm_lock_cancel() request. The request may even
3996                  * unlock a byte range in the middle of an existing lock. In
3997                  * order to process an unlock request we need all of the same
3998                  * information that is given with a normal read or write record
3999                  * lock request. To avoid creating another ldlm unlock (cancel)
4000                  * message we'll treat a LCK_NL flock request as an unlock. */
4001                 einfo.ei_mode = LCK_NL;
4002                 break;
4003         case F_WRLCK:
4004                 einfo.ei_mode = LCK_PW;
4005                 break;
4006         default:
4007                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4008                 RETURN (-ENOTSUPP);
4009         }
4010
4011         switch (cmd) {
4012         case F_SETLKW:
4013 #ifdef F_SETLKW64
4014         case F_SETLKW64:
4015 #endif
4016                 flags = 0;
4017                 break;
4018         case F_SETLK:
4019 #ifdef F_SETLK64
4020         case F_SETLK64:
4021 #endif
4022                 flags = LDLM_FL_BLOCK_NOWAIT;
4023                 break;
4024         case F_GETLK:
4025 #ifdef F_GETLK64
4026         case F_GETLK64:
4027 #endif
4028                 flags = LDLM_FL_TEST_LOCK;
4029                 break;
4030         default:
4031                 CERROR("unknown fcntl lock command: %d\n", cmd);
4032                 RETURN (-EINVAL);
4033         }
4034
4035         /* Save the old mode so that if the mode in the lock changes we
4036          * can decrement the appropriate reader or writer refcount. */
4037         file_lock->fl_type = einfo.ei_mode;
4038
4039         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4040                                      LUSTRE_OPC_ANY, NULL);
4041         if (IS_ERR(op_data))
4042                 RETURN(PTR_ERR(op_data));
4043
4044         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4045                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4046                flock.l_flock.pid, flags, einfo.ei_mode,
4047                flock.l_flock.start, flock.l_flock.end);
4048
4049         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4050                         flags);
4051
4052         /* Restore the file lock type if not TEST lock. */
4053         if (!(flags & LDLM_FL_TEST_LOCK))
4054                 file_lock->fl_type = fl_type;
4055
4056 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4057         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4058             !(flags & LDLM_FL_TEST_LOCK))
4059                 rc2  = locks_lock_file_wait(file, file_lock);
4060 #else
4061         if ((file_lock->fl_flags & FL_FLOCK) &&
4062             (rc == 0 || file_lock->fl_type == F_UNLCK))
4063                 rc2  = flock_lock_file_wait(file, file_lock);
4064         if ((file_lock->fl_flags & FL_POSIX) &&
4065             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4066             !(flags & LDLM_FL_TEST_LOCK))
4067                 rc2  = posix_lock_file_wait(file, file_lock);
4068 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4069
4070         if (rc2 && file_lock->fl_type != F_UNLCK) {
4071                 einfo.ei_mode = LCK_NL;
4072                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4073                            &lockh, flags);
4074                 rc = rc2;
4075         }
4076
4077         ll_finish_md_op_data(op_data);
4078
4079         RETURN(rc);
4080 }
4081
4082 int ll_get_fid_by_name(struct inode *parent, const char *name,
4083                        int namelen, struct lu_fid *fid,
4084                        struct inode **inode)
4085 {
4086         struct md_op_data       *op_data = NULL;
4087         struct mdt_body         *body;
4088         struct ptlrpc_request   *req;
4089         int                     rc;
4090         ENTRY;
4091
4092         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4093                                      LUSTRE_OPC_ANY, NULL);
4094         if (IS_ERR(op_data))
4095                 RETURN(PTR_ERR(op_data));
4096
4097         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4098         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4099         ll_finish_md_op_data(op_data);
4100         if (rc < 0)
4101                 RETURN(rc);
4102
4103         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4104         if (body == NULL)
4105                 GOTO(out_req, rc = -EFAULT);
4106         if (fid != NULL)
4107                 *fid = body->mbo_fid1;
4108
4109         if (inode != NULL)
4110                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4111 out_req:
4112         ptlrpc_req_finished(req);
4113         RETURN(rc);
4114 }
4115
/**
 * Migrate the entry \a name under \a parent to MDT \a mdtidx.
 *
 * The child inode is taken from the dcache when possible, otherwise it is
 * fetched by name from the MDS. The migration itself is issued as an
 * md_rename() of \a name onto itself with CLI_MIGRATE set. For regular files
 * a write lease is opened first and the data version is passed along with
 * MDS_RENAME_MIGRATE.
 *
 * \param[in] parent    directory inode containing the entry
 * \param[in] file      open file whose dentry is used for the dcache lookup
 * \param[in] mdtidx    index of the target MDT
 * \param[in] name      name of the entry to migrate
 * \param[in] namelen   length of \a name
 *
 * \retval 0 on success, or if the entry already lives on \a mdtidx
 * \retval -EINVAL if no child inode is found, the child is the filesystem
 *         root, or its FID is not sane
 * \retval other negative errno from the helpers called
 */
int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
               const char *name, int namelen)
{
        struct dentry         *dchild = NULL;
        struct inode          *child_inode = NULL;
        struct md_op_data     *op_data;
        struct ptlrpc_request *request = NULL;
        struct obd_client_handle *och = NULL;
        struct qstr           qstr;
        struct mdt_body         *body;
        int                    rc;
        __u64                   data_version = 0;
        ENTRY;

        CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
               name, PFID(ll_inode2fid(parent)), mdtidx);

        op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
                                     0, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        /* Get child FID first */
        qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
        qstr.name = name;
        qstr.len = namelen;
        dchild = d_lookup(file_dentry(file), &qstr);
        if (dchild != NULL) {
                /* Pin the inode before the dentry reference is dropped. */
                if (dchild->d_inode != NULL)
                        child_inode = igrab(dchild->d_inode);
                dput(dchild);
        }

        /* Nothing usable in the dcache: ask the MDS by name instead. */
        if (child_inode == NULL) {
                rc = ll_get_fid_by_name(parent, name, namelen,
                                        &op_data->op_fid3, &child_inode);
                if (rc != 0)
                        GOTO(out_free, rc);
        }

        if (child_inode == NULL)
                GOTO(out_free, rc = -EINVAL);

        /*
         * lfs migrate command needs to be blocked on the client
         * by checking the migrate FID against the FID of the
         * filesystem root.
         */
        if (child_inode == parent->i_sb->s_root->d_inode)
                GOTO(out_iput, rc = -EINVAL);

        /* The child stays locked until out_unlock. */
        inode_lock(child_inode);
        op_data->op_fid3 = *ll_inode2fid(child_inode);
        if (!fid_is_sane(&op_data->op_fid3)) {
                CERROR("%s: migrate %s, but FID "DFID" is insane\n",
                       ll_get_fsname(parent->i_sb, NULL, 0), name,
                       PFID(&op_data->op_fid3));
                GOTO(out_unlock, rc = -EINVAL);
        }

        /* A non-negative result is the index of the MDT holding the FID. */
        rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
        if (rc < 0)
                GOTO(out_unlock, rc);

        if (rc == mdtidx) {
                CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
                       PFID(&op_data->op_fid3), mdtidx);
                GOTO(out_unlock, rc = 0);
        }
again:
        /* NOTE(review): when reached through "goto again" below, och may
         * still be set from the previous pass and is overwritten here
         * without being closed; confirm whether -EAGAIN implies the handle
         * was already released. */
        if (S_ISREG(child_inode->i_mode)) {
                och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
                if (IS_ERR(och)) {
                        rc = PTR_ERR(och);
                        /* Reset so the cleanup path does not see an
                         * error pointer. */
                        och = NULL;
                        GOTO(out_unlock, rc);
                }

                rc = ll_data_version(child_inode, &data_version,
                                     LL_DV_WR_FLUSH);
                if (rc != 0)
                        GOTO(out_close, rc);

                /* Hand the lease and the data version to the request. */
                op_data->op_handle = och->och_fh;
                op_data->op_data = och->och_mod;
                op_data->op_data_version = data_version;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_bias |= MDS_RENAME_MIGRATE;
        }

        op_data->op_mds = mdtidx;
        op_data->op_cli_flags = CLI_MIGRATE;
        /* Source and target names are the same: only the MDT changes. */
        rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
                       namelen, name, namelen, &request);
        if (rc == 0) {
                LASSERT(request != NULL);
                ll_update_times(request, parent);

                body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
                LASSERT(body != NULL);

                /* If the server does release layout lock, then we cleanup
                 * the client och here, otherwise release it in out_close: */
                if (och != NULL &&
                    body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
                        obd_mod_put(och->och_mod);
                        md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
                                                  och);
                        och->och_fh.cookie = DEAD_HANDLE_MAGIC;
                        OBD_FREE_PTR(och);
                        och = NULL;
                }
        }

        /* Drop the reply now so that a retry starts with request == NULL. */
        if (request != NULL) {
                ptlrpc_req_finished(request);
                request = NULL;
        }

        /* Try again if the file layout has changed. */
        if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
                goto again;

out_close:
        if (och != NULL) /* close the file */
                ll_lease_close(och, child_inode, NULL);
        /* After a successful migration the local inode's link count is
         * cleared. */
        if (rc == 0)
                clear_nlink(child_inode);
out_unlock:
        inode_unlock(child_inode);
out_iput:
        iput(child_inode);
out_free:
        ll_finish_md_op_data(op_data);
        RETURN(rc);
}
4252
4253 static int
4254 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4255 {
4256         ENTRY;
4257
4258         RETURN(-ENOSYS);
4259 }
4260
4261 /**
4262  * test if some locks matching bits and l_req_mode are acquired
4263  * - bits can be in different locks
4264  * - if found clear the common lock bits in *bits
4265  * - the bits not found, are kept in *bits
4266  * \param inode [IN]
4267  * \param bits [IN] searched lock bits [IN]
4268  * \param l_req_mode [IN] searched lock mode
4269  * \retval boolean, true iff all bits are found
4270  */
4271 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4272 {
4273         struct lustre_handle lockh;
4274         union ldlm_policy_data policy;
4275         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4276                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4277         struct lu_fid *fid;
4278         __u64 flags;
4279         int i;
4280         ENTRY;
4281
4282         if (!inode)
4283                RETURN(0);
4284
4285         fid = &ll_i2info(inode)->lli_fid;
4286         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4287                ldlm_lockname[mode]);
4288
4289         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4290         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4291                 policy.l_inodebits.bits = *bits & (1 << i);
4292                 if (policy.l_inodebits.bits == 0)
4293                         continue;
4294
4295                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4296                                   &policy, mode, &lockh)) {
4297                         struct ldlm_lock *lock;
4298
4299                         lock = ldlm_handle2lock(&lockh);
4300                         if (lock) {
4301                                 *bits &=
4302                                       ~(lock->l_policy_data.l_inodebits.bits);
4303                                 LDLM_LOCK_PUT(lock);
4304                         } else {
4305                                 *bits &= ~policy.l_inodebits.bits;
4306                         }
4307                 }
4308         }
4309         RETURN(*bits == 0);
4310 }
4311
4312 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4313                                struct lustre_handle *lockh, __u64 flags,
4314                                enum ldlm_mode mode)
4315 {
4316         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4317         struct lu_fid *fid;
4318         enum ldlm_mode rc;
4319         ENTRY;
4320
4321         fid = &ll_i2info(inode)->lli_fid;
4322         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4323
4324         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4325                            fid, LDLM_IBITS, &policy, mode, lockh);
4326
4327         RETURN(rc);
4328 }
4329
4330 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4331 {
4332         /* Already unlinked. Just update nlink and return success */
4333         if (rc == -ENOENT) {
4334                 clear_nlink(inode);
4335                 /* If it is striped directory, and there is bad stripe
4336                  * Let's revalidate the dentry again, instead of returning
4337                  * error */
4338                 if (S_ISDIR(inode->i_mode) &&
4339                     ll_i2info(inode)->lli_lsm_md != NULL)
4340                         return 0;
4341
4342                 /* This path cannot be hit for regular files unless in
4343                  * case of obscure races, so no need to to validate
4344                  * size. */
4345                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4346                         return 0;
4347         } else if (rc != 0) {
4348                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4349                              "%s: revalidate FID "DFID" error: rc = %d\n",
4350                              ll_get_fsname(inode->i_sb, NULL, 0),
4351                              PFID(ll_inode2fid(inode)), rc);
4352         }
4353
4354         return rc;
4355 }
4356
4357 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4358 {
4359         struct inode *inode = dentry->d_inode;
4360         struct obd_export *exp = ll_i2mdexp(inode);
4361         struct lookup_intent oit = {
4362                 .it_op = op,
4363         };
4364         struct ptlrpc_request *req = NULL;
4365         struct md_op_data *op_data;
4366         int rc = 0;
4367         ENTRY;
4368
4369         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4370                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4371
4372         /* Call getattr by fid, so do not provide name at all. */
4373         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4374                                      LUSTRE_OPC_ANY, NULL);
4375         if (IS_ERR(op_data))
4376                 RETURN(PTR_ERR(op_data));
4377
4378         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4379         ll_finish_md_op_data(op_data);
4380         if (rc < 0) {
4381                 rc = ll_inode_revalidate_fini(inode, rc);
4382                 GOTO(out, rc);
4383         }
4384
4385         rc = ll_revalidate_it_finish(req, &oit, dentry);
4386         if (rc != 0) {
4387                 ll_intent_release(&oit);
4388                 GOTO(out, rc);
4389         }
4390
4391         /* Unlinked? Unhash dentry, so it is not picked up later by
4392          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4393          * here to preserve get_cwd functionality on 2.6.
4394          * Bug 10503 */
4395         if (!dentry->d_inode->i_nlink) {
4396                 ll_lock_dcache(inode);
4397                 d_lustre_invalidate(dentry, 0);
4398                 ll_unlock_dcache(inode);
4399         }
4400
4401         ll_lookup_finish_locks(&oit, dentry);
4402 out:
4403         ptlrpc_req_finished(req);
4404
4405         return rc;
4406 }
4407
4408 static int ll_merge_md_attr(struct inode *inode)
4409 {
4410         struct cl_attr attr = { 0 };
4411         int rc;
4412
4413         LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4414         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4415                            &attr, ll_md_blocking_ast);
4416         if (rc != 0)
4417                 RETURN(rc);
4418
4419         set_nlink(inode, attr.cat_nlink);
4420         inode->i_blocks = attr.cat_blocks;
4421         i_size_write(inode, attr.cat_size);
4422
4423         ll_i2info(inode)->lli_atime = attr.cat_atime;
4424         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4425         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4426
4427         RETURN(0);
4428 }
4429
4430 static inline dev_t ll_compat_encode_dev(dev_t dev)
4431 {
4432         /* The compat_sys_*stat*() syscalls will fail unless the
4433          * device majors and minors are both less than 256. Note that
4434          * the value returned here will be passed through
4435          * old_encode_dev() in cp_compat_stat(). And so we are not
4436          * trying to return a valid compat (u16) device number, just
4437          * one that will pass the old_valid_dev() check. */
4438
4439         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4440 }
4441
4442 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4443 int ll_getattr(const struct path *path, struct kstat *stat,
4444                u32 request_mask, unsigned int flags)
4445 {
4446         struct dentry *de = path->dentry;
4447 #else
4448 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4449 {
4450 #endif
4451         struct inode *inode = de->d_inode;
4452         struct ll_sb_info *sbi = ll_i2sbi(inode);
4453         struct ll_inode_info *lli = ll_i2info(inode);
4454         int rc;
4455
4456         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4457
4458         rc = ll_inode_revalidate(de, IT_GETATTR);
4459         if (rc < 0)
4460                 RETURN(rc);
4461
4462         if (S_ISREG(inode->i_mode)) {
4463                 /* In case of restore, the MDT has the right size and has
4464                  * already send it back without granting the layout lock,
4465                  * inode is up-to-date so glimpse is useless.
4466                  * Also to glimpse we need the layout, in case of a running
4467                  * restore the MDT holds the layout lock so the glimpse will
4468                  * block up to the end of restore (getattr will block)
4469                  */
4470                 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4471                         rc = ll_glimpse_size(inode);
4472                         if (rc < 0)
4473                                 RETURN(rc);
4474                 }
4475         } else {
4476                 /* If object isn't regular a file then don't validate size. */
4477                 if (S_ISDIR(inode->i_mode) &&
4478                     lli->lli_lsm_md != NULL) {
4479                         rc = ll_merge_md_attr(inode);
4480                         if (rc < 0)
4481                                 RETURN(rc);
4482                 }
4483
4484                 LTIME_S(inode->i_atime) = lli->lli_atime;
4485                 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4486                 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4487         }
4488
4489         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4490
4491         if (ll_need_32bit_api(sbi)) {
4492                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4493                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4494                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4495         } else {
4496                 stat->ino = inode->i_ino;
4497                 stat->dev = inode->i_sb->s_dev;
4498                 stat->rdev = inode->i_rdev;
4499         }
4500
4501         stat->mode = inode->i_mode;
4502         stat->uid = inode->i_uid;
4503         stat->gid = inode->i_gid;
4504         stat->atime = inode->i_atime;
4505         stat->mtime = inode->i_mtime;
4506         stat->ctime = inode->i_ctime;
4507         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4508
4509         stat->nlink = inode->i_nlink;
4510         stat->size = i_size_read(inode);
4511         stat->blocks = inode->i_blocks;
4512
4513         return 0;
4514 }
4515
4516 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4517                      __u64 start, __u64 len)
4518 {
4519         int             rc;
4520         size_t          num_bytes;
4521         struct fiemap   *fiemap;
4522         unsigned int    extent_count = fieinfo->fi_extents_max;
4523
4524         num_bytes = sizeof(*fiemap) + (extent_count *
4525                                        sizeof(struct fiemap_extent));
4526         OBD_ALLOC_LARGE(fiemap, num_bytes);
4527
4528         if (fiemap == NULL)
4529                 RETURN(-ENOMEM);
4530
4531         fiemap->fm_flags = fieinfo->fi_flags;
4532         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4533         fiemap->fm_start = start;
4534         fiemap->fm_length = len;
4535         if (extent_count > 0 &&
4536             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4537                            sizeof(struct fiemap_extent)) != 0)
4538                 GOTO(out, rc = -EFAULT);
4539
4540         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4541
4542         fieinfo->fi_flags = fiemap->fm_flags;
4543         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4544         if (extent_count > 0 &&
4545             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4546                          fiemap->fm_mapped_extents *
4547                          sizeof(struct fiemap_extent)) != 0)
4548                 GOTO(out, rc = -EFAULT);
4549 out:
4550         OBD_FREE_LARGE(fiemap, num_bytes);
4551         return rc;
4552 }
4553
4554 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4555 {
4556         struct ll_inode_info *lli = ll_i2info(inode);
4557         struct posix_acl *acl = NULL;
4558         ENTRY;
4559
4560         spin_lock(&lli->lli_lock);
4561         /* VFS' acl_permission_check->check_acl will release the refcount */
4562         acl = posix_acl_dup(lli->lli_posix_acl);
4563         spin_unlock(&lli->lli_lock);
4564
4565         RETURN(acl);
4566 }
4567
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
/**
 * set_acl inode operation: store or remove a POSIX ACL through a setxattr
 * request and update the cached ACL to match the outcome.
 *
 * \param[in] inode  inode whose ACL is changed
 * \param[in] acl    new ACL, or NULL to remove the xattr
 * \param[in] type   ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 *
 * \retval 0 on success
 * \retval -EINVAL for any other \a type
 * \retval -EACCES when a default ACL is set on a non-directory
 * \retval -ENOMEM if the xattr buffer cannot be allocated
 * \retval other negative errno from the helpers called
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ptlrpc_request *request = NULL;
        const char *xattr_name = NULL;
        char *xattr_buf = NULL;
        size_t xattr_len = 0;
        int rc = 0;
        ENTRY;

        if (type == ACL_TYPE_ACCESS) {
                xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
                if (acl)
                        rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
        } else if (type == ACL_TYPE_DEFAULT) {
                xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
                /* Only directories take a default ACL; clearing one on
                 * another file type is accepted. */
                if (!S_ISDIR(inode->i_mode))
                        rc = acl ? -EACCES : 0;
        } else {
                rc = -EINVAL;
        }
        if (rc)
                return rc;

        if (acl) {
                /* Serialize the ACL into its xattr representation. */
                xattr_len = posix_acl_xattr_size(acl->a_count);
                xattr_buf = kmalloc(xattr_len, GFP_NOFS);
                if (xattr_buf == NULL)
                        GOTO(out, rc = -ENOMEM);

                rc = posix_acl_to_xattr(&init_user_ns, acl, xattr_buf,
                                        xattr_len);
                if (rc < 0)
                        GOTO(out_value, rc);
        }

        /* Without a buffer the request removes the xattr instead. */
        rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
                         xattr_buf ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
                         xattr_name, xattr_buf, xattr_len, 0, 0, &request);

        ptlrpc_req_finished(request);
out_value:
        kfree(xattr_buf);
out:
        /* Keep the cached ACL in step with the result. */
        if (rc)
                forget_cached_acl(inode, type);
        else
                set_cached_acl(inode, type, acl);
        RETURN(rc);
}
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4627
4628 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4629 static int
4630 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4631 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4632 # else
4633 ll_check_acl(struct inode *inode, int mask)
4634 # endif
4635 {
4636 # ifdef CONFIG_FS_POSIX_ACL
4637         struct posix_acl *acl;
4638         int rc;
4639         ENTRY;
4640
4641 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4642         if (flags & IPERM_FLAG_RCU)
4643                 return -ECHILD;
4644 #  endif
4645         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4646
4647         if (!acl)
4648                 RETURN(-EAGAIN);
4649
4650         rc = posix_acl_permission(inode, acl, mask);
4651         posix_acl_release(acl);
4652
4653         RETURN(rc);
4654 # else /* !CONFIG_FS_POSIX_ACL */
4655         return -EAGAIN;
4656 # endif /* CONFIG_FS_POSIX_ACL */
4657 }
4658 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4659
4660 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4661 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4662 #else
4663 # ifdef HAVE_INODE_PERMISION_2ARGS
4664 int ll_inode_permission(struct inode *inode, int mask)
4665 # else
4666 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4667 # endif
4668 #endif
4669 {
4670         int rc = 0;
4671         struct ll_sb_info *sbi;
4672         struct root_squash_info *squash;
4673         struct cred *cred = NULL;
4674         const struct cred *old_cred = NULL;
4675         cfs_cap_t cap;
4676         bool squash_id = false;
4677         ENTRY;
4678
4679 #ifdef MAY_NOT_BLOCK
4680         if (mask & MAY_NOT_BLOCK)
4681                 return -ECHILD;
4682 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4683         if (flags & IPERM_FLAG_RCU)
4684                 return -ECHILD;
4685 #endif
4686
4687        /* as root inode are NOT getting validated in lookup operation,
4688         * need to do it before permission check. */
4689
4690         if (inode == inode->i_sb->s_root->d_inode) {
4691                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4692                 if (rc)
4693                         RETURN(rc);
4694         }
4695
4696         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4697                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4698
4699         /* squash fsuid/fsgid if needed */
4700         sbi = ll_i2sbi(inode);
4701         squash = &sbi->ll_squash;
4702         if (unlikely(squash->rsi_uid != 0 &&
4703                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4704                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4705                         squash_id = true;
4706         }
4707         if (squash_id) {
4708                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4709                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4710                        squash->rsi_uid, squash->rsi_gid);
4711
4712                 /* update current process's credentials
4713                  * and FS capability */
4714                 cred = prepare_creds();
4715                 if (cred == NULL)
4716                         RETURN(-ENOMEM);
4717
4718                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4719                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4720                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4721                         if ((1 << cap) & CFS_CAP_FS_MASK)
4722                                 cap_lower(cred->cap_effective, cap);
4723                 }
4724                 old_cred = override_creds(cred);
4725         }
4726
4727         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4728         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4729         /* restore current process's credentials and FS capability */
4730         if (squash_id) {
4731                 revert_creds(old_cred);
4732                 put_cred(cred);
4733         }
4734
4735         RETURN(rc);
4736 }
4737
4738 /* -o localflock - only provides locally consistent flock locks */
4739 struct file_operations ll_file_operations = {
4740 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4741 # ifdef HAVE_SYNC_READ_WRITE
4742         .read           = new_sync_read,
4743         .write          = new_sync_write,
4744 # endif
4745         .read_iter      = ll_file_read_iter,
4746         .write_iter     = ll_file_write_iter,
4747 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4748         .read           = ll_file_read,
4749         .aio_read       = ll_file_aio_read,
4750         .write          = ll_file_write,
4751         .aio_write      = ll_file_aio_write,
4752 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4753         .unlocked_ioctl = ll_file_ioctl,
4754         .open           = ll_file_open,
4755         .release        = ll_file_release,
4756         .mmap           = ll_file_mmap,
4757         .llseek         = ll_file_seek,
4758         .splice_read    = ll_file_splice_read,
4759         .fsync          = ll_fsync,
4760         .flush          = ll_flush
4761 };
4762
4763 struct file_operations ll_file_operations_flock = {
4764 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4765 # ifdef HAVE_SYNC_READ_WRITE
4766         .read           = new_sync_read,
4767         .write          = new_sync_write,
4768 # endif /* HAVE_SYNC_READ_WRITE */
4769         .read_iter      = ll_file_read_iter,
4770         .write_iter     = ll_file_write_iter,
4771 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4772         .read           = ll_file_read,
4773         .aio_read       = ll_file_aio_read,
4774         .write          = ll_file_write,
4775         .aio_write      = ll_file_aio_write,
4776 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4777         .unlocked_ioctl = ll_file_ioctl,
4778         .open           = ll_file_open,
4779         .release        = ll_file_release,
4780         .mmap           = ll_file_mmap,
4781         .llseek         = ll_file_seek,
4782         .splice_read    = ll_file_splice_read,
4783         .fsync          = ll_fsync,
4784         .flush          = ll_flush,
4785         .flock          = ll_file_flock,
4786         .lock           = ll_file_flock
4787 };
4788
4789 /* These are for -o noflock - to return ENOSYS on flock calls */
4790 struct file_operations ll_file_operations_noflock = {
4791 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4792 # ifdef HAVE_SYNC_READ_WRITE
4793         .read           = new_sync_read,
4794         .write          = new_sync_write,
4795 # endif /* HAVE_SYNC_READ_WRITE */
4796         .read_iter      = ll_file_read_iter,
4797         .write_iter     = ll_file_write_iter,
4798 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4799         .read           = ll_file_read,
4800         .aio_read       = ll_file_aio_read,
4801         .write          = ll_file_write,
4802         .aio_write      = ll_file_aio_write,
4803 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4804         .unlocked_ioctl = ll_file_ioctl,
4805         .open           = ll_file_open,
4806         .release        = ll_file_release,
4807         .mmap           = ll_file_mmap,
4808         .llseek         = ll_file_seek,
4809         .splice_read    = ll_file_splice_read,
4810         .fsync          = ll_fsync,
4811         .flush          = ll_flush,
4812         .flock          = ll_file_noflock,
4813         .lock           = ll_file_noflock
4814 };
4815
4816 struct inode_operations ll_file_inode_operations = {
4817         .setattr        = ll_setattr,
4818         .getattr        = ll_getattr,
4819         .permission     = ll_inode_permission,
4820 #ifdef HAVE_IOP_XATTR
4821         .setxattr       = ll_setxattr,
4822         .getxattr       = ll_getxattr,
4823         .removexattr    = ll_removexattr,
4824 #endif
4825         .listxattr      = ll_listxattr,
4826         .fiemap         = ll_fiemap,
4827 #ifdef HAVE_IOP_GET_ACL
4828         .get_acl        = ll_get_acl,
4829 #endif
4830 #ifdef HAVE_IOP_SET_ACL
4831         .set_acl        = ll_set_acl,
4832 #endif
4833 };
4834
4835 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4836 {
4837         struct ll_inode_info *lli = ll_i2info(inode);
4838         struct cl_object *obj = lli->lli_clob;
4839         struct lu_env *env;
4840         int rc;
4841         __u16 refcheck;
4842         ENTRY;
4843
4844         if (obj == NULL)
4845                 RETURN(0);
4846
4847         env = cl_env_get(&refcheck);
4848         if (IS_ERR(env))
4849                 RETURN(PTR_ERR(env));
4850
4851         rc = cl_conf_set(env, lli->lli_clob, conf);
4852         if (rc < 0)
4853                 GOTO(out, rc);
4854
4855         if (conf->coc_opc == OBJECT_CONF_SET) {
4856                 struct ldlm_lock *lock = conf->coc_lock;
4857                 struct cl_layout cl = {
4858                         .cl_layout_gen = 0,
4859                 };
4860
4861                 LASSERT(lock != NULL);
4862                 LASSERT(ldlm_has_layout(lock));
4863
4864                 /* it can only be allowed to match after layout is
4865                  * applied to inode otherwise false layout would be
4866                  * seen. Applying layout shoud happen before dropping
4867                  * the intent lock. */
4868                 ldlm_lock_allow_match(lock);
4869
4870                 rc = cl_object_layout_get(env, obj, &cl);
4871                 if (rc < 0)
4872                         GOTO(out, rc);
4873
4874                 CDEBUG(D_VFSTRACE,
4875                        DFID": layout version change: %u -> %u\n",
4876                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4877                        cl.cl_layout_gen);
4878                 ll_layout_version_set(lli, cl.cl_layout_gen);
4879         }
4880
4881 out:
4882         cl_env_put(env, &refcheck);
4883
4884         RETURN(rc);
4885 }
4886
4887 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4888 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4889
4890 {
4891         struct ll_sb_info *sbi = ll_i2sbi(inode);
4892         struct ptlrpc_request *req;
4893         struct mdt_body *body;
4894         void *lvbdata;
4895         void *lmm;
4896         int lmmsize;
4897         int rc;
4898         ENTRY;
4899
4900         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4901                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4902                lock->l_lvb_data, lock->l_lvb_len);
4903
4904         if (lock->l_lvb_data != NULL)
4905                 RETURN(0);
4906
4907         /* if layout lock was granted right away, the layout is returned
4908          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4909          * blocked and then granted via completion ast, we have to fetch
4910          * layout here. Please note that we can't use the LVB buffer in
4911          * completion AST because it doesn't have a large enough buffer */
4912         rc = ll_get_default_mdsize(sbi, &lmmsize);
4913         if (rc == 0)
4914                 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4915                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
4916         if (rc < 0)
4917                 RETURN(rc);
4918
4919         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4920         if (body == NULL)
4921                 GOTO(out, rc = -EPROTO);
4922
4923         lmmsize = body->mbo_eadatasize;
4924         if (lmmsize == 0) /* empty layout */
4925                 GOTO(out, rc = 0);
4926
4927         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4928         if (lmm == NULL)
4929                 GOTO(out, rc = -EFAULT);
4930
4931         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4932         if (lvbdata == NULL)
4933                 GOTO(out, rc = -ENOMEM);
4934
4935         memcpy(lvbdata, lmm, lmmsize);
4936         lock_res_and_lock(lock);
4937         if (unlikely(lock->l_lvb_data == NULL)) {
4938                 lock->l_lvb_type = LVB_T_LAYOUT;
4939                 lock->l_lvb_data = lvbdata;
4940                 lock->l_lvb_len = lmmsize;
4941                 lvbdata = NULL;
4942         }
4943         unlock_res_and_lock(lock);
4944
4945         if (lvbdata)
4946                 OBD_FREE_LARGE(lvbdata, lmmsize);
4947
4948         EXIT;
4949
4950 out:
4951         ptlrpc_req_finished(req);
4952         return rc;
4953 }
4954
4955 /**
4956  * Apply the layout to the inode. Layout lock is held and will be released
4957  * in this function.
4958  */
4959 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4960                               struct inode *inode)
4961 {
4962         struct ll_inode_info *lli = ll_i2info(inode);
4963         struct ll_sb_info    *sbi = ll_i2sbi(inode);
4964         struct ldlm_lock *lock;
4965         struct cl_object_conf conf;
4966         int rc = 0;
4967         bool lvb_ready;
4968         bool wait_layout = false;
4969         ENTRY;
4970
4971         LASSERT(lustre_handle_is_used(lockh));
4972
4973         lock = ldlm_handle2lock(lockh);
4974         LASSERT(lock != NULL);
4975         LASSERT(ldlm_has_layout(lock));
4976
4977         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4978                    PFID(&lli->lli_fid), inode);
4979
4980         /* in case this is a caching lock and reinstate with new inode */
4981         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4982
4983         lock_res_and_lock(lock);
4984         lvb_ready = ldlm_is_lvb_ready(lock);
4985         unlock_res_and_lock(lock);
4986
4987         /* checking lvb_ready is racy but this is okay. The worst case is
4988          * that multi processes may configure the file on the same time. */
4989         if (lvb_ready)
4990                 GOTO(out, rc = 0);
4991
4992         rc = ll_layout_fetch(inode, lock);
4993         if (rc < 0)
4994                 GOTO(out, rc);
4995
4996         /* for layout lock, lmm is stored in lock's lvb.
4997          * lvb_data is immutable if the lock is held so it's safe to access it
4998          * without res lock.
4999          *
5000          * set layout to file. Unlikely this will fail as old layout was
5001          * surely eliminated */
5002         memset(&conf, 0, sizeof conf);
5003         conf.coc_opc = OBJECT_CONF_SET;
5004         conf.coc_inode = inode;
5005         conf.coc_lock = lock;
5006         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5007         conf.u.coc_layout.lb_len = lock->l_lvb_len;
5008         rc = ll_layout_conf(inode, &conf);
5009
5010         /* refresh layout failed, need to wait */
5011         wait_layout = rc == -EBUSY;
5012         EXIT;
5013 out:
5014         LDLM_LOCK_PUT(lock);
5015         ldlm_lock_decref(lockh, mode);
5016
5017         /* wait for IO to complete if it's still being used. */
5018         if (wait_layout) {
5019                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5020                        ll_get_fsname(inode->i_sb, NULL, 0),
5021                        PFID(&lli->lli_fid), inode);
5022
5023                 memset(&conf, 0, sizeof conf);
5024                 conf.coc_opc = OBJECT_CONF_WAIT;
5025                 conf.coc_inode = inode;
5026                 rc = ll_layout_conf(inode, &conf);
5027                 if (rc == 0)
5028                         rc = -EAGAIN;
5029
5030                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5031                        ll_get_fsname(inode->i_sb, NULL, 0),
5032                        PFID(&lli->lli_fid), rc);
5033         }
5034         RETURN(rc);
5035 }
5036
5037 /**
5038  * Issue layout intent RPC to MDS.
5039  * \param inode [in]    file inode
5040  * \param intent [in]   layout intent
5041  *
5042  * \retval 0    on success
5043  * \retval < 0  error code
5044  */
5045 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5046 {
5047         struct ll_inode_info  *lli = ll_i2info(inode);
5048         struct ll_sb_info     *sbi = ll_i2sbi(inode);
5049         struct md_op_data     *op_data;
5050         struct lookup_intent it;
5051         struct ptlrpc_request *req;
5052         int rc;
5053         ENTRY;
5054
5055         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5056                                      0, 0, LUSTRE_OPC_ANY, NULL);
5057         if (IS_ERR(op_data))
5058                 RETURN(PTR_ERR(op_data));
5059
5060         op_data->op_data = intent;
5061         op_data->op_data_size = sizeof(*intent);
5062
5063         memset(&it, 0, sizeof(it));
5064         it.it_op = IT_LAYOUT;
5065         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5066             intent->li_opc == LAYOUT_INTENT_TRUNC)
5067                 it.it_flags = FMODE_WRITE;
5068
5069         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5070                           ll_get_fsname(inode->i_sb, NULL, 0),
5071                           PFID(&lli->lli_fid), inode);
5072
5073         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5074                             &ll_md_blocking_ast, 0);
5075         if (it.it_request != NULL)
5076                 ptlrpc_req_finished(it.it_request);
5077         it.it_request = NULL;
5078
5079         ll_finish_md_op_data(op_data);
5080
5081         /* set lock data in case this is a new lock */
5082         if (!rc)
5083                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5084
5085         ll_intent_drop_lock(&it);
5086
5087         RETURN(rc);
5088 }
5089
5090 /**
5091  * This function checks if there exists a LAYOUT lock on the client side,
5092  * or enqueues it if it doesn't have one in cache.
5093  *
5094  * This function will not hold layout lock so it may be revoked any time after
5095  * this function returns. Any operations depend on layout should be redone
5096  * in that case.
5097  *
5098  * This function should be called before lov_io_init() to get an uptodate
5099  * layout version, the caller should save the version number and after IO
5100  * is finished, this function should be called again to verify that layout
5101  * is not changed during IO time.
5102  */
5103 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5104 {
5105         struct ll_inode_info    *lli = ll_i2info(inode);
5106         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5107         struct lustre_handle lockh;
5108         struct layout_intent intent = {
5109                 .li_opc = LAYOUT_INTENT_ACCESS,
5110         };
5111         enum ldlm_mode mode;
5112         int rc;
5113         ENTRY;
5114
5115         *gen = ll_layout_version_get(lli);
5116         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5117                 RETURN(0);
5118
5119         /* sanity checks */
5120         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5121         LASSERT(S_ISREG(inode->i_mode));
5122
5123         /* take layout lock mutex to enqueue layout lock exclusively. */
5124         mutex_lock(&lli->lli_layout_mutex);
5125
5126         while (1) {
5127                 /* mostly layout lock is caching on the local side, so try to
5128                  * match it before grabbing layout lock mutex. */
5129                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5130                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5131                 if (mode != 0) { /* hit cached lock */
5132                         rc = ll_layout_lock_set(&lockh, mode, inode);
5133                         if (rc == -EAGAIN)
5134                                 continue;
5135                         break;
5136                 }
5137
5138                 rc = ll_layout_intent(inode, &intent);
5139                 if (rc != 0)
5140                         break;
5141         }
5142
5143         if (rc == 0)
5144                 *gen = ll_layout_version_get(lli);
5145         mutex_unlock(&lli->lli_layout_mutex);
5146
5147         RETURN(rc);
5148 }
5149
5150 /**
5151  * Issue layout intent RPC indicating where in a file an IO is about to write.
5152  *
5153  * \param[in] inode     file inode.
5154  * \param[in] ext       write range with start offset of fille in bytes where
5155  *                      an IO is about to write, and exclusive end offset in
5156  *                      bytes.
5157  *
5158  * \retval 0    on success
5159  * \retval < 0  error code
5160  */
5161 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5162                            struct lu_extent *ext)
5163 {
5164         struct layout_intent intent = {
5165                 .li_opc = opc,
5166                 .li_extent.e_start = ext->e_start,
5167                 .li_extent.e_end = ext->e_end,
5168         };
5169         int rc;
5170         ENTRY;
5171
5172         rc = ll_layout_intent(inode, &intent);
5173
5174         RETURN(rc);
5175 }
5176
5177 /**
5178  *  This function send a restore request to the MDT
5179  */
5180 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5181 {
5182         struct hsm_user_request *hur;
5183         int                      len, rc;
5184         ENTRY;
5185
5186         len = sizeof(struct hsm_user_request) +
5187               sizeof(struct hsm_user_item);
5188         OBD_ALLOC(hur, len);
5189         if (hur == NULL)
5190                 RETURN(-ENOMEM);
5191
5192         hur->hur_request.hr_action = HUA_RESTORE;
5193         hur->hur_request.hr_archive_id = 0;
5194         hur->hur_request.hr_flags = 0;
5195         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5196                sizeof(hur->hur_user_item[0].hui_fid));
5197         hur->hur_user_item[0].hui_extent.offset = offset;
5198         hur->hur_user_item[0].hui_extent.length = length;
5199         hur->hur_request.hr_itemcount = 1;
5200         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5201                            len, hur, NULL);
5202         OBD_FREE(hur, len);
5203         RETURN(rc);
5204 }