4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include <linux/sched.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51 #include <lustre_ioctl.h>
53 #include "cl_object.h"
/*
 * Forward declarations for helpers defined later in this file.
 * NOTE(review): intermediate lines are elided in this view, so the
 * declarations below appear truncated (e.g. ll_put_grouplock's return
 * type and ll_lease_close's trailing parameters are not visible).
 */
56 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
58 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
61 static enum llioc_iter
62 ll_iocontrol_call(struct inode *inode, struct file *file,
63 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open ll_file_data from the ll_file_data_slab cache
 * (GFP_NOFS to avoid filesystem recursion during reclaim) and reset
 * its write-failure flag.
 * NOTE(review): the allocation-failure check and return statement fall
 * on lines elided from this view — confirm against the full source.
 */
65 static struct ll_file_data *ll_file_data_get(void)
67 struct ll_file_data *fd;
69 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
73 fd->fd_write_failed = false;
/* Return a ll_file_data to the slab cache it was allocated from. */
78 static void ll_file_data_put(struct ll_file_data *fd)
81 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes (fid, mode, a/m/ctime, size,
 * blocks, ext flags, ioepoch), the open handle @fh and the MDS capa
 * into @op_data for a subsequent MDS request.
 */
84 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
85 struct lustre_handle *fh)
87 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
88 op_data->op_attr.ia_mode = inode->i_mode;
89 op_data->op_attr.ia_atime = inode->i_atime;
90 op_data->op_attr.ia_mtime = inode->i_mtime;
91 op_data->op_attr.ia_ctime = inode->i_ctime;
92 op_data->op_attr.ia_size = i_size_read(inode);
93 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr wrapper, hence the cast. */
94 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
95 ll_inode_to_ext_flags(inode->i_flags);
96 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
98 op_data->op_handle = *fh;
99 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Tell the MDS the file data changed since open so it refreshes attrs. */
101 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
102 op_data->op_bias |= MDS_DATA_MODIFIED;
106 * Closes the IO epoch and packs all the attributes into @op_data for
/* the close RPC; size/blocks are only sent when SOM is disabled or the
 * file is not a regular file (otherwise MDS gets them via SOM update). */
109 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
110 struct obd_client_handle *och)
114 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
115 ATTR_MTIME | ATTR_MTIME_SET |
116 ATTR_CTIME | ATTR_CTIME_SET;
/* Read-only opens never dirtied data; elided lines presumably skip the
 * size/epoch handling below — TODO confirm against full source. */
118 if (!(och->och_flags & FMODE_WRITE))
121 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
122 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
124 ll_ioepoch_close(inode, op_data, &och, 0);
127 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
128 ll_prep_md_op_data(op_data, inode, NULL, NULL,
129 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send the MDS close RPC for open handle @och of @inode and tear down
 * the client-side open state.  A non-NULL @data_version turns the close
 * into an HSM release (MDS_HSM_RELEASE).  On MDS request, performs a
 * Size-on-MDS update after the close.  @och is effectively consumed.
 * NOTE(review): several error/branch lines are elided in this view;
 * return-code plumbing below is partial.
 */
133 static int ll_close_inode_openhandle(struct obd_export *md_exp,
135 struct obd_client_handle *och,
136 const __u64 *data_version)
138 struct obd_export *exp = ll_i2mdexp(inode);
139 struct md_op_data *op_data;
140 struct ptlrpc_request *req = NULL;
141 struct obd_device *obd = class_exp2obd(exp);
148 * XXX: in case of LMV, is this correct to access
151 CERROR("Invalid MDC connection handle "LPX64"\n",
152 ll_i2mdexp(inode)->exp_handle.h_cookie);
156 OBD_ALLOC_PTR(op_data);
158 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
160 ll_prepare_close(inode, op_data, och);
161 if (data_version != NULL) {
162 /* Pass in data_version implies release. */
163 op_data->op_bias |= MDS_HSM_RELEASE;
164 op_data->op_data_version = *data_version;
165 op_data->op_lease_handle = och->och_lease_handle;
166 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* Remember whether ll_prepare_close() closed the IO epoch before md_close
 * consumes op_data. */
168 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
169 rc = md_close(md_exp, op_data, och->och_mod, &req);
171 /* This close must have the epoch closed. */
172 LASSERT(epoch_close);
173 /* MDS has instructed us to obtain Size-on-MDS attribute from
174 * OSTs and send setattr to back to MDS. */
175 rc = ll_som_update(inode, op_data);
177 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
178 " failed: rc = %d\n",
179 ll_i2mdexp(inode)->exp_obd->obd_name,
180 PFID(ll_inode2fid(inode)), rc);
184 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
185 ll_i2mdexp(inode)->exp_obd->obd_name,
186 PFID(ll_inode2fid(inode)), rc);
189 /* DATA_MODIFIED flag was successfully sent on close, cancel data
190 * modification flag. */
191 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
192 struct ll_inode_info *lli = ll_i2info(inode);
194 spin_lock(&lli->lli_lock);
195 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
196 spin_unlock(&lli->lli_lock);
/* Destroy OST objects listed in the close reply (unlinked-open case). */
200 rc = ll_objects_destroy(req, inode);
202 CERROR("%s: inode "DFID
203 " ll_objects destroy: rc = %d\n",
204 ll_i2mdexp(inode)->exp_obd->obd_name,
205 PFID(ll_inode2fid(inode)), rc);
/* For HSM release, verify the MDS actually released the file. */
208 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
209 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
211 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
215 ll_finish_md_op_data(op_data);
/* SOM write close with the epoch still open: defer DONE_WRITING. */
219 if (exp_connect_som(exp) && !epoch_close &&
220 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
221 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
223 md_clear_open_replay_data(md_exp, och);
224 /* Free @och if it is not waiting for DONE_WRITING. */
225 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
228 if (req) /* This is close request */
229 ptlrpc_req_finished(req);
/*
 * Drop one MDS open handle of the given mode (write/exec/read) for
 * @inode.  If other users still hold the handle, just return; otherwise
 * take the handle under lli_och_mutex and close it on the MDS via
 * ll_close_inode_openhandle().
 */
233 int ll_md_real_close(struct inode *inode, fmode_t fmode)
235 struct ll_inode_info *lli = ll_i2info(inode);
236 struct obd_client_handle **och_p;
237 struct obd_client_handle *och;
/* Select the per-mode handle slot and its use count. */
242 if (fmode & FMODE_WRITE) {
243 och_p = &lli->lli_mds_write_och;
244 och_usecount = &lli->lli_open_fd_write_count;
245 } else if (fmode & FMODE_EXEC) {
246 och_p = &lli->lli_mds_exec_och;
247 och_usecount = &lli->lli_open_fd_exec_count;
249 LASSERT(fmode & FMODE_READ);
250 och_p = &lli->lli_mds_read_och;
251 och_usecount = &lli->lli_open_fd_read_count;
254 mutex_lock(&lli->lli_och_mutex);
255 if (*och_usecount > 0) {
256 /* There are still users of this handle, so skip
258 mutex_unlock(&lli->lli_och_mutex);
264 mutex_unlock(&lli->lli_och_mutex);
267 /* There might be a race and this handle may already
269 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: release group lock and lease if held,
 * decrement the per-mode open count, and call ll_md_real_close() unless
 * a matching OPEN DLM lock lets us skip talking to the MDS.  Finally
 * frees the ll_file_data and closes the capa.
 * NOTE(review): lockmode initialisation and several branches are on
 * elided lines.
 */
276 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
279 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
280 struct ll_inode_info *lli = ll_i2info(inode);
284 /* clear group lock, if present */
285 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
286 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
288 if (fd->fd_lease_och != NULL) {
291 /* Usually the lease is not released when the
292 * application crashed, we need to release here. */
293 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
294 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
295 PFID(&lli->lli_fid), rc, lease_broken);
297 fd->fd_lease_och = NULL;
/* Close a lease-converted open handle directly. */
300 if (fd->fd_och != NULL) {
301 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
306 /* Let's see if we have good enough OPEN lock on the file and if
307 we can skip talking to MDS */
308 if (file->f_dentry->d_inode) { /* Can this ever be false? */
310 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
311 struct lustre_handle lockh;
312 struct inode *inode = file->f_dentry->d_inode;
313 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
315 mutex_lock(&lli->lli_och_mutex);
316 if (fd->fd_omode & FMODE_WRITE) {
318 LASSERT(lli->lli_open_fd_write_count);
319 lli->lli_open_fd_write_count--;
320 } else if (fd->fd_omode & FMODE_EXEC) {
322 LASSERT(lli->lli_open_fd_exec_count);
323 lli->lli_open_fd_exec_count--;
326 LASSERT(lli->lli_open_fd_read_count);
327 lli->lli_open_fd_read_count--;
329 mutex_unlock(&lli->lli_och_mutex);
/* No matching OPEN lock cached: must do the real MDS close. */
331 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
332 LDLM_IBITS, &policy, lockmode,
334 rc = ll_md_real_close(file->f_dentry->d_inode,
338 CERROR("Releasing a file %p with negative dentry %p. Name %s",
339 file, file->f_dentry, file->f_dentry->d_name.name);
343 LUSTRE_FPRIVATE(file) = NULL;
344 ll_file_data_put(fd);
345 ll_capa_close(inode);
350 /* While this returns an error code, fput() the caller does not, so we need
351 * to make every effort to clean up all of our state here. Also, applications
352 * rarely check close errors and even if an error is returned they will not
353 * re-try the close call.
355 int ll_file_release(struct inode *inode, struct file *file)
357 struct ll_file_data *fd;
358 struct ll_sb_info *sbi = ll_i2sbi(inode);
359 struct ll_inode_info *lli = ll_i2info(inode);
363 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
364 PFID(ll_inode2fid(inode)), inode);
/* Remote-client ACL bookkeeping is only relevant on the root inode. */
366 #ifdef CONFIG_FS_POSIX_ACL
367 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
368 inode == inode->i_sb->s_root->d_inode) {
369 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
372 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
373 fd->fd_flags &= ~LL_FILE_RMTACL;
374 rct_del(&sbi->ll_rct, current_pid());
375 et_search_free(&sbi->ll_et, current_pid());
/* Don't count releases of the root dentry in the stats. */
380 if (inode->i_sb->s_root != file->f_dentry)
381 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
382 fd = LUSTRE_FPRIVATE(file);
385 /* The last ref on @file, maybe not the the owner pid of statahead.
386 * Different processes can open the same dir, "ll_opendir_key" means:
387 * it is me that should stop the statahead thread. */
388 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
389 lli->lli_opendir_pid != 0)
390 ll_stop_statahead(inode, lli->lli_opendir_key);
/* Root has no MDS open handle; just drop the fd state. */
392 if (inode->i_sb->s_root == file->f_dentry) {
393 LUSTRE_FPRIVATE(file) = NULL;
394 ll_file_data_put(fd);
398 if (!S_ISDIR(inode->i_mode)) {
399 if (lli->lli_clob != NULL)
400 lov_read_and_clear_async_rc(lli->lli_clob);
401 lli->lli_async_rc = 0;
404 rc = ll_md_close(sbi->ll_md_exp, inode, file);
406 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
407 libcfs_debug_dumplog();
/*
 * Issue the MDS intent-open RPC for @file (with optional striping data
 * @lmm/@lmmsize packed into the request), set up the returned inode and
 * lock state, and drop the intent lock on exit.  The file name is only
 * packed when the server lacks OBD_CONNECT_OPEN_BY_FID.
 */
412 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
413 struct lookup_intent *itp)
415 struct dentry *de = file->f_dentry;
416 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
417 struct dentry *parent = de->d_parent;
418 const char *name = NULL;
420 struct md_op_data *op_data;
421 struct ptlrpc_request *req = NULL;
425 LASSERT(parent != NULL);
426 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
428 /* if server supports open-by-fid, or file name is invalid, don't pack
429 * name in open request */
430 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
431 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
432 name = de->d_name.name;
433 len = de->d_name.len;
436 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
437 name, len, 0, LUSTRE_OPC_ANY, NULL);
439 RETURN(PTR_ERR(op_data));
440 op_data->op_data = lmm;
441 op_data->op_data_size = lmmsize;
443 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
444 &ll_md_blocking_ast, 0);
445 ll_finish_md_op_data(op_data);
447 /* reason for keep own exit path - don`t flood log
448 * with messages with -ESTALE errors.
/* The server granted an open we can't use: release the handle. */
450 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
451 it_open_error(DISP_OPEN_OPEN, itp))
453 ll_release_openhandle(de, itp);
457 if (it_disposition(itp, DISP_LOOKUP_NEG))
458 GOTO(out, rc = -ENOENT);
460 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
461 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
462 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
466 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
467 if (!rc && itp->d.lustre.it_lock_mode)
468 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
471 ptlrpc_req_finished(req);
472 ll_intent_drop_lock(itp);
478 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
479 * not believe attributes if a few ioepoch holders exist. Attributes for
480 * previous ioepoch if new one is opened are also skipped by MDS.
482 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that differs from the current one. */
484 if (ioepoch && lli->lli_ioepoch != ioepoch) {
485 lli->lli_ioepoch = ioepoch;
486 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
487 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate @och from the MDS reply carried by the intent @it (file
 * handle, fid, lease lock cookie, flags) and register it for open
 * replay.  Returns the md_set_open_replay_data() result.
 */
491 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
492 struct obd_client_handle *och)
494 struct ptlrpc_request *req = it->d.lustre.it_data;
495 struct mdt_body *body;
497 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
498 och->och_fh = body->mbo_handle;
499 och->och_fid = body->mbo_fid1;
500 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
501 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
502 och->och_flags = it->it_flags;
504 return md_set_open_replay_data(md_exp, och, it);
/*
 * Complete the client-local part of an open: optionally fill @och from
 * the intent reply (opening a fresh ioepoch), then attach @fd to the
 * file, initialising readahead, open mode and the cl_context state.
 */
507 static int ll_local_open(struct file *file, struct lookup_intent *it,
508 struct ll_file_data *fd, struct obd_client_handle *och)
510 struct inode *inode = file->f_dentry->d_inode;
511 struct ll_inode_info *lli = ll_i2info(inode);
514 LASSERT(!LUSTRE_FPRIVATE(file));
/* NOTE(review): the condition guarding this branch (presumably
 * `if (och)`) is on an elided line — confirm against full source. */
519 struct ptlrpc_request *req = it->d.lustre.it_data;
520 struct mdt_body *body;
523 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
527 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
528 ll_ioepoch_open(lli, body->mbo_ioepoch);
531 LUSTRE_FPRIVATE(file) = fd;
532 ll_readahead_init(inode, &fd->fd_ras);
533 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
535 /* ll_cl_context initialize */
536 rwlock_init(&fd->fd_lock);
537 INIT_LIST_HEAD(&fd->fd_lccs);
542 /* Open a file, and (for the very first open) create objects on the OSTs at
543 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
544 * creation or open until ll_lov_setstripe() ioctl is called.
546 * If we already have the stripe MD locally then we don't request it in
547 * md_open(), by passing a lmm_size = 0.
549 * It is up to the application to ensure no other processes open this file
550 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
551 * used. We might be able to avoid races of that sort by getting lli_open_sem
552 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
553 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
555 int ll_file_open(struct inode *inode, struct file *file)
557 struct ll_inode_info *lli = ll_i2info(inode);
558 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
559 .it_flags = file->f_flags };
560 struct obd_client_handle **och_p = NULL;
561 __u64 *och_usecount = NULL;
562 struct ll_file_data *fd;
563 int rc = 0, opendir_set = 0;
566 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
567 PFID(ll_inode2fid(inode)), inode, file->f_flags);
569 it = file->private_data; /* XXX: compat macro */
570 file->private_data = NULL; /* prevent ll_local_open assertion */
572 fd = ll_file_data_get();
574 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory becomes owner of the statahead thread. */
577 if (S_ISDIR(inode->i_mode)) {
578 spin_lock(&lli->lli_sa_lock);
579 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
580 lli->lli_opendir_pid == 0) {
581 lli->lli_opendir_key = fd;
582 lli->lli_opendir_pid = current_pid();
585 spin_unlock(&lli->lli_sa_lock);
/* Root needs no MDS open; just attach the fd. */
588 if (inode->i_sb->s_root == file->f_dentry) {
589 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN intent. */
593 if (!it || !it->d.lustre.it_disposition) {
594 /* Convert f_flags into access mode. We cannot use file->f_mode,
595 * because everything but O_ACCMODE mask was stripped from
597 if ((oit.it_flags + 1) & O_ACCMODE)
599 if (file->f_flags & O_TRUNC)
600 oit.it_flags |= FMODE_WRITE;
602 /* kernel only call f_op->open in dentry_open. filp_open calls
603 * dentry_open after call to open_namei that checks permissions.
604 * Only nfsd_open call dentry_open directly without checking
605 * permissions and because of that this code below is safe. */
606 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
607 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
609 /* We do not want O_EXCL here, presumably we opened the file
610 * already? XXX - NFS implications? */
611 oit.it_flags &= ~O_EXCL;
613 /* bug20584, if "it_flags" contains O_CREAT, the file will be
614 * created if necessary, then "IT_CREAT" should be set to keep
615 * consistent with it */
616 if (oit.it_flags & O_CREAT)
617 oit.it_op |= IT_CREAT;
623 /* Let's see if we have file open on MDS already. */
624 if (it->it_flags & FMODE_WRITE) {
625 och_p = &lli->lli_mds_write_och;
626 och_usecount = &lli->lli_open_fd_write_count;
627 } else if (it->it_flags & FMODE_EXEC) {
628 och_p = &lli->lli_mds_exec_och;
629 och_usecount = &lli->lli_open_fd_exec_count;
631 och_p = &lli->lli_mds_read_och;
632 och_usecount = &lli->lli_open_fd_read_count;
635 mutex_lock(&lli->lli_och_mutex);
636 if (*och_p) { /* Open handle is present */
637 if (it_disposition(it, DISP_OPEN_OPEN)) {
638 /* Well, there's extra open request that we do not need,
639 let's close it somehow. This will decref request. */
640 rc = it_open_error(DISP_OPEN_OPEN, it);
642 mutex_unlock(&lli->lli_och_mutex);
643 GOTO(out_openerr, rc);
646 ll_release_openhandle(file->f_dentry, it);
/* Reuse the cached open handle for this fd. */
650 rc = ll_local_open(file, it, fd, NULL);
653 mutex_unlock(&lli->lli_och_mutex);
654 GOTO(out_openerr, rc);
657 LASSERT(*och_usecount == 0);
658 if (!it->d.lustre.it_disposition) {
659 /* We cannot just request lock handle now, new ELC code
660 means that one of other OPEN locks for this file
661 could be cancelled, and since blocking ast handler
662 would attempt to grab och_mutex as well, that would
663 result in a deadlock */
664 mutex_unlock(&lli->lli_och_mutex);
666 * Normally called under two situations:
668 * 2. revalidate with IT_OPEN (revalidate doesn't
669 * execute this intent any more).
671 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
673 * Always specify MDS_OPEN_BY_FID because we don't want
674 * to get file with different fid.
676 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
677 rc = ll_intent_file_open(file, NULL, 0, it);
679 GOTO(out_openerr, rc);
683 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
685 GOTO(out_och_free, rc = -ENOMEM);
689 /* md_intent_lock() didn't get a request ref if there was an
690 * open error, so don't do cleanup on the request here
692 /* XXX (green): Should not we bail out on any error here, not
693 * just open error? */
694 rc = it_open_error(DISP_OPEN_OPEN, it);
696 GOTO(out_och_free, rc);
698 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
699 "inode %p: disposition %x, status %d\n", inode,
700 it_disposition(it, ~0), it->d.lustre.it_status);
702 rc = ll_local_open(file, it, fd, *och_p);
704 GOTO(out_och_free, rc);
706 mutex_unlock(&lli->lli_och_mutex);
709 /* Must do this outside lli_och_mutex lock to prevent deadlock where
710 different kind of OPEN lock for this same inode gets cancelled
711 by ldlm_cancel_lru */
712 if (!S_ISREG(inode->i_mode))
713 GOTO(out_och_free, rc);
/* O_LOV_DELAY_CREATE (or read-only open of an object-less file):
 * defer OST object creation until ll_lov_setstripe(). */
717 if (!lli->lli_has_smd &&
718 (cl_is_lov_delay_create(file->f_flags) ||
719 (file->f_mode & FMODE_WRITE) == 0)) {
720 CDEBUG(D_INODE, "object creation was delayed\n");
721 GOTO(out_och_free, rc);
723 cl_lov_delay_create_clear(&file->f_flags);
724 GOTO(out_och_free, rc);
/* Error/cleanup paths: free the handle, stop statahead, drop fd. */
728 if (och_p && *och_p) {
729 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
730 *och_p = NULL; /* OBD_FREE writes some magic there */
733 mutex_unlock(&lli->lli_och_mutex);
736 if (opendir_set != 0)
737 ll_stop_statahead(inode, lli->lli_opendir_key);
739 ll_file_data_put(fd);
741 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
744 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
745 ptlrpc_req_finished(it->d.lustre.it_data);
746 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING, asynchronously
 * cancel the lease lock (the lease is thereby broken).
 * NOTE(review): the LDLM_CB_CANCELING arm and the switch epilogue are
 * on elided lines.
 */
752 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
753 struct ldlm_lock_desc *desc, void *data, int flag)
756 struct lustre_handle lockh;
760 case LDLM_CB_BLOCKING:
761 ldlm_lock2handle(lock, &lockh);
762 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
764 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
768 case LDLM_CB_CANCELING:
776 * Acquire a lease and open the file.
/* Returns the new obd_client_handle on success or an ERR_PTR.  When
 * @file is given, the caller must be the sole opener in a compatible
 * mode; its existing open handle is passed to the MDT (op_handle) to
 * prove same-owner.  The lease lock is kept out of the LRU and EXCL so
 * it is never matched by a normal open. */
778 static struct obd_client_handle *
779 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
782 struct lookup_intent it = { .it_op = IT_OPEN };
783 struct ll_sb_info *sbi = ll_i2sbi(inode);
784 struct md_op_data *op_data;
785 struct ptlrpc_request *req = NULL;
786 struct lustre_handle old_handle = { 0 };
787 struct obd_client_handle *och = NULL;
/* Leases are only defined for plain read or plain write opens. */
792 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
793 RETURN(ERR_PTR(-EINVAL));
796 struct ll_inode_info *lli = ll_i2info(inode);
797 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
798 struct obd_client_handle **och_p;
801 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
802 RETURN(ERR_PTR(-EPERM));
804 /* Get the openhandle of the file */
806 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per fd. */
807 if (fd->fd_lease_och != NULL) {
808 mutex_unlock(&lli->lli_och_mutex);
812 if (fd->fd_och == NULL) {
813 if (file->f_mode & FMODE_WRITE) {
814 LASSERT(lli->lli_mds_write_och != NULL);
815 och_p = &lli->lli_mds_write_och;
816 och_usecount = &lli->lli_open_fd_write_count;
818 LASSERT(lli->lli_mds_read_och != NULL);
819 och_p = &lli->lli_mds_read_och;
820 och_usecount = &lli->lli_open_fd_read_count;
822 if (*och_usecount == 1) {
829 mutex_unlock(&lli->lli_och_mutex);
830 if (rc < 0) /* more than 1 opener */
833 LASSERT(fd->fd_och != NULL);
834 old_handle = fd->fd_och->och_fh;
/* NOTE(review): och allocation appears on an elided line before this
 * ENOMEM return — confirm against full source. */
839 RETURN(ERR_PTR(-ENOMEM));
841 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
842 LUSTRE_OPC_ANY, NULL);
844 GOTO(out, rc = PTR_ERR(op_data));
846 /* To tell the MDT this openhandle is from the same owner */
847 op_data->op_handle = old_handle;
849 it.it_flags = fmode | open_flags;
850 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
851 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
852 &ll_md_blocking_lease_ast,
853 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
854 * it can be cancelled which may mislead applications that the lease is
856 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
857 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
858 * doesn't deal with openhandle, so normal openhandle will be leaked. */
859 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
860 ll_finish_md_op_data(op_data);
861 ptlrpc_req_finished(req);
863 GOTO(out_release_it, rc);
865 if (it_disposition(&it, DISP_LOOKUP_NEG))
866 GOTO(out_release_it, rc = -ENOENT);
868 rc = it_open_error(DISP_OPEN_OPEN, &it);
870 GOTO(out_release_it, rc);
872 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
873 ll_och_fill(sbi->ll_md_exp, &it, och);
875 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
876 GOTO(out_close, rc = -EOPNOTSUPP);
878 /* already get lease, handle lease lock */
879 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
880 if (it.d.lustre.it_lock_mode == 0 ||
881 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
882 /* open lock must return for lease */
883 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
884 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
885 it.d.lustre.it_lock_bits);
886 GOTO(out_close, rc = -EPROTO);
889 ll_intent_release(&it);
893 /* Cancel open lock */
894 if (it.d.lustre.it_lock_mode != 0) {
895 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
896 it.d.lustre.it_lock_mode);
897 it.d.lustre.it_lock_mode = 0;
898 och->och_lease_handle.cookie = 0ULL;
900 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
902 CERROR("%s: error closing file "DFID": %d\n",
903 ll_get_fsname(inode->i_sb, NULL, 0),
904 PFID(&ll_i2info(inode)->lli_fid), rc2);
905 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
907 ll_intent_release(&it);
915 * Release lease and close the file.
916 * It will check if the lease has ever broken.
918 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
921 struct ldlm_lock *lock;
922 bool cancelled = true;
/* Inspect the lease lock to learn whether it was already cancelled
 * (i.e. the lease was broken) before we release it. */
926 lock = ldlm_handle2lock(&och->och_lease_handle);
928 lock_res_and_lock(lock);
929 cancelled = ldlm_is_cancel(lock);
930 unlock_res_and_lock(lock);
934 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
935 PFID(&ll_i2info(inode)->lli_fid), cancelled);
938 ldlm_cli_cancel(&och->och_lease_handle, 0);
939 if (lease_broken != NULL)
940 *lease_broken = cancelled;
942 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
947 /* Fills the obdo with the attributes for the lsm */
/* Issues an async OST getattr over a ptlrpc set and waits for it.
 * @dv_flags (LL_DV_RD_FLUSH/LL_DV_WR_FLUSH) request server-side lock /
 * flush semantics for data-version reads. */
948 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
949 struct obd_capa *capa, struct obdo *obdo,
950 __u64 ioepoch, int dv_flags)
952 struct ptlrpc_request_set *set;
953 struct obd_info oinfo = { { { 0 } } };
958 LASSERT(lsm != NULL);
962 oinfo.oi_oa->o_oi = lsm->lsm_oi;
963 oinfo.oi_oa->o_mode = S_IFREG;
964 oinfo.oi_oa->o_ioepoch = ioepoch;
965 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
966 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
967 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
968 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
969 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
970 OBD_MD_FLDATAVERSION;
971 oinfo.oi_capa = capa;
972 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
973 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
974 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
975 if (dv_flags & LL_DV_WR_FLUSH)
976 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
979 set = ptlrpc_prep_set();
981 CERROR("can't allocate ptlrpc set\n");
984 rc = obd_getattr_async(exp, &oinfo, set);
986 rc = ptlrpc_set_wait(set);
987 ptlrpc_set_destroy(set);
/* Keep only the attribute bits the caller may rely on. */
990 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
991 OBD_MD_FLATIME | OBD_MD_FLMTIME |
992 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
993 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* A requested write flush that the server did not confirm is an error
 * (return value set on an elided line — confirm against full source). */
994 if (dv_flags & LL_DV_WR_FLUSH &&
995 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
996 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
1003 * Performs the getattr on the inode and updates its fields.
1004 * If @sync != 0, perform the getattr under the server-side lock.
1006 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
1007 __u64 ioepoch, int sync)
1009 struct obd_capa *capa = ll_mdscapa_get(inode);
1010 struct lov_stripe_md *lsm;
1014 lsm = ccc_inode_lsm_get(inode);
1015 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1016 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1019 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* Merge the OST-reported attributes into the VFS inode. */
1021 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1022 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1023 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1024 (unsigned long long)inode->i_blocks,
1025 (unsigned long)ll_inode_blksize(inode));
1027 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (lli_lvb) with cl_object attributes
 * obtained from the OSTs, taking the newest of each timestamp, and
 * update i_size/i_blocks under the inode size lock.
 */
1031 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1033 struct ll_inode_info *lli = ll_i2info(inode);
1034 struct cl_object *obj = lli->lli_clob;
1035 struct cl_attr *attr = ccc_env_thread_attr(env);
1041 ll_inode_size_lock(inode);
1042 /* merge timestamps the most recently obtained from mds with
1043 timestamps obtained from osts */
1044 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1045 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1046 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1047 inode_init_lvb(inode, &lvb);
1049 cl_object_attr_lock(obj);
1050 rc = cl_object_attr_get(env, obj, attr);
1051 cl_object_attr_unlock(obj);
/* Take the most recent timestamp from either source. */
1054 if (lvb.lvb_atime < attr->cat_atime)
1055 lvb.lvb_atime = attr->cat_atime;
1056 if (lvb.lvb_ctime < attr->cat_ctime)
1057 lvb.lvb_ctime = attr->cat_ctime;
1058 if (lvb.lvb_mtime < attr->cat_mtime)
1059 lvb.lvb_mtime = attr->cat_mtime;
1061 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1062 PFID(&lli->lli_fid), attr->cat_size);
1063 cl_isize_write_nolock(inode, attr->cat_size);
1065 inode->i_blocks = attr->cat_blocks;
1067 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1068 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1069 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1071 ll_inode_size_unlock(inode);
/*
 * Fetch OST attributes for @lsm and copy size/blocks/times into the
 * caller-supplied stat structure (glimpse via ioctl path).
 */
1076 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1079 struct obdo obdo = { 0 };
1082 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1084 st->st_size = obdo.o_size;
1085 st->st_blocks = obdo.o_blocks;
1086 st->st_mtime = obdo.o_mtime;
1087 st->st_atime = obdo.o_atime;
1088 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether access-time updates should be skipped for @file,
 * mirroring the kernel's file_accessed()/touch_atime() checks:
 * O_NOATIME on the fd, S_NOATIME/IS_NOATIME on the inode, noatime or
 * read-only mount, and nodiratime (mount or sb) for directories.
 */
1093 static bool file_is_noatime(const struct file *file)
1095 const struct vfsmount *mnt = file->f_path.mnt;
1096 const struct inode *inode = file->f_path.dentry->d_inode;
1098 /* Adapted from file_accessed() and touch_atime().*/
1099 if (file->f_flags & O_NOATIME)
1102 if (inode->i_flags & S_NOATIME)
1105 if (IS_NOATIME(inode))
1108 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1111 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1114 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialise a cl_io for a read (@write == 0) or write (@write != 0)
 * on @file: nonblock/append/sync flags from f_flags, lock policy
 * (never for nolock files, mandatory for O_APPEND, maybe otherwise),
 * and the noatime decision.
 */
1120 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1122 struct inode *inode = file->f_dentry->d_inode;
1124 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1126 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1127 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1128 file->f_flags & O_DIRECT ||
1131 io->ci_obj = ll_i2info(inode)->lli_clob;
1132 io->ci_lockreq = CILR_MAYBE;
1133 if (ll_file_nolock(file)) {
1134 io->ci_lockreq = CILR_NEVER;
1135 io->ci_no_srvlock = 1;
1136 } else if (file->f_flags & O_APPEND) {
1137 io->ci_lockreq = CILR_MANDATORY;
1140 io->ci_noatime = file_is_noatime(file);
/*
 * Common driver for read/write/splice IO: builds a cl_io, runs the
 * cl_io_loop, restarts on short progress, updates *ppos and the
 * read/write byte statistics, and tracks write failures in the fd.
 * Writes take lli_write_mutex (unless group-locked) and all normal IO
 * holds lli_trunc_sem for the duration of the loop.
 */
1144 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1145 struct file *file, enum cl_io_type iot,
1146 loff_t *ppos, size_t count)
1148 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1149 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1154 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1155 file->f_dentry->d_name.name, iot, *ppos, count);
1158 io = ccc_env_thread_io(env);
1159 ll_io_init(io, file, iot == CIT_WRITE);
1161 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1162 struct vvp_io *vio = vvp_env_io(env);
1163 struct ccc_io *cio = ccc_env_io(env);
1164 int write_mutex_locked = 0;
1166 cio->cui_fd = LUSTRE_FPRIVATE(file);
1167 vio->cui_io_subtype = args->via_io_subtype;
1169 switch (vio->cui_io_subtype) {
1171 cio->cui_iov = args->u.normal.via_iov;
1172 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1173 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1174 cio->cui_iocb = args->u.normal.via_iocb;
1175 if ((iot == CIT_WRITE) &&
1176 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1177 if (mutex_lock_interruptible(&lli->
1179 GOTO(out, result = -ERESTARTSYS);
1180 write_mutex_locked = 1;
1182 down_read(&lli->lli_trunc_sem);
1185 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1186 vio->u.splice.cui_flags = args->u.splice.via_flags;
1189 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1193 ll_cl_add(file, env, io);
1194 result = cl_io_loop(env, io);
1195 ll_cl_remove(file, env);
1197 if (args->via_io_subtype == IO_NORMAL)
1198 up_read(&lli->lli_trunc_sem);
1199 if (write_mutex_locked)
1200 mutex_unlock(&lli->lli_write_mutex);
1202 /* cl_io_rw_init() handled IO */
1203 result = io->ci_result;
1206 if (io->ci_nob > 0) {
1207 result = io->ci_nob;
1208 *ppos = io->u.ci_wr.wr.crw_pos;
1212 cl_io_fini(env, io);
1213 /* If any bit been read/written (result != 0), we just return
1214 * short read/write instead of restart io. */
1215 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1216 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1217 iot == CIT_READ ? "read" : "write",
1218 file->f_dentry->d_name.name, *ppos, count);
1219 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
/* Tally successful bytes; remember write failures for fsync/close. */
1223 if (iot == CIT_READ) {
1225 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1226 LPROC_LL_READ_BYTES, result);
1227 } else if (iot == CIT_WRITE) {
1229 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1230 LPROC_LL_WRITE_BYTES, result);
1231 fd->fd_write_failed = false;
1232 } else if (result != -ERESTARTSYS) {
1233 fd->fd_write_failed = true;
1236 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
/*
 * Validate an iovec array and compute the total byte count.
 * Mirrors the checks the kernel performs before a vectored write.
 */
1243  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1245 static int ll_file_get_iov_count(const struct iovec *iov,
1246 unsigned long *nr_segs, size_t *count)
1251 for (seg = 0; seg < *nr_segs; seg++) {
1252 const struct iovec *iv = &iov[seg];
1255 * If any segment has a negative length, or the cumulative
1256 * length ever wraps negative then return -EINVAL.
1259 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* segment must be readable userspace memory; otherwise stop here */
1261 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1266 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio_read entry point: package the caller's iovec into vvp_io_args
 * and hand off to ll_file_io_generic() as a CIT_READ.
 */
1273 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1274 unsigned long nr_segs, loff_t pos)
1277 struct vvp_io_args *args;
1283 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1287 env = cl_env_get(&refcheck);
1289 RETURN(PTR_ERR(env));
1291 args = vvp_env_args(env, IO_NORMAL);
/* cast drops const: args struct stores a mutable iovec pointer */
1292 args->u.normal.via_iov = (struct iovec *)iov;
1293 args->u.normal.via_nrsegs = nr_segs;
1294 args->u.normal.via_iocb = iocb;
1296 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1297 &iocb->ki_pos, count);
1298 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) entry point: wrap the user buffer in a single
 * iovec plus a sync kiocb and delegate to ll_file_aio_read().
 */
1302 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1306 struct iovec *local_iov;
1307 struct kiocb *kiocb;
1312 env = cl_env_get(&refcheck);
1314 RETURN(PTR_ERR(env));
/* use per-thread scratch iovec/kiocb from the env to avoid allocation */
1316 local_iov = &vvp_env_info(env)->vti_local_iov;
1317 kiocb = &vvp_env_info(env)->vti_kiocb;
1318 local_iov->iov_base = (void __user *)buf;
1319 local_iov->iov_len = count;
1320 init_sync_kiocb(kiocb, file);
1321 kiocb->ki_pos = *ppos;
/* field name differs across kernel versions; selected at configure time */
1322 #ifdef HAVE_KIOCB_KI_LEFT
1323 kiocb->ki_left = count;
1325 kiocb->ki_nbytes = count;
1328 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1329 *ppos = kiocb->ki_pos;
1331 cl_env_put(env, &refcheck);
1336  * Write to a file (through the page cache).
/*
 * aio_write entry point: mirror of ll_file_aio_read() with CIT_WRITE.
 */
1339 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1340 unsigned long nr_segs, loff_t pos)
1343 struct vvp_io_args *args;
1349 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1353 env = cl_env_get(&refcheck);
1355 RETURN(PTR_ERR(env));
1357 args = vvp_env_args(env, IO_NORMAL);
/* cast drops const: args struct stores a mutable iovec pointer */
1358 args->u.normal.via_iov = (struct iovec *)iov;
1359 args->u.normal.via_nrsegs = nr_segs;
1360 args->u.normal.via_iocb = iocb;
1362 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1363 &iocb->ki_pos, count);
1364 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) entry point: wrap the user buffer in a single
 * iovec plus a sync kiocb and delegate to ll_file_aio_write().
 */
1368 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1372 struct iovec *local_iov;
1373 struct kiocb *kiocb;
1378 env = cl_env_get(&refcheck);
1380 RETURN(PTR_ERR(env));
/* per-thread scratch iovec/kiocb from the env, same as ll_file_read() */
1382 local_iov = &vvp_env_info(env)->vti_local_iov;
1383 kiocb = &vvp_env_info(env)->vti_kiocb;
1384 local_iov->iov_base = (void __user *)buf;
1385 local_iov->iov_len = count;
1386 init_sync_kiocb(kiocb, file);
1387 kiocb->ki_pos = *ppos;
/* field name differs across kernel versions; selected at configure time */
1388 #ifdef HAVE_KIOCB_KI_LEFT
1389 kiocb->ki_left = count;
1391 kiocb->ki_nbytes = count;
1394 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1395 *ppos = kiocb->ki_pos;
1397 cl_env_put(env, &refcheck);
1402  * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: run a CIT_READ through ll_file_io_generic()
 * with the destination pipe carried in IO_SPLICE args.
 */
1404 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1405 struct pipe_inode_info *pipe, size_t count,
1409 struct vvp_io_args *args;
1414 env = cl_env_get(&refcheck);
1416 RETURN(PTR_ERR(env));
1418 args = vvp_env_args(env, IO_SPLICE);
1419 args->u.splice.via_pipe = pipe;
1420 args->u.splice.via_flags = flags;
1422 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1423 cl_env_put(env, &refcheck);
/*
 * Ask the data export to recreate a missing OST object for this inode.
 * Copies the current stripe MD, marks the obdo with
 * OBD_FL_RECREATE_OBJS and calls obd_create() under the inode size lock.
 */
1427 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1430 struct obd_export *exp = ll_i2dtexp(inode);
1431 struct obd_trans_info oti = { 0 };
1432 struct obdo *oa = NULL;
1435 struct lov_stripe_md *lsm = NULL, *lsm2;
1442 lsm = ccc_inode_lsm_get(inode);
/* nothing to recreate if the file has no objects */
1443 if (!lsm_has_objects(lsm))
1444 GOTO(out, rc = -ENOENT);
1446 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1447 (lsm->lsm_stripe_count));
1449 OBD_ALLOC_LARGE(lsm2, lsm_size);
1451 GOTO(out, rc = -ENOMEM);
/* o_nlink is (ab)used here to carry the target OST index */
1454 oa->o_nlink = ost_idx;
1455 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1456 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1457 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1458 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1459 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1460 memcpy(lsm2, lsm, lsm_size);
1461 ll_inode_size_lock(inode);
1462 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1463 ll_inode_size_unlock(inode);
1465 OBD_FREE_LARGE(lsm2, lsm_size);
1468 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy the ll_recreate_obj request from
 * userspace and recreate the object by MDT0-sequence id.
 * Requires CAP_SYS_ADMIN.
 */
1473 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1475 struct ll_recreate_obj ucreat;
1479 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1482 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1486 ostid_set_seq_mdt0(&oi);
1487 ostid_set_id(&oi, ucreat.lrc_id);
1488 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: recreate an object identified by FID.
 * The OST index is packed in bits 16-31 of the FID sequence.
 * Requires CAP_SYS_ADMIN.
 */
1491 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1498 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1501 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1504 fid_to_ostid(&fid, &oi);
1505 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1506 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply a user-supplied striping EA (lov_user_md) to a file by
 * re-opening it by FID with an IT_OPEN intent that carries the layout.
 * Fails with -EEXIST if the file already has a stripe layout.
 */
1509 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1510 __u64 flags, struct lov_user_md *lum,
1513 struct lov_stripe_md *lsm = NULL;
1514 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1518 lsm = ccc_inode_lsm_get(inode);
/* a layout can only be set once; an existing one is an error */
1520 ccc_inode_lsm_put(inode, lsm);
1521 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1522 PFID(ll_inode2fid(inode)));
1523 GOTO(out, rc = -EEXIST);
1526 ll_inode_size_lock(inode);
1527 oit.it_flags |= MDS_OPEN_BY_FID;
1528 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1530 GOTO(out_unlock, rc);
1531 rc = oit.d.lustre.it_status;
1533 GOTO(out_req_free, rc);
/* the open handle from the intent is not needed past this point */
1535 ll_release_openhandle(file->f_dentry, &oit);
1538 ll_inode_size_unlock(inode);
1539 ll_intent_release(&oit);
1540 ccc_inode_lsm_put(inode, lsm);
1542 cl_lov_delay_create_clear(&file->f_flags);
1545 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping info) for @filename from the MDS.
 * On success *lmmp points into the reply buffer (caller keeps *request
 * alive while using it) and *lmm_size is the EA size. Data arrives in
 * little endian and is swabbed to host order for userspace.
 */
1549 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1550 struct lov_mds_md **lmmp, int *lmm_size,
1551 struct ptlrpc_request **request)
1553 struct ll_sb_info *sbi = ll_i2sbi(inode);
1554 struct mdt_body *body;
1555 struct lov_mds_md *lmm = NULL;
1556 struct ptlrpc_request *req = NULL;
1557 struct md_op_data *op_data;
1560 rc = ll_get_default_mdsize(sbi, &lmmsize);
1564 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1565 strlen(filename), lmmsize,
1566 LUSTRE_OPC_ANY, NULL);
1567 if (IS_ERR(op_data))
1568 RETURN(PTR_ERR(op_data));
1570 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1571 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1572 ll_finish_md_op_data(op_data);
1574 CDEBUG(D_INFO, "md_getattr_name failed "
1575 "on %s: rc %d\n", filename, rc);
1579 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1580 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1582 lmmsize = body->mbo_eadatasize;
/* no EA on the file/dir means there is no striping to return */
1584 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1586 GOTO(out, rc = -ENODATA);
1589 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1590 LASSERT(lmm != NULL);
/* only V1/V3 layouts are understood here */
1592 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1593 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1594 GOTO(out, rc = -EPROTO);
1598 * This is coming from the MDS, so is probably in
1599 * little endian. We convert it to host endian before
1600 * passing it to userspace.
/* swab only needed on big-endian hosts */
1602 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1605 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1606 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1609 /* if function called for directory - we should
1610 * avoid swab not existent lsm objects */
1611 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1612 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1613 if (S_ISREG(body->mbo_mode))
1614 lustre_swab_lov_user_md_objects(
1615 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1617 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1618 lustre_swab_lov_user_md_v3(
1619 (struct lov_user_md_v3 *)lmm);
1620 if (S_ISREG(body->mbo_mode))
1621 lustre_swab_lov_user_md_objects(
1622 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1629 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one ost_data slot)
 * from userspace and apply it via ll_lov_setstripe_ea_info() with
 * MDS_OPEN_HAS_OBJS. Requires CAP_SYS_ADMIN.
 */
1634 static int ll_lov_setea(struct inode *inode, struct file *file,
1637 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1638 struct lov_user_md *lump;
1639 int lum_size = sizeof(struct lov_user_md) +
1640 sizeof(struct lov_user_ost_data);
1644 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1647 OBD_ALLOC_LARGE(lump, lum_size);
1651 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1652 OBD_FREE_LARGE(lump, lum_size);
1656 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1658 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: read a V1 lov_user_md first, upgrade
 * the copy to V3 if the magic says so, then apply the layout. On
 * success the new stripe info is echoed back through the user pointer.
 */
1662 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1665 struct lov_user_md_v3 lumv3;
1666 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1667 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1668 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1670 __u64 flags = FMODE_WRITE;
1673 /* first try with v1 which is smaller than v3 */
1674 lum_size = sizeof(struct lov_user_md_v1);
1675 if (copy_from_user(lumv1, lumv1p, lum_size))
1678 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1679 lum_size = sizeof(struct lov_user_md_v3);
/* re-copy the full V3 structure over the V1 prefix */
1680 if (copy_from_user(&lumv3, lumv3p, lum_size))
1684 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1686 struct lov_stripe_md *lsm;
1689 put_user(0, &lumv1p->lmm_stripe_count);
1691 ll_layout_refresh(inode, &gen);
1692 lsm = ccc_inode_lsm_get(inode);
1693 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1694 0, lsm, (void *)arg);
1695 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: hand the inode's stripe MD to the data
 * export's iocontrol, which copies the layout back to userspace.
 */
1700 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1702 struct lov_stripe_md *lsm;
1706 lsm = ccc_inode_lsm_get(inode);
1708 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1710 ccc_inode_lsm_put(inode, lsm);
/*
 * Take a group lock (gid in @arg) on the file. Only one group lock per
 * file descriptor; fd_flags/fd_grouplock are protected by lli_lock and
 * re-checked after the (blocking) cl_get_grouplock() call to handle a
 * race with a concurrent locker.
 */
1715 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1717 struct ll_inode_info *lli = ll_i2info(inode);
1718 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1719 struct ccc_grouplock grouplock;
1723 if (ll_file_nolock(file))
1724 RETURN(-EOPNOTSUPP);
1726 spin_lock(&lli->lli_lock);
1727 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1728 CWARN("group lock already existed with gid %lu\n",
1729 fd->fd_grouplock.cg_gid);
1730 spin_unlock(&lli->lli_lock);
1733 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1734 spin_unlock(&lli->lli_lock);
/* may block; lli_lock must not be held across this call */
1736 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1737 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1741 spin_lock(&lli->lli_lock);
/* somebody else may have installed a group lock while we slept */
1742 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1743 spin_unlock(&lli->lli_lock);
1744 CERROR("another thread just won the race\n");
1745 cl_put_grouplock(&grouplock);
1749 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1750 fd->fd_grouplock = grouplock;
1751 spin_unlock(&lli->lli_lock);
1753 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock with gid @arg previously taken by
 * ll_get_grouplock(). Fails if no group lock is held or the gid does
 * not match. State is cleared under lli_lock before the (possibly
 * blocking) cl_put_grouplock().
 */
1757 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1759 struct ll_inode_info *lli = ll_i2info(inode);
1760 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1761 struct ccc_grouplock grouplock;
1764 spin_lock(&lli->lli_lock);
1765 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1766 spin_unlock(&lli->lli_lock);
1767 CWARN("no group lock held\n");
1770 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1772 if (fd->fd_grouplock.cg_gid != arg) {
1773 CWARN("group lock %lu doesn't match current id %lu\n",
1774 arg, fd->fd_grouplock.cg_gid);
1775 spin_unlock(&lli->lli_lock);
/* detach the lock from the fd before dropping the spinlock */
1779 grouplock = fd->fd_grouplock;
1780 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1781 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1782 spin_unlock(&lli->lli_lock);
1784 cl_put_grouplock(&grouplock);
1785 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1790  * Close inode open handle
1792  * \param dentry [in] dentry which contains the inode
1793  * \param it [in,out] intent which contains open info and result
1796  * \retval <0 failure
1798 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1800 struct inode *inode = dentry->d_inode;
1801 struct obd_client_handle *och;
1807 /* Root ? Do nothing. */
1808 if (dentry->d_inode->i_sb->s_root == dentry)
1811 /* No open handle to close? Move away */
1812 if (!it_disposition(it, DISP_OPEN_OPEN))
1815 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1817 OBD_ALLOC(och, sizeof(*och));
1819 GOTO(out, rc = -ENOMEM);
/* fill the handle from the intent, then close it on the MDS */
1821 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1823 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1826 /* this one is in place of ll_file_open */
1827 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1828 ptlrpc_req_finished(it->d.lustre.it_data);
1829 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1835  * Get size for inode for which FIEMAP mapping is requested.
1836  * Make the FIEMAP get_info call and returns the result.
1838 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1841 struct obd_export *exp = ll_i2dtexp(inode);
1842 struct lov_stripe_md *lsm = NULL;
1843 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1844 __u32 vallen = num_bytes;
1848 /* Checks for fiemap flags */
/* unsupported flags are stripped and reported back to the caller */
1849 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1850 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1854 /* Check for FIEMAP_FLAG_SYNC */
1855 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1856 rc = filemap_fdatawrite(inode->i_mapping);
1861 lsm = ccc_inode_lsm_get(inode);
1865 /* If the stripe_count > 1 and the application does not understand
1866 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1868 if (lsm->lsm_stripe_count > 1 &&
1869 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1870 GOTO(out, rc = -EOPNOTSUPP);
1872 fm_key.oa.o_oi = lsm->lsm_oi;
1873 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1875 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1876 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1877 /* If filesize is 0, then there would be no objects for mapping */
1878 if (fm_key.oa.o_size == 0) {
1879 fiemap->fm_mapped_extents = 0;
1883 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
/* KEY_FIEMAP get_info fills @fiemap with the mapped extents */
1885 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1888 CERROR("obd_get_info failed: rc = %d\n", rc);
1891 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path name via the MDC.
 * Reads a getinfo_fid2path header from userspace, allocates a reply
 * buffer of the requested path length and copies the result back.
 * Permitted for CAP_DAC_READ_SEARCH or when the mount allows user
 * fid2path.
 */
1895 int ll_fid2path(struct inode *inode, void *arg)
1897 struct obd_export *exp = ll_i2mdexp(inode);
1898 struct getinfo_fid2path *gfout, *gfin;
1902 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1903 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1906 /* Need to get the buflen */
1907 OBD_ALLOC_PTR(gfin);
1910 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* output buffer = fixed header + user-requested path length */
1915 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1916 OBD_ALLOC(gfout, outsize);
1917 if (gfout == NULL) {
1921 memcpy(gfout, gfin, sizeof(*gfout));
1924 /* Call mdc_iocontrol */
1925 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1929 if (copy_to_user(arg, gfout, outsize))
1933 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size the kernel fiemap buffer from the
 * user's fm_extent_count, copy the request (and, if present, the first
 * extent used as a continuation hint) in, run ll_do_fiemap() and copy
 * the header plus mapped extents back out.
 */
1937 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1939 struct ll_user_fiemap *fiemap_s;
1940 size_t num_bytes, ret_bytes;
1941 unsigned int extent_count;
1944 /* Get the extent count so we can calculate the size of
1945 * required fiemap buffer */
1946 if (get_user(extent_count,
1947 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): no visible bound on extent_count before this multiply;
 * overflow handling (if any) is in lines not shown here — verify. */
1949 num_bytes = sizeof(*fiemap_s) + (extent_count *
1950 sizeof(struct ll_fiemap_extent));
1952 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1953 if (fiemap_s == NULL)
1956 /* get the fiemap value */
1957 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1959 GOTO(error, rc = -EFAULT);
1961 /* If fm_extent_count is non-zero, read the first extent since
1962 * it is used to calculate end_offset and device from previous
1965 if (copy_from_user(&fiemap_s->fm_extents[0],
1966 (char __user *)arg + sizeof(*fiemap_s),
1967 sizeof(struct ll_fiemap_extent)))
1968 GOTO(error, rc = -EFAULT);
1971 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1975 ret_bytes = sizeof(struct ll_user_fiemap);
1977 if (extent_count != 0)
1978 ret_bytes += (fiemap_s->fm_mapped_extents *
1979 sizeof(struct ll_fiemap_extent));
1981 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1985 OBD_FREE_LARGE(fiemap_s, num_bytes);
1990  * Read the data_version for inode.
1992  * This value is computed using stripe object version on OST.
1993  * Version is computed using server side locking.
1995  * @param sync if do sync on the OST side;
1997  * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1998  * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2000 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2002 struct lov_stripe_md *lsm = NULL;
2003 struct ll_sb_info *sbi = ll_i2sbi(inode);
2004 struct obdo *obdo = NULL;
2008 /* If no stripe, we consider version is 0. */
2009 lsm = ccc_inode_lsm_get(inode);
2010 if (!lsm_has_objects(lsm)) {
2012 CDEBUG(D_INODE, "No object for inode\n");
2016 OBD_ALLOC_PTR(obdo);
2018 GOTO(out, rc = -ENOMEM);
/* getattr over all stripes; OSTs fold their versions into the obdo */
2020 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
2022 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2025 *data_version = obdo->o_data_version;
2031 ccc_inode_lsm_put(inode, lsm);
2036  * Trigger a HSM release request for the provided inode.
/*
 * Takes a write lease, flushes and records the data version, merges
 * the latest size/time attributes, then closes the handle with a
 * release request so the MDT can drop the file's OST objects.
 */
2038 int ll_hsm_release(struct inode *inode)
2040 struct cl_env_nest nest;
2042 struct obd_client_handle *och = NULL;
2043 __u64 data_version = 0;
2047 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2048 ll_get_fsname(inode->i_sb, NULL, 0),
2049 PFID(&ll_i2info(inode)->lli_fid));
2051 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2053 GOTO(out, rc = PTR_ERR(och));
2055 /* Grab latest data_version and [am]time values */
2056 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2060 env = cl_env_nested_get(&nest);
2062 GOTO(out, rc = PTR_ERR(env));
2064 ll_merge_lvb(env, inode);
2065 cl_env_nested_put(&nest, env);
2067 /* Release the file.
2068 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2069 * we still need it to pack l_remote_handle to MDT. */
2070 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
/* error path: make sure the lease handle is closed */
2076 if (och != NULL && !IS_ERR(och)) /* close the file */
2077 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): keeps both inodes, the saved
 * [am]time attrs to restore after the swap, and the data-version
 * check flags/values (which may themselves be swapped for ordering). */
2082 struct ll_swap_stack {
2083 struct iattr ia1, ia2;
2085 struct inode *inode1, *inode2;
2086 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * layouts of two files on the MDT. Orders the two inodes by FID to
 * avoid deadlock, optionally takes group locks to flush caches,
 * verifies data versions if requested, then issues the swap RPC and
 * restores mtime/atime per the SWAP_LAYOUTS_KEEP_* flags.
 */
2089 static int ll_swap_layouts(struct file *file1, struct file *file2,
2090 struct lustre_swap_layouts *lsl)
2092 struct mdc_swap_layouts msl;
2093 struct md_op_data *op_data;
2096 struct ll_swap_stack *llss = NULL;
2099 OBD_ALLOC_PTR(llss);
2103 llss->inode1 = file1->f_dentry->d_inode;
2104 llss->inode2 = file2->f_dentry->d_inode;
2106 if (!S_ISREG(llss->inode2->i_mode))
2107 GOTO(free, rc = -EINVAL);
2109 if (inode_permission(llss->inode1, MAY_WRITE) ||
2110 inode_permission(llss->inode2, MAY_WRITE))
2111 GOTO(free, rc = -EPERM);
2113 if (llss->inode2->i_sb != llss->inode1->i_sb)
2114 GOTO(free, rc = -EXDEV);
2116 /* we use 2 bool because it is easier to swap than 2 bits */
2117 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2118 llss->check_dv1 = true;
2120 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2121 llss->check_dv2 = true;
2123 /* we cannot use lsl->sl_dvX directly because we may swap them */
2124 llss->dv1 = lsl->sl_dv1;
2125 llss->dv2 = lsl->sl_dv2;
2127 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2128 if (rc == 0) /* same file, done! */
/* order by FID so concurrent swaps lock in a consistent order */
2131 if (rc < 0) { /* sequentialize it */
2132 swap(llss->inode1, llss->inode2);
2134 swap(llss->dv1, llss->dv2);
2135 swap(llss->check_dv1, llss->check_dv2);
2139 if (gid != 0) { /* application asks to flush dirty cache */
2140 rc = ll_get_grouplock(llss->inode1, file1, gid);
2144 rc = ll_get_grouplock(llss->inode2, file2, gid);
2146 ll_put_grouplock(llss->inode1, file1, gid);
2151 /* to be able to restore mtime and atime after swap
2152 * we need to first save them */
2154 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2155 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2156 llss->ia1.ia_atime = llss->inode1->i_atime;
2157 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2158 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2159 llss->ia2.ia_atime = llss->inode2->i_atime;
2160 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2163 /* ultimate check, before swaping the layouts we check if
2164 * dataversion has changed (if requested) */
2165 if (llss->check_dv1) {
2166 rc = ll_data_version(llss->inode1, &dv, 0);
2169 if (dv != llss->dv1)
2170 GOTO(putgl, rc = -EAGAIN);
2173 if (llss->check_dv2) {
2174 rc = ll_data_version(llss->inode2, &dv, 0);
2177 if (dv != llss->dv2)
2178 GOTO(putgl, rc = -EAGAIN);
2181 /* struct md_op_data is used to send the swap args to the mdt
2182 * only flags is missing, so we use struct mdc_swap_layouts
2183 * through the md_op_data->op_data */
2184 /* flags from user space have to be converted before they are send to
2185 * server, no flag is sent today, they are only used on the client */
2188 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2189 0, LUSTRE_OPC_ANY, &msl);
2190 if (IS_ERR(op_data))
2191 GOTO(free, rc = PTR_ERR(op_data));
2193 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2194 sizeof(*op_data), op_data, NULL);
2195 ll_finish_md_op_data(op_data);
/* drop group locks in reverse acquisition order */
2199 ll_put_grouplock(llss->inode2, file2, gid);
2200 ll_put_grouplock(llss->inode1, file1, gid);
2203 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2207 /* clear useless flags */
2208 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2209 llss->ia1.ia_valid &= ~ATTR_MTIME;
2210 llss->ia2.ia_valid &= ~ATTR_MTIME;
2213 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2214 llss->ia1.ia_valid &= ~ATTR_ATIME;
2215 llss->ia2.ia_valid &= ~ATTR_ATIME;
2218 /* update time if requested */
/* ia2 goes to inode1 (and vice versa) because the layouts swapped */
2220 if (llss->ia2.ia_valid != 0) {
2221 mutex_lock(&llss->inode1->i_mutex);
2222 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2223 mutex_unlock(&llss->inode1->i_mutex);
2226 if (llss->ia1.ia_valid != 0) {
2229 mutex_lock(&llss->inode2->i_mutex);
2230 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2231 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Send an HSM state-set request for @inode to the MDT. Flags outside
 * HSM_USER_MASK may only be set/cleared by CAP_SYS_ADMIN.
 */
2243 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2245 struct md_op_data *op_data;
2248 /* Non-root users are forbidden to set or clear flags which are
2249 * NOT defined in HSM_USER_MASK. */
2250 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2251 !cfs_capable(CFS_CAP_SYS_ADMIN))
2254 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2255 LUSTRE_OPC_ANY, hss);
2256 if (IS_ERR(op_data))
2257 RETURN(PTR_ERR(op_data));
2259 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2260 sizeof(*op_data), op_data, NULL);
2262 ll_finish_md_op_data(op_data);
/*
 * Import an HSM-archived file: mark it ARCHIVED|EXISTS|RELEASED on the
 * MDT, then restore the user-supplied mode/uid/gid/size/times with a
 * forced setattr so the client-side inode matches the archive copy.
 */
2267 static int ll_hsm_import(struct inode *inode, struct file *file,
2268 struct hsm_user_import *hui)
2270 struct hsm_state_set *hss = NULL;
2271 struct iattr *attr = NULL;
2275 if (!S_ISREG(inode->i_mode))
2281 GOTO(out, rc = -ENOMEM);
2283 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2284 hss->hss_archive_id = hui->hui_archive_id;
2285 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2286 rc = ll_hsm_state_set(inode, hss);
2290 OBD_ALLOC_PTR(attr);
2292 GOTO(out, rc = -ENOMEM);
/* only permission bits from userspace; file type forced to regular */
2294 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2295 attr->ia_mode |= S_IFREG;
2296 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2297 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2298 attr->ia_size = hui->hui_size;
2299 attr->ia_mtime.tv_sec = hui->hui_mtime;
2300 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2301 attr->ia_atime.tv_sec = hui->hui_atime;
2302 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2304 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2305 ATTR_UID | ATTR_GID |
2306 ATTR_MTIME | ATTR_MTIME_SET |
2307 ATTR_ATIME | ATTR_ATIME_SET;
2309 rc = ll_setattr_raw(file->f_dentry, attr, true);
/*
 * unlocked_ioctl handler for regular files: dispatches all Lustre
 * file ioctls (striping, group locks, HSM, leases, fiemap, fid2path,
 * data version, ...) and falls through to the dynamic ioctl hooks and
 * the data export for anything unrecognized.
 */
2324 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2326 struct inode *inode = file->f_dentry->d_inode;
2327 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2331 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2332 PFID(ll_inode2fid(inode)), inode, cmd);
2333 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2335 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2336 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2340 case LL_IOC_GETFLAGS:
2341 /* Get the current value of the file flags */
2342 return put_user(fd->fd_flags, (int *)arg);
2343 case LL_IOC_SETFLAGS:
2344 case LL_IOC_CLRFLAGS:
2345 /* Set or clear specific file flags */
2346 /* XXX This probably needs checks to ensure the flags are
2347 * not abused, and to handle any flag side effects.
2349 if (get_user(flags, (int *) arg))
/* IGNORE_LOCK is only safe with O_DIRECT (no cached pages) */
2352 if (cmd == LL_IOC_SETFLAGS) {
2353 if ((flags & LL_FILE_IGNORE_LOCK) &&
2354 !(file->f_flags & O_DIRECT)) {
2355 CERROR("%s: unable to disable locking on "
2356 "non-O_DIRECT file\n", current->comm);
2360 fd->fd_flags |= flags;
2362 fd->fd_flags &= ~flags;
2365 case LL_IOC_LOV_SETSTRIPE:
2366 RETURN(ll_lov_setstripe(inode, file, arg));
2367 case LL_IOC_LOV_SETEA:
2368 RETURN(ll_lov_setea(inode, file, arg));
2369 case LL_IOC_LOV_SWAP_LAYOUTS: {
2371 struct lustre_swap_layouts lsl;
2373 if (copy_from_user(&lsl, (char *)arg,
2374 sizeof(struct lustre_swap_layouts)))
/* both files must be writable for a layout swap */
2377 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2380 file2 = fget(lsl.sl_fd);
2385 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2386 rc = ll_swap_layouts(file, file2, &lsl);
2390 case LL_IOC_LOV_GETSTRIPE:
2391 RETURN(ll_lov_getstripe(inode, arg));
2392 case LL_IOC_RECREATE_OBJ:
2393 RETURN(ll_lov_recreate_obj(inode, arg));
2394 case LL_IOC_RECREATE_FID:
2395 RETURN(ll_lov_recreate_fid(inode, arg));
2396 case FSFILT_IOC_FIEMAP:
2397 RETURN(ll_ioctl_fiemap(inode, arg));
2398 case FSFILT_IOC_GETFLAGS:
2399 case FSFILT_IOC_SETFLAGS:
2400 RETURN(ll_iocontrol(inode, file, cmd, arg));
2401 case FSFILT_IOC_GETVERSION_OLD:
2402 case FSFILT_IOC_GETVERSION:
2403 RETURN(put_user(inode->i_generation, (int *)arg));
2404 case LL_IOC_GROUP_LOCK:
2405 RETURN(ll_get_grouplock(inode, file, arg));
2406 case LL_IOC_GROUP_UNLOCK:
2407 RETURN(ll_put_grouplock(inode, file, arg));
2408 case IOC_OBD_STATFS:
2409 RETURN(ll_obd_statfs(inode, (void *)arg));
2411 /* We need to special case any other ioctls we want to handle,
2412 * to send them to the MDS/OST as appropriate and to properly
2413 * network encode the arg field.
2414 case FSFILT_IOC_SETVERSION_OLD:
2415 case FSFILT_IOC_SETVERSION:
2417 case LL_IOC_FLUSHCTX:
2418 RETURN(ll_flush_ctx(inode));
2419 case LL_IOC_PATH2FID: {
2420 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2421 sizeof(struct lu_fid)))
2426 case OBD_IOC_FID2PATH:
2427 RETURN(ll_fid2path(inode, (void *)arg));
2428 case LL_IOC_DATA_VERSION: {
2429 struct ioc_data_version idv;
2432 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* only the flush flags are meaningful from userspace */
2435 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2436 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2438 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2444 case LL_IOC_GET_MDTIDX: {
2447 mdtidx = ll_get_mdt_idx(inode);
2451 if (put_user((int)mdtidx, (int*)arg))
2456 case OBD_IOC_GETDTNAME:
2457 case OBD_IOC_GETMDNAME:
2458 RETURN(ll_get_obd_name(inode, cmd, arg));
2459 case LL_IOC_HSM_STATE_GET: {
2460 struct md_op_data *op_data;
2461 struct hsm_user_state *hus;
2468 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2469 LUSTRE_OPC_ANY, hus);
2470 if (IS_ERR(op_data)) {
2472 RETURN(PTR_ERR(op_data));
2475 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2478 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2481 ll_finish_md_op_data(op_data);
2485 case LL_IOC_HSM_STATE_SET: {
2486 struct hsm_state_set *hss;
2493 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2498 rc = ll_hsm_state_set(inode, hss);
2503 case LL_IOC_HSM_ACTION: {
2504 struct md_op_data *op_data;
2505 struct hsm_current_action *hca;
2512 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2513 LUSTRE_OPC_ANY, hca);
2514 if (IS_ERR(op_data)) {
2516 RETURN(PTR_ERR(op_data));
2519 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2522 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2525 ll_finish_md_op_data(op_data);
2529 case LL_IOC_SET_LEASE: {
2530 struct ll_inode_info *lli = ll_i2info(inode);
2531 struct obd_client_handle *och = NULL;
/* requested lease mode must match the file's open mode */
2537 if (!(file->f_mode & FMODE_WRITE))
2542 if (!(file->f_mode & FMODE_READ))
2547 mutex_lock(&lli->lli_och_mutex);
2548 if (fd->fd_lease_och != NULL) {
2549 och = fd->fd_lease_och;
2550 fd->fd_lease_och = NULL;
2552 mutex_unlock(&lli->lli_och_mutex);
2555 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2556 rc = ll_lease_close(och, inode, &lease_broken);
2557 if (rc == 0 && lease_broken)
2563 /* return the type of lease or error */
2564 RETURN(rc < 0 ? rc : (int)mode);
2569 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2571 /* apply for lease */
2572 och = ll_lease_open(inode, file, mode, 0);
2574 RETURN(PTR_ERR(och));
2577 mutex_lock(&lli->lli_och_mutex);
2578 if (fd->fd_lease_och == NULL) {
2579 fd->fd_lease_och = och;
2582 mutex_unlock(&lli->lli_och_mutex);
2584 /* impossible now that only excl is supported for now */
2585 ll_lease_close(och, inode, &lease_broken);
2590 case LL_IOC_GET_LEASE: {
2591 struct ll_inode_info *lli = ll_i2info(inode);
2592 struct ldlm_lock *lock = NULL;
2595 mutex_lock(&lli->lli_och_mutex);
2596 if (fd->fd_lease_och != NULL) {
2597 struct obd_client_handle *och = fd->fd_lease_och;
2599 lock = ldlm_handle2lock(&och->och_lease_handle);
2601 lock_res_and_lock(lock);
/* a cancelled lease lock is no longer a valid lease */
2602 if (!ldlm_is_cancel(lock))
2603 rc = och->och_flags &
2604 (FMODE_READ | FMODE_WRITE);
2605 unlock_res_and_lock(lock);
2606 LDLM_LOCK_PUT(lock);
2609 mutex_unlock(&lli->lli_och_mutex);
2612 case LL_IOC_HSM_IMPORT: {
2613 struct hsm_user_import *hui;
2619 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2624 rc = ll_hsm_import(inode, file, hui);
/* unrecognized cmd: try dynamically-registered handlers, then OST */
2634 ll_iocontrol_call(inode, file, cmd, arg, &err))
2637 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/* Compat path for kernels without generic_file_llseek_size(). */
2643 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate @offset against the file's limits and commit it to f_pos,
 * resetting f_version when the position actually changes.
 */
2644 static inline loff_t
2645 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2647 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2649 if (offset > maxsize)
2652 if (offset != file->f_pos) {
2653 file->f_pos = offset;
2654 file->f_version = 0;
/*
 * Local copy of the kernel's generic_file_llseek_size(): compute the
 * new position for SEEK_SET/CUR/END (and HOLE/DATA against a virtual
 * end-of-file hole) bounded by @maxsize, using @eof as the file size.
 */
2660 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2661 loff_t maxsize, loff_t eof)
2663 struct inode *inode = file->f_dentry->d_inode;
2671 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2672 * position-querying operation. Avoid rewriting the "same"
2673 * f_pos value back to the file because a concurrent read(),
2674 * write() or lseek() might have altered it
2679 * f_lock protects against read/modify/write race with other
2680 * SEEK_CURs. Note that parallel writes and reads behave
/* i_mutex serializes the read-modify-write of f_pos for SEEK_CUR */
2683 mutex_lock(&inode->i_mutex);
2684 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2685 mutex_unlock(&inode->i_mutex);
2689 * In the generic case the entire file is data, so as long as
2690 * offset isn't at the end of the file then the offset is data.
2697 * There is a virtual hole at the end of the file, so as long as
2698 * offset isn't i_size or larger, return i_size.
2706 return llseek_execute(file, offset, maxsize);
/*
 * llseek() file operation.  For size-relative seeks (SEEK_END, SEEK_HOLE,
 * SEEK_DATA) it first refreshes the inode size from the OSTs via
 * ll_glimpse_size(), then delegates to ll_generic_file_llseek_size()
 * bounded by ll_file_maxbytes().
 */
2710 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2712 struct inode *inode = file->f_dentry->d_inode;
2713 loff_t retval, eof = 0;
/* precompute target purely for the trace message below */
2716 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2717 (origin == SEEK_CUR) ? file->f_pos : 0);
2718 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2719 PFID(ll_inode2fid(inode)), inode, retval, retval,
2721 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2723 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
/* size-dependent seek: make i_size current before using it */
2724 retval = ll_glimpse_size(inode);
2727 eof = i_size_read(inode);
2730 retval = ll_generic_file_llseek_size(file, offset, origin,
2731 ll_file_maxbytes(inode), eof);
/*
 * flush() file operation (called on close(2)).
 * Collects any asynchronous writeback error recorded on the inode and in
 * the cl object, and reports it to the caller as -EIO — but only once per
 * file descriptor: fd_write_failed suppresses duplicate reports.
 */
2735 static int ll_flush(struct file *file, fl_owner_t id)
2737 struct inode *inode = file->f_dentry->d_inode;
2738 struct ll_inode_info *lli = ll_i2info(inode);
2739 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2742 LASSERT(!S_ISDIR(inode->i_mode));
2744 /* catch async errors that were recorded back when async writeback
2745 * failed for pages in this mapping. */
2746 rc = lli->lli_async_rc;
/* consume the recorded error so it is not reported twice */
2747 lli->lli_async_rc = 0;
2748 if (lli->lli_clob != NULL) {
2749 err = lov_read_and_clear_async_rc(lli->lli_clob);
2754 /* The application has been told write failure already.
2755 * Do not report failure again. */
2756 if (fd->fd_write_failed)
2758 return rc ? -EIO : 0;
2762 * Called to make sure a portion of file has been written out.
2763 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2765 * Return how many pages have been written.
/*
 * Builds a CIT_FSYNC cl_io over [start, end] of the inode's cl object and
 * runs it inside a nested cl environment; on success the number of pages
 * written (fio->fi_nr_written) is returned via @result.
 * NOTE(review): RETURN paths and capa release are among the lines missing
 * from this excerpt.
 */
2767 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2768 enum cl_fsync_mode mode, int ignore_layout)
2770 struct cl_env_nest nest;
2773 struct obd_capa *capa = NULL;
2774 struct cl_fsync_io *fio;
/* reject any mode outside the four supported fsync modes */
2778 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2779 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2782 env = cl_env_nested_get(&nest);
2784 RETURN(PTR_ERR(env));
2786 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2788 io = ccc_env_thread_io(env);
2789 io->ci_obj = cl_i2info(inode)->lli_clob;
2790 io->ci_ignore_layout = ignore_layout;
2792 /* initialize parameters for sync */
2793 fio = &io->u.ci_fsync;
2794 fio->fi_capa = capa;
2795 fio->fi_start = start;
2797 fio->fi_fid = ll_inode2fid(inode);
2798 fio->fi_mode = mode;
2799 fio->fi_nr_written = 0;
2801 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2802 result = cl_io_loop(env, io);
2804 result = io->ci_result;
2806 result = fio->fi_nr_written;
2807 cl_io_fini(env, io);
2808 cl_env_nested_put(&nest, env);
2816 * When dentry is provided (the 'else' case), *file->f_dentry may be
2817 * null and dentry must be used directly rather than pulled from
2818 * *file->f_dentry as is done otherwise.
/*
 * fsync() file operation.  The three #ifdef signatures track kernel API
 * changes (4-arg range fsync, 2-arg, and the old dentry-taking variant).
 * Sequence: wait for dirty pages, collect recorded async write errors,
 * MDC sync (md_fsync), then an OST-side CL_FSYNC_ALL range sync for
 * regular files, updating fd_write_failed accordingly.
 */
2821 #ifdef HAVE_FILE_FSYNC_4ARGS
2822 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2824 struct dentry *dentry = file->f_dentry;
2825 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2826 int ll_fsync(struct file *file, int datasync)
2828 struct dentry *dentry = file->f_dentry;
2830 loff_t end = LLONG_MAX;
2832 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2835 loff_t end = LLONG_MAX;
2837 struct inode *inode = dentry->d_inode;
2838 struct ll_inode_info *lli = ll_i2info(inode);
2839 struct ptlrpc_request *req;
2840 struct obd_capa *oc;
2844 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2845 PFID(ll_inode2fid(inode)), inode);
2846 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2848 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg variant must take i_mutex itself; older kernels hold it already */
2849 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2850 mutex_lock(&inode->i_mutex);
2852 /* fsync's caller has already called _fdata{sync,write}, we want
2853 * that IO to finish before calling the osc and mdc sync methods */
2854 rc = filemap_fdatawait(inode->i_mapping);
2857 /* catch async errors that were recorded back when async writeback
2858 * failed for pages in this mapping. */
2859 if (!S_ISDIR(inode->i_mode)) {
2860 err = lli->lli_async_rc;
2861 lli->lli_async_rc = 0;
2864 err = lov_read_and_clear_async_rc(lli->lli_clob);
2869 oc = ll_mdscapa_get(inode);
2870 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2876 ptlrpc_req_finished(req);
2878 if (S_ISREG(inode->i_mode)) {
2879 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2881 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2882 if (rc == 0 && err < 0)
/* remember per-fd whether the data sync failed, for ll_flush() */
2885 fd->fd_write_failed = true;
2887 fd->fd_write_failed = false;
2890 #ifdef HAVE_FILE_FSYNC_4ARGS
2891 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock file operation for -o flock mounts.
 * Translates a VFS struct file_lock into an LDLM_FLOCK enqueue against
 * the MDT, then mirrors the granted/released state into the local VFS
 * lock lists (flock_lock_file_wait / posix_lock_file_wait) so the kernel
 * bookkeeping matches the cluster-wide state.  If the local step fails,
 * the cluster lock is dropped again with an LCK_NL (unlock) enqueue.
 * NOTE(review): the switch statements on fl_type and cmd are partially
 * missing from this excerpt (case labels not visible).
 */
2897 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2899 struct inode *inode = file->f_dentry->d_inode;
2900 struct ll_sb_info *sbi = ll_i2sbi(inode);
2901 struct ldlm_enqueue_info einfo = {
2902 .ei_type = LDLM_FLOCK,
2903 .ei_cb_cp = ldlm_flock_completion_ast,
2904 .ei_cbdata = file_lock,
2906 struct md_op_data *op_data;
2907 struct lustre_handle lockh = {0};
2908 ldlm_policy_data_t flock = {{0}};
2909 int fl_type = file_lock->fl_type;
2915 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2916 PFID(ll_inode2fid(inode)), file_lock);
2918 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2920 if (file_lock->fl_flags & FL_FLOCK) {
2921 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2922 /* flocks are whole-file locks */
2923 flock.l_flock.end = OFFSET_MAX;
2924 /* For flocks owner is determined by the local file descriptor */
2925 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2926 } else if (file_lock->fl_flags & FL_POSIX) {
2927 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2928 flock.l_flock.start = file_lock->fl_start;
2929 flock.l_flock.end = file_lock->fl_end;
2933 flock.l_flock.pid = file_lock->fl_pid;
2935 /* Somewhat ugly workaround for svc lockd.
2936 * lockd installs custom fl_lmops->lm_compare_owner that checks
2937 * for the fl_owner to be the same (which it always is on local node
2938 * I guess between lockd processes) and then compares pid.
2939 * As such we assign pid to the owner field to make it all work,
2940 * conflict with normal locks is unlikely since pid space and
2941 * pointer space for current->files are not intersecting */
2942 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2943 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2947 einfo.ei_mode = LCK_PR;
2950 /* An unlock request may or may not have any relation to
2951 * existing locks so we may not be able to pass a lock handle
2952 * via a normal ldlm_lock_cancel() request. The request may even
2953 * unlock a byte range in the middle of an existing lock. In
2954 * order to process an unlock request we need all of the same
2955 * information that is given with a normal read or write record
2956 * lock request. To avoid creating another ldlm unlock (cancel)
2957 * message we'll treat a LCK_NL flock request as an unlock. */
2958 einfo.ei_mode = LCK_NL;
2961 einfo.ei_mode = LCK_PW;
2964 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
2979 flags = LDLM_FL_BLOCK_NOWAIT;
2985 flags = LDLM_FL_TEST_LOCK;
2988 CERROR("unknown fcntl lock command: %d\n", cmd);
2992 /* Save the old mode so that if the mode in the lock changes we
2993 * can decrement the appropriate reader or writer refcount. */
2994 file_lock->fl_type = einfo.ei_mode;
2996 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2997 LUSTRE_OPC_ANY, NULL);
2998 if (IS_ERR(op_data))
2999 RETURN(PTR_ERR(op_data));
3001 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3002 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3003 flock.l_flock.pid, flags, einfo.ei_mode,
3004 flock.l_flock.start, flock.l_flock.end);
3006 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3009 /* Restore the file lock type if not TEST lock. */
3010 if (!(flags & LDLM_FL_TEST_LOCK))
3011 file_lock->fl_type = fl_type;
/* mirror the server decision into the kernel's local lock lists */
3013 if ((file_lock->fl_flags & FL_FLOCK) &&
3014 (rc == 0 || file_lock->fl_type == F_UNLCK))
3015 rc2 = flock_lock_file_wait(file, file_lock);
3016 if ((file_lock->fl_flags & FL_POSIX) &&
3017 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3018 !(flags & LDLM_FL_TEST_LOCK))
3019 rc2 = posix_lock_file_wait(file, file_lock);
3021 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* local bookkeeping failed: release the cluster lock to stay consistent */
3022 einfo.ei_mode = LCK_NL;
3023 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3028 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name (length @namelen) under directory @parent via
 * an MDS getattr-by-name RPC.
 * \param[out] fid  filled from the reply's mbo_fid1 on success
 * \retval 0 on success, negative errno otherwise (-EFAULT if the reply
 *         body is missing).
 */
3033 int ll_get_fid_by_name(struct inode *parent, const char *name,
3034 int namelen, struct lu_fid *fid)
3036 struct md_op_data *op_data = NULL;
3037 struct mdt_body *body;
3038 struct ptlrpc_request *req;
3042 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3043 LUSTRE_OPC_ANY, NULL);
3044 if (IS_ERR(op_data))
3045 RETURN(PTR_ERR(op_data));
/* only the FID is needed from the reply */
3047 op_data->op_valid = OBD_MD_FLID;
3048 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3049 ll_finish_md_op_data(op_data);
3053 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3055 GOTO(out_req, rc = -EFAULT);
3057 *fid = body->mbo_fid1;
3059 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx.
 * Resolves the child FID (from the dcache if possible, otherwise by a
 * getattr-by-name RPC), skips the migration if the object already lives
 * on the target MDT, then issues a CLI_MIGRATE rename to itself.
 * Cached aliases of the child are invalidated and, on success, its nlink
 * is cleared locally since the old inode is replaced on the new MDT.
 */
3063 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3064 const char *name, int namelen)
3066 struct dentry *dchild = NULL;
3067 struct inode *child_inode = NULL;
3068 struct md_op_data *op_data;
3069 struct ptlrpc_request *request = NULL;
3074 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3075 name, PFID(ll_inode2fid(parent)), mdtidx);
3077 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3078 0, LUSTRE_OPC_ANY, NULL);
3079 if (IS_ERR(op_data))
3080 RETURN(PTR_ERR(op_data));
3082 /* Get child FID first */
3083 qstr.hash = full_name_hash(name, namelen);
/* try the dcache first to avoid an extra RPC */
3086 dchild = d_lookup(file->f_dentry, &qstr);
3087 if (dchild != NULL && dchild->d_inode != NULL) {
3088 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3089 if (dchild->d_inode != NULL) {
3090 child_inode = igrab(dchild->d_inode);
3091 ll_invalidate_aliases(child_inode);
3095 rc = ll_get_fid_by_name(parent, name, namelen,
3101 if (!fid_is_sane(&op_data->op_fid3)) {
3102 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3103 ll_get_fsname(parent->i_sb, NULL, 0), name,
3104 PFID(&op_data->op_fid3));
3105 GOTO(out_free, rc = -EINVAL);
3108 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* nothing to do if the object is already on the target MDT */
3113 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3114 PFID(&op_data->op_fid3), mdtidx);
3115 GOTO(out_free, rc = 0);
3118 op_data->op_mds = mdtidx;
3119 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is implemented as a rename onto the same name */
3120 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3121 namelen, name, namelen, &request);
3123 ll_update_times(request, parent);
3125 ptlrpc_req_finished(request);
3130 if (child_inode != NULL) {
3131 clear_nlink(child_inode);
3135 ll_finish_md_op_data(op_data);
3140 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3148 * test if some locks matching bits and l_req_mode are acquired
3149 * - bits can be in different locks
3150 * - if found clear the common lock bits in *bits
3151 * - the bits not found, are kept in *bits
3153 * \param bits [IN] searched lock bits [IN]
3154 * \param l_req_mode [IN] searched lock mode
3155 * \retval boolean, true iff all bits are found
3157 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3159 struct lustre_handle lockh;
3160 ldlm_policy_data_t policy;
	/* LCK_MINMODE means "any mode": match against all four modes */
3161 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3162 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3171 fid = &ll_i2info(inode)->lli_fid;
3172 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3173 ldlm_lockname[mode]);
	/* TEST_LOCK: probe only, do not take a reference on a matched lock */
3175 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3176 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3177 policy.l_inodebits.bits = *bits & (1 << i);
3178 if (policy.l_inodebits.bits == 0)
3181 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3182 &policy, mode, &lockh)) {
3183 struct ldlm_lock *lock;
3185 lock = ldlm_handle2lock(&lockh);
3188 ~(lock->l_policy_data.l_inodebits.bits);
3189 LDLM_LOCK_PUT(lock);
3191 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match an existing granted MD inodebits lock covering @bits.
 * Unlike ll_have_md_lock() this takes a reference on the matched lock
 * (handle returned in @lockh); caller must decref when done.
 * \retval the matched lock mode, or 0 if no lock matched.
 */
3198 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3199 struct lustre_handle *lockh, __u64 flags,
3202 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3207 fid = &ll_i2info(inode)->lli_fid;
3208 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3210 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3211 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC.
 * -ENOENT (object unlinked on the MDS) is converted to success for
 * regular files and directories; other errors are logged (quietly for
 * expected -EACCES/-EIDRM) and propagated.
 */
3216 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3218 /* Already unlinked. Just update nlink and return success */
3219 if (rc == -ENOENT) {
3221 /* This path cannot be hit for regular files unless in
3222 * case of obscure races, so no need to to validate
3224 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3226 } else if (rc != 0) {
3227 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3228 "%s: revalidate FID "DFID" error: rc = %d\n",
3229 ll_get_fsname(inode->i_sb, NULL, 0),
3230 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes against the MDS for the lock
 * bits in @ibits.  Two strategies:
 *  - OBD_CONNECT_ATTRFID servers: getattr-by-fid via an intent lock
 *    (IT_GETATTR, or IT_LOOKUP when only the lookup bit is wanted);
 *  - otherwise: plain md_getattr, but only when no matching MD lock is
 *    already cached locally (ll_have_md_lock).
 */
3236 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3238 struct inode *inode = dentry->d_inode;
3239 struct ptlrpc_request *req = NULL;
3240 struct obd_export *exp;
3244 LASSERT(inode != NULL);
3246 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3247 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3249 exp = ll_i2mdexp(inode);
3251 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3252 * But under CMD case, it caused some lock issues, should be fixed
3253 * with new CMD ibits lock. See bug 12718 */
3254 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3255 struct lookup_intent oit = { .it_op = IT_GETATTR };
3256 struct md_op_data *op_data;
3258 if (ibits == MDS_INODELOCK_LOOKUP)
3259 oit.it_op = IT_LOOKUP;
3261 /* Call getattr by fid, so do not provide name at all. */
3262 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3263 dentry->d_inode, NULL, 0, 0,
3264 LUSTRE_OPC_ANY, NULL);
3265 if (IS_ERR(op_data))
3266 RETURN(PTR_ERR(op_data));
3268 rc = md_intent_lock(exp, op_data, &oit, &req,
3269 &ll_md_blocking_ast, 0);
3270 ll_finish_md_op_data(op_data);
3272 rc = ll_inode_revalidate_fini(inode, rc);
3276 rc = ll_revalidate_it_finish(req, &oit, dentry);
3278 ll_intent_release(&oit);
3282 /* Unlinked? Unhash dentry, so it is not picked up later by
3283 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3284 here to preserve get_cwd functionality on 2.6.
3286 if (!dentry->d_inode->i_nlink)
3287 d_lustre_invalidate(dentry, 0);
3289 ll_lookup_finish_locks(&oit, dentry);
3290 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3291 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3292 obd_valid valid = OBD_MD_FLGETATTR;
3293 struct md_op_data *op_data;
	/* regular files also need striping info sized via default mdsize */
3296 if (S_ISREG(inode->i_mode)) {
3297 rc = ll_get_default_mdsize(sbi, &ealen);
3300 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3303 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3304 0, ealen, LUSTRE_OPC_ANY,
3306 if (IS_ERR(op_data))
3307 RETURN(PTR_ERR(op_data));
3309 op_data->op_valid = valid;
3310 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3311 * capa for this inode. Because we only keep capas of dirs
3313 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3314 ll_finish_md_op_data(op_data);
3316 rc = ll_inode_revalidate_fini(inode, rc);
3320 rc = ll_prep_inode(&inode, req, NULL, NULL);
3323 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes
 * (md_merge_attr) into the master inode's ll_inode_info: aggregated
 * size/nlink plus the a/m/ctime in the cached lvb.
 */
3327 static int ll_merge_md_attr(struct inode *inode)
3329 struct cl_attr attr = { 0 };
3332 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3333 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3338 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3339 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3341 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3342 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3343 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MD attributes via __ll_inode_revalidate(),
 * then refresh size information — merged stripe attrs for striped
 * directories, glimpse from the OSTs for regular files (skipped while an
 * HSM restore is running, see comment below).
 */
3349 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3351 struct inode *inode = dentry->d_inode;
3355 rc = __ll_inode_revalidate(dentry, ibits);
3359 /* if object isn't regular file, don't validate size */
3360 if (!S_ISREG(inode->i_mode)) {
3361 if (S_ISDIR(inode->i_mode) &&
3362 ll_i2info(inode)->lli_lsm_md != NULL) {
3363 rc = ll_merge_md_attr(inode);
	/* propagate cached lvb timestamps into the VFS inode */
3368 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3369 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3370 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3372 /* In case of restore, the MDT has the right size and has
3373 * already send it back without granting the layout lock,
3374 * inode is up-to-date so glimpse is useless.
3375 * Also to glimpse we need the layout, in case of a running
3376 * restore the MDT holds the layout lock so the glimpse will
3377 * block up to the end of restore (getattr will block)
3379 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3380 rc = ll_glimpse_size(inode);
/*
 * getattr() inode operation: revalidate UPDATE|LOOKUP bits, then copy
 * the inode's attributes into @stat.  Striped directories report the
 * merged per-stripe nlink/size cached in ll_inode_info.
 */
3385 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3387 struct inode *inode = de->d_inode;
3388 struct ll_sb_info *sbi = ll_i2sbi(inode);
3389 struct ll_inode_info *lli = ll_i2info(inode);
3392 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3393 MDS_INODELOCK_LOOKUP);
3394 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3399 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland needs an ino that fits in 32 bits */
3400 if (ll_need_32bit_api(sbi))
3401 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3403 stat->ino = inode->i_ino;
3404 stat->mode = inode->i_mode;
3405 stat->uid = inode->i_uid;
3406 stat->gid = inode->i_gid;
3407 stat->rdev = inode->i_rdev;
3408 stat->atime = inode->i_atime;
3409 stat->mtime = inode->i_mtime;
3410 stat->ctime = inode->i_ctime;
3411 stat->blksize = 1 << inode->i_blkbits;
3412 stat->blocks = inode->i_blocks;
3414 if (S_ISDIR(inode->i_mode) &&
3415 ll_i2info(inode)->lli_lsm_md != NULL) {
3416 stat->nlink = lli->lli_stripe_dir_nlink;
3417 stat->size = lli->lli_stripe_dir_size;
3419 stat->nlink = inode->i_nlink;
3420 stat->size = i_size_read(inode);
/*
 * fiemap() inode operation: marshal the kernel's fiemap_extent_info into
 * a Lustre ll_user_fiemap buffer, run ll_do_fiemap(), and copy flags,
 * mapped-extent count and extents back.
 * NOTE(review): num_bytes is sizeof(*fiemap) + extent_count extents; the
 * overflow check on that multiplication, if any, is not visible in this
 * excerpt.
 */
3426 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3427 __u64 start, __u64 len)
3431 struct ll_user_fiemap *fiemap;
3432 unsigned int extent_count = fieinfo->fi_extents_max;
3434 num_bytes = sizeof(*fiemap) + (extent_count *
3435 sizeof(struct ll_fiemap_extent));
3436 OBD_ALLOC_LARGE(fiemap, num_bytes);
3441 fiemap->fm_flags = fieinfo->fi_flags;
3442 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3443 fiemap->fm_start = start;
3444 fiemap->fm_length = len;
/* seed only the first extent from userland (continuation support) */
3445 if (extent_count > 0)
3446 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3447 sizeof(struct ll_fiemap_extent))
3449 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3451 fieinfo->fi_flags = fiemap->fm_flags;
3452 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3453 if (extent_count > 0)
3454 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3455 fiemap->fm_mapped_extents *
3456 sizeof(struct ll_fiemap_extent));
3458 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode,
 * taken under lli_lock.  @type is accepted for the VFS signature.
 */
3462 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3464 struct ll_inode_info *lli = ll_i2info(inode);
3465 struct posix_acl *acl = NULL;
3468 spin_lock(&lli->lli_lock);
3469 /* VFS' acl_permission_check->check_acl will release the refcount */
3470 acl = posix_acl_dup(lli->lli_posix_acl);
3471 spin_unlock(&lli->lli_lock);
/*
 * check_acl callback for kernels whose generic_permission() takes a
 * callback (no 2-arg variant).  Returns the posix_acl_permission()
 * result for the cached ACL; under RCU walk (IPERM_FLAG_RCU) it bails
 * out since taking lli_lock would be unsafe.
 * NOTE(review): the -EAGAIN/-ENODATA style early returns are among the
 * lines missing from this excerpt.
 */
3476 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3478 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3479 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3481 ll_check_acl(struct inode *inode, int mask)
3484 # ifdef CONFIG_FS_POSIX_ACL
3485 struct posix_acl *acl;
3489 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3490 if (flags & IPERM_FLAG_RCU)
3493 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3498 rc = posix_acl_permission(inode, acl, mask);
3499 posix_acl_release(acl);
3502 # else /* !CONFIG_FS_POSIX_ACL */
3504 # endif /* CONFIG_FS_POSIX_ACL */
3506 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission() inode operation (signature varies with kernel version).
 * Revalidates the root inode on first access, applies root squashing by
 * temporarily overriding the task credentials (fsuid/fsgid and FS
 * capabilities), then checks permission either remotely
 * (LL_SBI_RMT_CLIENT) or via generic_permission with ll_check_acl.
 */
3508 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3509 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3511 # ifdef HAVE_INODE_PERMISION_2ARGS
3512 int ll_inode_permission(struct inode *inode, int mask)
3514 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3519 struct ll_sb_info *sbi;
3520 struct root_squash_info *squash;
3521 struct cred *cred = NULL;
3522 const struct cred *old_cred = NULL;
3524 bool squash_id = false;
/* cannot block under RCU-walk: tell the VFS to retry in ref-walk mode */
3527 #ifdef MAY_NOT_BLOCK
3528 if (mask & MAY_NOT_BLOCK)
3530 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3531 if (flags & IPERM_FLAG_RCU)
3535 /* as root inode are NOT getting validated in lookup operation,
3536 * need to do it before permission check. */
3538 if (inode == inode->i_sb->s_root->d_inode) {
3539 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3540 MDS_INODELOCK_LOOKUP);
3545 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3546 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3548 /* squash fsuid/fsgid if needed */
3549 sbi = ll_i2sbi(inode);
3550 squash = &sbi->ll_squash;
3551 if (unlikely(squash->rsi_uid != 0 &&
3552 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3553 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3557 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3558 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3559 squash->rsi_uid, squash->rsi_gid);
3561 /* update current process's credentials
3562 * and FS capability */
3563 cred = prepare_creds();
3567 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3568 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability from the squashed creds */
3569 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3570 if ((1 << cap) & CFS_CAP_FS_MASK)
3571 cap_lower(cred->cap_effective, cap);
3573 old_cred = override_creds(cred);
3576 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3578 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3579 rc = lustre_check_remote_perm(inode, mask);
3581 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3583 /* restore current process's credentials and FS capability */
3585 revert_creds(old_cred);
/*
 * Three file_operations tables selected at mount time by the flock
 * option: default (localflock — locally consistent locks only),
 * -o flock (cluster-wide locking via ll_file_flock), and
 * -o noflock (lock calls rejected via ll_file_noflock).
 */
3592 /* -o localflock - only provides locally consistent flock locks */
3593 struct file_operations ll_file_operations = {
3594 .read = ll_file_read,
3595 .aio_read = ll_file_aio_read,
3596 .write = ll_file_write,
3597 .aio_write = ll_file_aio_write,
3598 .unlocked_ioctl = ll_file_ioctl,
3599 .open = ll_file_open,
3600 .release = ll_file_release,
3601 .mmap = ll_file_mmap,
3602 .llseek = ll_file_seek,
3603 .splice_read = ll_file_splice_read,
3608 struct file_operations ll_file_operations_flock = {
3609 .read = ll_file_read,
3610 .aio_read = ll_file_aio_read,
3611 .write = ll_file_write,
3612 .aio_write = ll_file_aio_write,
3613 .unlocked_ioctl = ll_file_ioctl,
3614 .open = ll_file_open,
3615 .release = ll_file_release,
3616 .mmap = ll_file_mmap,
3617 .llseek = ll_file_seek,
3618 .splice_read = ll_file_splice_read,
3621 .flock = ll_file_flock,
3622 .lock = ll_file_flock
3625 /* These are for -o noflock - to return ENOSYS on flock calls */
3626 struct file_operations ll_file_operations_noflock = {
3627 .read = ll_file_read,
3628 .aio_read = ll_file_aio_read,
3629 .write = ll_file_write,
3630 .aio_write = ll_file_aio_write,
3631 .unlocked_ioctl = ll_file_ioctl,
3632 .open = ll_file_open,
3633 .release = ll_file_release,
3634 .mmap = ll_file_mmap,
3635 .llseek = ll_file_seek,
3636 .splice_read = ll_file_splice_read,
3639 .flock = ll_file_noflock,
3640 .lock = ll_file_noflock
/* inode operations shared by all regular-file mounts */
3643 struct inode_operations ll_file_inode_operations = {
3644 .setattr = ll_setattr,
3645 .getattr = ll_getattr,
3646 .permission = ll_inode_permission,
3647 .setxattr = ll_setxattr,
3648 .getxattr = ll_getxattr,
3649 .listxattr = ll_listxattr,
3650 .removexattr = ll_removexattr,
3651 .fiemap = ll_fiemap,
3652 #ifdef HAVE_IOP_GET_ACL
3653 .get_acl = ll_get_acl,
3657 /* dynamic ioctl number support routines */
/* global registry of dynamically registered ioctl handlers, protected by
 * ioc_sem (readers iterate in ll_iocontrol_call, writers register/unregister) */
3658 static struct llioc_ctl_data {
3659 struct rw_semaphore ioc_sem;
3660 struct list_head ioc_head;
3662 __RWSEM_INITIALIZER(llioc.ioc_sem),
3663 LIST_HEAD_INIT(llioc.ioc_head)
/* one registered handler: callback plus its array of ioctl command numbers
 * (iocd_cmd is a flexible trailing array; iocd_size is the full allocation) */
3668 struct list_head iocd_list;
3669 unsigned int iocd_size;
3670 llioc_callback_t iocd_cb;
3671 unsigned int iocd_count;
3672 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler: callback @cb serving @count command
 * numbers from @cmd.  Returns an opaque cookie (the allocation) for
 * later ll_iocontrol_unregister(), or NULL on bad args / alloc failure.
 */
3675 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3678 struct llioc_data *in_data = NULL;
3681 if (cb == NULL || cmd == NULL ||
3682 count > LLIOC_MAX_CMD || count < 0)
/* allocation covers the struct plus the trailing command array */
3685 size = sizeof(*in_data) + count * sizeof(unsigned int);
3686 OBD_ALLOC(in_data, size);
3687 if (in_data == NULL)
3690 memset(in_data, 0, sizeof(*in_data));
3691 in_data->iocd_size = size;
3692 in_data->iocd_cb = cb;
3693 in_data->iocd_count = count;
3694 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3696 down_write(&llioc.ioc_sem);
3697 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3698 up_write(&llioc.ioc_sem);
/*
 * Unregister a handler previously returned by ll_iocontrol_register().
 * @magic is the cookie (allocation pointer); a warning is logged if it
 * is not found in the registry.
 */
3703 void ll_iocontrol_unregister(void *magic)
3705 struct llioc_data *tmp;
3710 down_write(&llioc.ioc_sem);
3711 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3713 unsigned int size = tmp->iocd_size;
3715 list_del(&tmp->iocd_list);
3716 up_write(&llioc.ioc_sem);
3718 OBD_FREE(tmp, size);
3722 up_write(&llioc.ioc_sem);
3724 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3727 EXPORT_SYMBOL(ll_iocontrol_register);
3728 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch @cmd to registered dynamic ioctl handlers under the read
 * semaphore; the first handler returning LLIOC_STOP ends iteration and
 * its rc (written to *rcp) is the result.
 */
3730 static enum llioc_iter
3731 ll_iocontrol_call(struct inode *inode, struct file *file,
3732 unsigned int cmd, unsigned long arg, int *rcp)
3734 enum llioc_iter ret = LLIOC_CONT;
3735 struct llioc_data *data;
3736 int rc = -EINVAL, i;
3738 down_read(&llioc.ioc_sem);
3739 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3740 for (i = 0; i < data->iocd_count; i++) {
3741 if (cmd != data->iocd_cmd[i])
3744 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3748 if (ret == LLIOC_STOP)
3751 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration @conf into the inode's cl object via
 * cl_conf_set().  For OBJECT_CONF_SET the new layout generation is
 * recorded and the layout lock is made matchable only after the layout
 * has been applied (see comment below).
 */
3758 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3760 struct ll_inode_info *lli = ll_i2info(inode);
3761 struct cl_env_nest nest;
3766 if (lli->lli_clob == NULL)
3769 env = cl_env_nested_get(&nest);
3771 RETURN(PTR_ERR(env));
3773 result = cl_conf_set(env, lli->lli_clob, conf);
3774 cl_env_nested_put(&nest, env);
3776 if (conf->coc_opc == OBJECT_CONF_SET) {
3777 struct ldlm_lock *lock = conf->coc_lock;
3779 LASSERT(lock != NULL);
3780 LASSERT(ldlm_has_layout(lock));
3782 struct lustre_md *md = conf->u.coc_md;
3783 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3785 /* it can only be allowed to match after layout is
3786 * applied to inode otherwise false layout would be
3787 * seen. Applying layout should happen before dropping
3788 * the intent lock. */
3789 ldlm_lock_allow_match(lock);
3791 lli->lli_has_smd = lsm_has_objects(md->lsm);
3792 if (md->lsm != NULL)
3793 gen = md->lsm->lsm_layout_gen;
3796 DFID ": layout version change: %u -> %u\n",
3797 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3799 ll_layout_version_set(lli, gen);
3805 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If the layout lock's LVB is not already populated (LVB_READY), fetch
 * the LOV EA from the MDT via md_getxattr and install a private copy as
 * the lock's l_lvb_data/l_lvb_len under the resource lock.
 */
3806 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3809 struct ll_sb_info *sbi = ll_i2sbi(inode);
3810 struct obd_capa *oc;
3811 struct ptlrpc_request *req;
3812 struct mdt_body *body;
3819 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3820 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3821 lock->l_lvb_data, lock->l_lvb_len);
3823 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3826 /* if layout lock was granted right away, the layout is returned
3827 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3828 * blocked and then granted via completion ast, we have to fetch
3829 * layout here. Please note that we can't use the LVB buffer in
3830 * completion AST because it doesn't have a large enough buffer */
3831 oc = ll_mdscapa_get(inode);
3832 rc = ll_get_default_mdsize(sbi, &lmmsize);
3834 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3835 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3841 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3843 GOTO(out, rc = -EPROTO);
3845 lmmsize = body->mbo_eadatasize;
3846 if (lmmsize == 0) /* empty layout */
3849 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3851 GOTO(out, rc = -EFAULT);
/* keep a private copy: the reply buffer dies with the request */
3853 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3854 if (lvbdata == NULL)
3855 GOTO(out, rc = -ENOMEM);
3857 memcpy(lvbdata, lmm, lmmsize);
3858 lock_res_and_lock(lock);
3859 if (lock->l_lvb_data != NULL)
3860 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3862 lock->l_lvb_data = lvbdata;
3863 lock->l_lvb_len = lmmsize;
3864 unlock_res_and_lock(lock);
3869 ptlrpc_req_finished(req);
3874 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Holding the layout lock referenced by @lockh (mode @mode): make sure
 * the lock's LVB holds the layout (ll_layout_fetch), unpack it into a
 * lustre_md, and configure the inode's cl object with it
 * (OBJECT_CONF_SET).  *gen receives the resulting layout generation.
 * If reconfiguration returns -EBUSY (IO in flight), the lock is dropped
 * and OBJECT_CONF_WAIT blocks until in-flight IO completes.
 */
3877 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3878 struct inode *inode, __u32 *gen, bool reconf)
3880 struct ll_inode_info *lli = ll_i2info(inode);
3881 struct ll_sb_info *sbi = ll_i2sbi(inode);
3882 struct ldlm_lock *lock;
3883 struct lustre_md md = { NULL };
3884 struct cl_object_conf conf;
3887 bool wait_layout = false;
3890 LASSERT(lustre_handle_is_used(lockh));
3892 lock = ldlm_handle2lock(lockh);
3893 LASSERT(lock != NULL);
3894 LASSERT(ldlm_has_layout(lock));
3896 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d\n",
3897 PFID(&lli->lli_fid), inode, reconf);
3899 /* in case this is a caching lock and reinstate with new inode */
3900 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3902 lock_res_and_lock(lock);
3903 lvb_ready = ldlm_is_lvb_ready(lock);
3904 unlock_res_and_lock(lock);
3905 /* checking lvb_ready is racy but this is okay. The worst case is
3906 * that multi processes may configure the file on the same time. */
3908 if (lvb_ready || !reconf) {
3911 /* layout_gen must be valid if layout lock is not
3912 * cancelled and stripe has already set */
3913 *gen = ll_layout_version_get(lli);
3919 rc = ll_layout_fetch(inode, lock);
3923 /* for layout lock, lmm is returned in lock's lvb.
3924 * lvb_data is immutable if the lock is held so it's safe to access it
3925 * without res lock. See the description in ldlm_lock_decref_internal()
3926 * for the condition to free lvb_data of layout lock */
3927 if (lock->l_lvb_data != NULL) {
3928 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3929 lock->l_lvb_data, lock->l_lvb_len);
3931 *gen = LL_LAYOUT_GEN_EMPTY;
3933 *gen = md.lsm->lsm_layout_gen;
3936 CERROR("%s: file "DFID" unpackmd error: %d\n",
3937 ll_get_fsname(inode->i_sb, NULL, 0),
3938 PFID(&lli->lli_fid), rc);
3944 /* set layout to file. Unlikely this will fail as old layout was
3945 * surely eliminated */
3946 memset(&conf, 0, sizeof conf);
3947 conf.coc_opc = OBJECT_CONF_SET;
3948 conf.coc_inode = inode;
3949 conf.coc_lock = lock;
3950 conf.u.coc_md = &md;
3951 rc = ll_layout_conf(inode, &conf);
3954 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3956 /* refresh layout failed, need to wait */
3957 wait_layout = rc == -EBUSY;
3961 LDLM_LOCK_PUT(lock);
3962 ldlm_lock_decref(lockh, mode);
3964 /* wait for IO to complete if it's still being used. */
3966 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3967 ll_get_fsname(inode->i_sb, NULL, 0),
3968 PFID(&lli->lli_fid), inode);
3970 memset(&conf, 0, sizeof conf);
3971 conf.coc_opc = OBJECT_CONF_WAIT;
3972 conf.coc_inode = inode;
3973 rc = ll_layout_conf(inode, &conf);
3977 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3978 ll_get_fsname(inode->i_sb, NULL, 0),
3979 PFID(&lli->lli_fid), rc);
3985 * This function checks if there exists a LAYOUT lock on the client side,
3986 * or enqueues it if it doesn't have one in cache.
3988 * This function will not hold the layout lock, so the lock may be revoked any
3989 * time after this function returns. Any operation that depends on the layout should be redone
3992 * This function should be called before lov_io_init() to get an uptodate
3993 * layout version, the caller should save the version number and after IO
3994 * is finished, this function should be called again to verify that layout
3995 * is not changed during IO time.
3997 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3999 struct ll_inode_info *lli = ll_i2info(inode);
4000 struct ll_sb_info *sbi = ll_i2sbi(inode);
4001 struct md_op_data *op_data;
4002 struct lookup_intent it;
4003 struct lustre_handle lockh;
4005 struct ldlm_enqueue_info einfo = {
4006 .ei_type = LDLM_IBITS,
4008 .ei_cb_bl = &ll_md_blocking_ast,
4009 .ei_cb_cp = &ldlm_completion_ast,
/* Fast path: if layout locks are disabled for this mount, or this inode
 * already has a valid layout generation, report it and do nothing else. */
4014 *gen = ll_layout_version_get(lli);
4015 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
4019 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4020 LASSERT(S_ISREG(inode->i_mode));
4022 /* take layout lock mutex to enqueue layout lock exclusively. */
4023 mutex_lock(&lli->lli_layout_mutex);
4026 /* mostly layout lock is caching on the local side, so try to match
4027 * it before grabbing layout lock mutex. */
4028 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4029 LCK_CR | LCK_CW | LCK_PR | LCK_PW)
4030 if (mode != 0) { /* hit cached lock */
/* A matching layout lock is already cached locally; install the layout
 * it carries and return its generation through *gen. */
4031 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4035 mutex_unlock(&lli->lli_layout_mutex);
/* No cached lock: build op_data to enqueue a fresh layout lock from the MDT. */
4039 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4040 0, 0, LUSTRE_OPC_ANY, NULL);
4041 if (IS_ERR(op_data)) {
4042 mutex_unlock(&lli->lli_layout_mutex);
4043 RETURN(PTR_ERR(op_data));
4046 /* have to enqueue one */
4047 memset(&it, 0, sizeof(it));
4048 it.it_op = IT_LAYOUT;
4049 lockh.cookie = 0ULL;
4051 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)\n",
4052 ll_get_fsname(inode->i_sb, NULL, 0),
4053 PFID(&lli->lli_fid), inode);
4055 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* The enqueue reply request is no longer needed once the intent carries
 * the lock result; drop our reference to it. */
4056 if (it.d.lustre.it_data != NULL)
4057 ptlrpc_req_finished(it.d.lustre.it_data);
4058 it.d.lustre.it_data = NULL;
4060 ll_finish_md_op_data(op_data);
/* Transfer the granted lock mode out of the intent, then clear it so
 * ll_intent_drop_lock() does not release the reference we now own. */
4062 mode = it.d.lustre.it_lock_mode;
4063 it.d.lustre.it_lock_mode = 0;
4064 ll_intent_drop_lock(&it);
4067 /* set lock data in case this is a new lock */
4068 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4069 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4073 mutex_unlock(&lli->lli_layout_mutex);
4079 * This function sends a restore request to the MDT
4081 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4083 struct hsm_user_request *hur;
4087 len = sizeof(struct hsm_user_request) +
4088 sizeof(struct hsm_user_item);
4089 OBD_ALLOC(hur, len);
4093 hur->hur_request.hr_action = HUA_RESTORE;
4094 hur->hur_request.hr_archive_id = 0;
4095 hur->hur_request.hr_flags = 0;
4096 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4097 sizeof(hur->hur_user_item[0].hui_fid));
4098 hur->hur_user_item[0].hui_extent.offset = offset;
4099 hur->hur_user_item[0].hui_extent.length = length;
4100 hur->hur_request.hr_itemcount = 1;
4101 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,