4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2014, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
52 #include "cl_object.h"
/* Forward declarations for static helpers defined later in this file. */
55 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
57 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
60 static enum llioc_iter
61 ll_iocontrol_call(struct inode *inode, struct file *file,
62 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open ll_file_data from its slab cache.  GFP_NOFS avoids
 * re-entering the filesystem during reclaim.
 * NOTE(review): allocation-failure handling and the rest of the field
 * initialisation are on lines elided from this view.
 */
64 static struct ll_file_data *ll_file_data_get(void)
66 struct ll_file_data *fd;
68 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
72 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache (counterpart of ll_file_data_get). */
77 static void ll_file_data_put(struct ll_file_data *fd)
80 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes (mode, times, size, blocks, flags),
 * its IO epoch, the given open handle @fh and an MDS capability into
 * @op_data, so a subsequent close/setattr RPC carries up-to-date state.
 */
83 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
84 struct lustre_handle *fh)
86 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
87 op_data->op_attr.ia_mode = inode->i_mode;
88 op_data->op_attr.ia_atime = inode->i_atime;
89 op_data->op_attr.ia_mtime = inode->i_mtime;
90 op_data->op_attr.ia_ctime = inode->i_ctime;
91 op_data->op_attr.ia_size = i_size_read(inode);
92 op_data->op_attr_blocks = inode->i_blocks;
93 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
94 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
96 op_data->op_handle = *fh;
97 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Tell the MDS the data was modified so it can update its view on close. */
99 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
100 op_data->op_bias |= MDS_DATA_MODIFIED;
104 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Prepare @op_data for a close RPC: mark which attributes are valid,
 * close the IO epoch for write opens, and pack inode state + open handle.
 */
107 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
108 struct obd_client_handle *och)
112 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
113 ATTR_MTIME | ATTR_MTIME_SET |
114 ATTR_CTIME | ATTR_CTIME_SET;
116 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the client
 * sends size/blocks itself. */
119 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
120 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
122 ll_ioepoch_close(inode, op_data, &och, 0);
125 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
126 ll_prep_md_op_data(op_data, inode, NULL, NULL,
127 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS_CLOSE for open handle @och on @inode.  If @data_version is
 * non-NULL this close is an HSM release.  After a successful epoch close a
 * Size-on-MDS update may be required; the handle is finally invalidated
 * with DEAD_HANDLE_MAGIC and the close request reference dropped.
 */
131 static int ll_close_inode_openhandle(struct obd_export *md_exp,
133 struct obd_client_handle *och,
134 const __u64 *data_version)
136 struct obd_export *exp = ll_i2mdexp(inode);
137 struct md_op_data *op_data;
138 struct ptlrpc_request *req = NULL;
139 struct obd_device *obd = class_exp2obd(exp);
146 * XXX: in case of LMV, is this correct to access
149 CERROR("Invalid MDC connection handle "LPX64"\n",
150 ll_i2mdexp(inode)->exp_handle.h_cookie);
154 OBD_ALLOC_PTR(op_data);
156 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
158 ll_prepare_close(inode, op_data, och);
159 if (data_version != NULL) {
160 /* Pass in data_version implies release. */
161 op_data->op_bias |= MDS_HSM_RELEASE;
162 op_data->op_data_version = *data_version;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
166 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
167 rc = md_close(md_exp, op_data, och->och_mod, &req);
169 /* This close must have the epoch closed. */
170 LASSERT(epoch_close);
171 /* MDS has instructed us to obtain Size-on-MDS attribute from
172 * OSTs and send setattr to back to MDS. */
173 rc = ll_som_update(inode, op_data);
175 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
176 " failed: rc = %d\n",
177 ll_i2mdexp(inode)->exp_obd->obd_name,
178 PFID(ll_inode2fid(inode)), rc);
182 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
183 ll_i2mdexp(inode)->exp_obd->obd_name,
184 PFID(ll_inode2fid(inode)), rc);
187 /* DATA_MODIFIED flag was successfully sent on close, cancel data
188 * modification flag. */
189 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
190 struct ll_inode_info *lli = ll_i2info(inode);
192 spin_lock(&lli->lli_lock);
193 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
194 spin_unlock(&lli->lli_lock);
/* For an HSM release, verify the server actually released the file. */
197 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
198 struct mdt_body *body;
199 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
200 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
204 ll_finish_md_op_data(op_data);
/* Epoch not closed yet for a SOM write open: defer via DONE_WRITING. */
208 if (exp_connect_som(exp) && !epoch_close &&
209 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
210 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
212 md_clear_open_replay_data(md_exp, och);
213 /* Free @och if it is not waiting for DONE_WRITING. */
214 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
217 if (req) /* This is close request */
218 ptlrpc_req_finished(req);
/*
 * Close the per-inode MDS open handle that matches @fmode (write/exec/read)
 * once its last user is gone.  If other users still reference the handle
 * (och_usecount > 0) the close is skipped.
 */
222 int ll_md_real_close(struct inode *inode, fmode_t fmode)
224 struct ll_inode_info *lli = ll_i2info(inode);
225 struct obd_client_handle **och_p;
226 struct obd_client_handle *och;
/* Select the handle/usecount pair for this open mode. */
231 if (fmode & FMODE_WRITE) {
232 och_p = &lli->lli_mds_write_och;
233 och_usecount = &lli->lli_open_fd_write_count;
234 } else if (fmode & FMODE_EXEC) {
235 och_p = &lli->lli_mds_exec_och;
236 och_usecount = &lli->lli_open_fd_exec_count;
238 LASSERT(fmode & FMODE_READ);
239 och_p = &lli->lli_mds_read_och;
240 och_usecount = &lli->lli_open_fd_read_count;
243 mutex_lock(&lli->lli_och_mutex);
244 if (*och_usecount > 0) {
245 /* There are still users of this handle, so skip
247 mutex_unlock(&lli->lli_och_mutex);
253 mutex_unlock(&lli->lli_och_mutex);
256 /* There might be a race and this handle may already
258 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-struct-file close: drop group locks and leases held through this
 * file, decrement the mode-specific open count, and — unless a cached
 * OPEN DLM lock lets us skip the RPC — do the real MDS close.  Finally
 * detach and free the ll_file_data.
 */
265 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
268 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
269 struct ll_inode_info *lli = ll_i2info(inode);
273 /* clear group lock, if present */
274 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
275 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
277 if (fd->fd_lease_och != NULL) {
280 /* Usually the lease is not released when the
281 * application crashed, we need to release here. */
282 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
283 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
284 PFID(&lli->lli_fid), rc, lease_broken);
286 fd->fd_lease_och = NULL;
289 if (fd->fd_och != NULL) {
290 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
295 /* Let's see if we have good enough OPEN lock on the file and if
296 we can skip talking to MDS */
297 if (file->f_dentry->d_inode) { /* Can this ever be false? */
299 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
300 struct lustre_handle lockh;
301 struct inode *inode = file->f_dentry->d_inode;
302 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
304 mutex_lock(&lli->lli_och_mutex);
305 if (fd->fd_omode & FMODE_WRITE) {
307 LASSERT(lli->lli_open_fd_write_count);
308 lli->lli_open_fd_write_count--;
309 } else if (fd->fd_omode & FMODE_EXEC) {
311 LASSERT(lli->lli_open_fd_exec_count);
312 lli->lli_open_fd_exec_count--;
315 LASSERT(lli->lli_open_fd_read_count);
316 lli->lli_open_fd_read_count--;
318 mutex_unlock(&lli->lli_och_mutex);
/* No matching OPEN ibits lock cached: must really close on the MDS. */
320 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
321 LDLM_IBITS, &policy, lockmode,
323 rc = ll_md_real_close(file->f_dentry->d_inode,
327 CERROR("released file has negative dentry: file = %p, "
328 "dentry = %p, name = %s\n",
329 file, file->f_dentry, file->f_dentry->d_name.name);
333 LUSTRE_FPRIVATE(file) = NULL;
334 ll_file_data_put(fd);
335 ll_capa_close(inode);
340 /* While this returns an error code, fput() the caller does not, so we need
341 * to make every effort to clean up all of our state here. Also, applications
342 * rarely check close errors and even if an error is returned they will not
343 * re-try the close call.
/*
 * VFS ->release() entry point.  Cleans remote-ACL state, statahead
 * authorization, async write errors, then performs the MDS close via
 * ll_md_close().  The root dentry is special-cased (no MDS open handle).
 */
345 int ll_file_release(struct inode *inode, struct file *file)
347 struct ll_file_data *fd;
348 struct ll_sb_info *sbi = ll_i2sbi(inode);
349 struct ll_inode_info *lli = ll_i2info(inode);
353 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
354 PFID(ll_inode2fid(inode)), inode);
356 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL cleanup is only relevant for the root inode. */
357 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
358 inode == inode->i_sb->s_root->d_inode) {
359 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
362 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
363 fd->fd_flags &= ~LL_FILE_RMTACL;
364 rct_del(&sbi->ll_rct, current_pid());
365 et_search_free(&sbi->ll_et, current_pid());
370 if (inode->i_sb->s_root != file->f_dentry)
371 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
372 fd = LUSTRE_FPRIVATE(file);
375 /* The last ref on @file, maybe not the the owner pid of statahead,
376 * because parent and child process can share the same file handle. */
377 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
378 ll_deauthorize_statahead(inode, fd);
/* Root has no MDS open handle: just free the file data and return. */
380 if (inode->i_sb->s_root == file->f_dentry) {
381 LUSTRE_FPRIVATE(file) = NULL;
382 ll_file_data_put(fd);
/* Propagate any deferred async write error to this close. */
386 if (!S_ISDIR(inode->i_mode)) {
387 if (lli->lli_clob != NULL)
388 lov_read_and_clear_async_rc(lli->lli_clob);
389 lli->lli_async_rc = 0;
392 rc = ll_md_close(sbi->ll_md_exp, inode, file);
394 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
395 libcfs_debug_dumplog();
/*
 * Enqueue an IT_OPEN intent lock on the MDS for @file, optionally packing
 * striping data (@lmm/@lmmsize).  The dentry name is only packed when the
 * server lacks open-by-fid support.  On success the inode is refreshed
 * from the reply and lock data recorded.
 */
400 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
401 struct lookup_intent *itp)
403 struct dentry *de = file->f_dentry;
404 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
405 struct dentry *parent = de->d_parent;
406 const char *name = NULL;
408 struct md_op_data *op_data;
409 struct ptlrpc_request *req = NULL;
413 LASSERT(parent != NULL);
414 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
416 /* if server supports open-by-fid, or file name is invalid, don't pack
417 * name in open request */
418 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
419 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
420 name = de->d_name.name;
421 len = de->d_name.len;
424 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
425 name, len, 0, LUSTRE_OPC_ANY, NULL);
427 RETURN(PTR_ERR(op_data));
428 op_data->op_data = lmm;
429 op_data->op_data_size = lmmsize;
431 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
432 &ll_md_blocking_ast, 0);
433 ll_finish_md_op_data(op_data);
435 /* reason for keep own exit path - don`t flood log
436 * with messages with -ESTALE errors.
/* On ESTALE with an open disposition, drop the stray open handle. */
438 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
439 it_open_error(DISP_OPEN_OPEN, itp))
441 ll_release_openhandle(de, itp);
445 if (it_disposition(itp, DISP_LOOKUP_NEG))
446 GOTO(out, rc = -ENOENT);
448 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
449 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
450 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
454 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
455 if (!rc && itp->d.lustre.it_lock_mode)
456 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
459 ptlrpc_req_finished(req);
460 ll_intent_drop_lock(itp);
466 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
467 * not believe attributes if a few ioepoch holders exist. Attributes for
468 * previous ioepoch if new one is opened are also skipped by MDS.
470 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that differs from the current one. */
472 if (ioepoch && lli->lli_ioepoch != ioepoch) {
473 lli->lli_ioepoch = ioepoch;
474 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
475 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the MDS reply carried by intent @it
 * (open handle, fid, lease lock cookie, open flags), then register open
 * replay data so the open can be replayed after MDS recovery.
 */
479 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
480 struct obd_client_handle *och)
482 struct ptlrpc_request *req = it->d.lustre.it_data;
483 struct mdt_body *body;
485 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
486 och->och_fh = body->mbo_handle;
487 och->och_fid = body->mbo_fid1;
488 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
489 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
490 och->och_flags = it->it_flags;
492 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-local part of an open: fill @och from the intent if
 * given, record the IO epoch, attach @fd as file private data, and
 * initialise readahead and cl_io context state.
 */
495 static int ll_local_open(struct file *file, struct lookup_intent *it,
496 struct ll_file_data *fd, struct obd_client_handle *och)
498 struct inode *inode = file->f_dentry->d_inode;
499 struct ll_inode_info *lli = ll_i2info(inode);
502 LASSERT(!LUSTRE_FPRIVATE(file));
507 struct ptlrpc_request *req = it->d.lustre.it_data;
508 struct mdt_body *body;
511 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
515 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
516 ll_ioepoch_open(lli, body->mbo_ioepoch);
519 LUSTRE_FPRIVATE(file) = fd;
520 ll_readahead_init(inode, &fd->fd_ras);
521 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
523 /* ll_cl_context initialize */
524 rwlock_init(&fd->fd_lock);
525 INIT_LIST_HEAD(&fd->fd_lccs);
530 /* Open a file, and (for the very first open) create objects on the OSTs at
531 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
532 * creation or open until ll_lov_setstripe() ioctl is called.
534 * If we already have the stripe MD locally then we don't request it in
535 * md_open(), by passing a lmm_size = 0.
537 * It is up to the application to ensure no other processes open this file
538 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
539 * used. We might be able to avoid races of that sort by getting lli_open_sem
540 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
541 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
543 int ll_file_open(struct inode *inode, struct file *file)
545 struct ll_inode_info *lli = ll_i2info(inode);
546 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
547 .it_flags = file->f_flags };
548 struct obd_client_handle **och_p = NULL;
549 __u64 *och_usecount = NULL;
550 struct ll_file_data *fd;
554 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
555 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by lookup; take ownership of it. */
557 it = file->private_data; /* XXX: compat macro */
558 file->private_data = NULL; /* prevent ll_local_open assertion */
560 fd = ll_file_data_get();
562 GOTO(out_openerr, rc = -ENOMEM);
565 if (S_ISDIR(inode->i_mode))
566 ll_authorize_statahead(inode, fd);
/* Root inode: no MDS open handle needed, attach fd and finish. */
568 if (inode->i_sb->s_root == file->f_dentry) {
569 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN from f_flags. */
573 if (!it || !it->d.lustre.it_disposition) {
574 /* Convert f_flags into access mode. We cannot use file->f_mode,
575 * because everything but O_ACCMODE mask was stripped from
577 if ((oit.it_flags + 1) & O_ACCMODE)
579 if (file->f_flags & O_TRUNC)
580 oit.it_flags |= FMODE_WRITE;
582 /* kernel only call f_op->open in dentry_open. filp_open calls
583 * dentry_open after call to open_namei that checks permissions.
584 * Only nfsd_open call dentry_open directly without checking
585 * permissions and because of that this code below is safe. */
586 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
587 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
589 /* We do not want O_EXCL here, presumably we opened the file
590 * already? XXX - NFS implications? */
591 oit.it_flags &= ~O_EXCL;
593 /* bug20584, if "it_flags" contains O_CREAT, the file will be
594 * created if necessary, then "IT_CREAT" should be set to keep
595 * consistent with it */
596 if (oit.it_flags & O_CREAT)
597 oit.it_op |= IT_CREAT;
603 /* Let's see if we have file open on MDS already. */
604 if (it->it_flags & FMODE_WRITE) {
605 och_p = &lli->lli_mds_write_och;
606 och_usecount = &lli->lli_open_fd_write_count;
607 } else if (it->it_flags & FMODE_EXEC) {
608 och_p = &lli->lli_mds_exec_och;
609 och_usecount = &lli->lli_open_fd_exec_count;
611 och_p = &lli->lli_mds_read_och;
612 och_usecount = &lli->lli_open_fd_read_count;
615 mutex_lock(&lli->lli_och_mutex);
616 if (*och_p) { /* Open handle is present */
617 if (it_disposition(it, DISP_OPEN_OPEN)) {
618 /* Well, there's extra open request that we do not need,
619 let's close it somehow. This will decref request. */
620 rc = it_open_error(DISP_OPEN_OPEN, it);
622 mutex_unlock(&lli->lli_och_mutex);
623 GOTO(out_openerr, rc);
626 ll_release_openhandle(file->f_dentry, it);
630 rc = ll_local_open(file, it, fd, NULL);
633 mutex_unlock(&lli->lli_och_mutex);
634 GOTO(out_openerr, rc);
637 LASSERT(*och_usecount == 0);
638 if (!it->d.lustre.it_disposition) {
639 /* We cannot just request lock handle now, new ELC code
640 means that one of other OPEN locks for this file
641 could be cancelled, and since blocking ast handler
642 would attempt to grab och_mutex as well, that would
643 result in a deadlock */
644 mutex_unlock(&lli->lli_och_mutex);
646 * Normally called under two situations:
648 * 2. A race/condition on MDS resulting in no open
649 * handle to be returned from LOOKUP|OPEN request,
650 * for example if the target entry was a symlink.
652 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
654 * Always specify MDS_OPEN_BY_FID because we don't want
655 * to get file with different fid.
657 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
658 rc = ll_intent_file_open(file, NULL, 0, it);
660 GOTO(out_openerr, rc);
664 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
666 GOTO(out_och_free, rc = -ENOMEM);
670 /* md_intent_lock() didn't get a request ref if there was an
671 * open error, so don't do cleanup on the request here
673 /* XXX (green): Should not we bail out on any error here, not
674 * just open error? */
675 rc = it_open_error(DISP_OPEN_OPEN, it);
677 GOTO(out_och_free, rc);
679 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
680 "inode %p: disposition %x, status %d\n", inode,
681 it_disposition(it, ~0), it->d.lustre.it_status);
683 rc = ll_local_open(file, it, fd, *och_p);
685 GOTO(out_och_free, rc);
687 mutex_unlock(&lli->lli_och_mutex);
690 /* Must do this outside lli_och_mutex lock to prevent deadlock where
691 different kind of OPEN lock for this same inode gets cancelled
692 by ldlm_cancel_lru */
693 if (!S_ISREG(inode->i_mode))
694 GOTO(out_och_free, rc);
/* O_LOV_DELAY_CREATE (or read-only open of unstriped file): defer OST
 * object creation until ll_lov_setstripe() or first write. */
698 if (!lli->lli_has_smd &&
699 (cl_is_lov_delay_create(file->f_flags) ||
700 (file->f_mode & FMODE_WRITE) == 0)) {
701 CDEBUG(D_INODE, "object creation was delayed\n");
702 GOTO(out_och_free, rc);
704 cl_lov_delay_create_clear(&file->f_flags);
705 GOTO(out_och_free, rc);
709 if (och_p && *och_p) {
710 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
711 *och_p = NULL; /* OBD_FREE writes some magic there */
714 mutex_unlock(&lli->lli_och_mutex);
717 if (lli->lli_opendir_key == fd)
718 ll_deauthorize_statahead(inode, fd);
720 ll_file_data_put(fd);
722 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the intent's request reference if we still hold it. */
725 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
726 ptlrpc_req_finished(it->d.lustre.it_data);
727 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on conflict, asynchronously cancel the
 * lease lock.  Unlike ll_md_blocking_ast this handler does not manage the
 * open handle itself.
 */
733 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
734 struct ldlm_lock_desc *desc, void *data, int flag)
737 struct lustre_handle lockh;
741 case LDLM_CB_BLOCKING:
742 ldlm_lock2handle(lock, &lockh);
743 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
745 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
749 case LDLM_CB_CANCELING:
757 * Acquire a lease and open the file.
/*
 * Open @inode with an MDS open lease of mode @fmode (FMODE_READ or
 * FMODE_WRITE only).  When @file is given, the existing open handle is
 * passed as op_handle so the MDT can match owners.  Returns the new
 * obd_client_handle or an ERR_PTR; on failure any open lock is cancelled
 * and the handle closed.
 */
759 static struct obd_client_handle *
760 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
763 struct lookup_intent it = { .it_op = IT_OPEN };
764 struct ll_sb_info *sbi = ll_i2sbi(inode);
765 struct md_op_data *op_data;
766 struct ptlrpc_request *req = NULL;
767 struct lustre_handle old_handle = { 0 };
768 struct obd_client_handle *och = NULL;
773 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
774 RETURN(ERR_PTR(-EINVAL));
777 struct ll_inode_info *lli = ll_i2info(inode);
778 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
779 struct obd_client_handle **och_p;
/* The requested lease mode must match how the file was opened. */
782 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
783 RETURN(ERR_PTR(-EPERM));
785 /* Get the openhandle of the file */
787 mutex_lock(&lli->lli_och_mutex);
788 if (fd->fd_lease_och != NULL) {
789 mutex_unlock(&lli->lli_och_mutex);
793 if (fd->fd_och == NULL) {
794 if (file->f_mode & FMODE_WRITE) {
795 LASSERT(lli->lli_mds_write_och != NULL);
796 och_p = &lli->lli_mds_write_och;
797 och_usecount = &lli->lli_open_fd_write_count;
799 LASSERT(lli->lli_mds_read_och != NULL);
800 och_p = &lli->lli_mds_read_och;
801 och_usecount = &lli->lli_open_fd_read_count;
803 if (*och_usecount == 1) {
810 mutex_unlock(&lli->lli_och_mutex);
811 if (rc < 0) /* more than 1 opener */
814 LASSERT(fd->fd_och != NULL);
815 old_handle = fd->fd_och->och_fh;
820 RETURN(ERR_PTR(-ENOMEM));
822 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
823 LUSTRE_OPC_ANY, NULL);
825 GOTO(out, rc = PTR_ERR(op_data));
827 /* To tell the MDT this openhandle is from the same owner */
828 op_data->op_handle = old_handle;
830 it.it_flags = fmode | open_flags;
831 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
832 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
833 &ll_md_blocking_lease_ast,
834 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
835 * it can be cancelled which may mislead applications that the lease is
837 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
838 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
839 * doesn't deal with openhandle, so normal openhandle will be leaked. */
840 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
841 ll_finish_md_op_data(op_data);
842 ptlrpc_req_finished(req);
844 GOTO(out_release_it, rc);
846 if (it_disposition(&it, DISP_LOOKUP_NEG))
847 GOTO(out_release_it, rc = -ENOENT);
849 rc = it_open_error(DISP_OPEN_OPEN, &it);
851 GOTO(out_release_it, rc);
853 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
854 ll_och_fill(sbi->ll_md_exp, &it, och);
856 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
857 GOTO(out_close, rc = -EOPNOTSUPP);
859 /* already get lease, handle lease lock */
860 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
861 if (it.d.lustre.it_lock_mode == 0 ||
862 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
863 /* open lock must return for lease */
864 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
865 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
866 it.d.lustre.it_lock_bits);
867 GOTO(out_close, rc = -EPROTO);
870 ll_intent_release(&it);
874 /* Cancel open lock */
875 if (it.d.lustre.it_lock_mode != 0) {
876 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
877 it.d.lustre.it_lock_mode);
878 it.d.lustre.it_lock_mode = 0;
879 och->och_lease_handle.cookie = 0ULL;
881 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
883 CERROR("%s: error closing file "DFID": %d\n",
884 ll_get_fsname(inode->i_sb, NULL, 0),
885 PFID(&ll_i2info(inode)->lli_fid), rc2);
886 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
888 ll_intent_release(&it);
896 * Release lease and close the file.
897 * It will check if the lease has ever broken.
/*
 * Cancel the lease lock (reporting via @lease_broken whether it had
 * already been cancelled, i.e. the lease was broken) and close the
 * associated open handle on the MDS.
 */
899 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
902 struct ldlm_lock *lock;
903 bool cancelled = true;
907 lock = ldlm_handle2lock(&och->och_lease_handle);
909 lock_res_and_lock(lock);
910 cancelled = ldlm_is_cancel(lock);
911 unlock_res_and_lock(lock);
915 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
916 PFID(&ll_i2info(inode)->lli_fid), cancelled);
919 ldlm_cli_cancel(&och->och_lease_handle, 0);
920 if (lease_broken != NULL)
921 *lease_broken = cancelled;
923 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
928 /* Fills the obdo with the attributes for the lsm */
/*
 * Asynchronously fetch OST attributes for @lsm into @obdo.  @dv_flags may
 * request server-side read/write flushes (LL_DV_RD_FLUSH/LL_DV_WR_FLUSH)
 * for data-version consistency; on return o_valid is masked down to the
 * attribute bits this helper actually produced.
 */
929 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
930 struct obd_capa *capa, struct obdo *obdo,
931 __u64 ioepoch, int dv_flags)
933 struct ptlrpc_request_set *set;
934 struct obd_info oinfo = { { { 0 } } };
939 LASSERT(lsm != NULL);
943 oinfo.oi_oa->o_oi = lsm->lsm_oi;
944 oinfo.oi_oa->o_mode = S_IFREG;
945 oinfo.oi_oa->o_ioepoch = ioepoch;
946 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
947 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
948 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
949 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
950 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
951 OBD_MD_FLDATAVERSION;
952 oinfo.oi_capa = capa;
953 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
954 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
955 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
956 if (dv_flags & LL_DV_WR_FLUSH)
957 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
960 set = ptlrpc_prep_set();
962 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
965 rc = obd_getattr_async(exp, &oinfo, set);
966 rc = ptlrpc_set_wait(set);
967 ptlrpc_set_destroy(set);
/* Keep only the bits this helper is responsible for. */
971 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
972 OBD_MD_FLATIME | OBD_MD_FLMTIME |
973 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
974 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* A requested write flush must be confirmed by the server's flags. */
975 if (dv_flags & LL_DV_WR_FLUSH &&
976 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
977 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
984 * Performs the getattr on the inode and updates its fields.
985 * If @sync != 0, perform the getattr under the server-side lock.
987 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
988 __u64 ioepoch, int sync)
990 struct obd_capa *capa = ll_mdscapa_get(inode);
991 struct lov_stripe_md *lsm;
995 lsm = ccc_inode_lsm_get(inode);
996 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
997 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1000 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* Fold the freshly fetched OST attributes back into the VFS inode. */
1002 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1003 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1004 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1005 (unsigned long long)inode->i_blocks,
1006 1UL << inode->i_blkbits);
1008 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-side timestamps cached in ll_inode_info with the OST-side
 * attributes from the cl_object layer, taking the newest of each
 * timestamp, and update the inode's size/blocks under the inode size lock.
 */
1012 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1014 struct ll_inode_info *lli = ll_i2info(inode);
1015 struct cl_object *obj = lli->lli_clob;
1016 struct cl_attr *attr = ccc_env_thread_attr(env);
1024 ll_inode_size_lock(inode);
1026 /* merge timestamps the most recently obtained from mds with
1027 timestamps obtained from osts */
1028 LTIME_S(inode->i_atime) = lli->lli_atime;
1029 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1030 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1032 atime = LTIME_S(inode->i_atime);
1033 mtime = LTIME_S(inode->i_mtime);
1034 ctime = LTIME_S(inode->i_ctime);
1036 cl_object_attr_lock(obj);
1037 rc = cl_object_attr_get(env, obj, attr);
1038 cl_object_attr_unlock(obj);
1041 GOTO(out_size_unlock, rc);
/* Keep whichever side has the more recent timestamp. */
1043 if (atime < attr->cat_atime)
1044 atime = attr->cat_atime;
1046 if (ctime < attr->cat_ctime)
1047 ctime = attr->cat_ctime;
1049 if (mtime < attr->cat_mtime)
1050 mtime = attr->cat_mtime;
1052 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1053 PFID(&lli->lli_fid), attr->cat_size);
1055 i_size_write(inode, attr->cat_size);
1056 inode->i_blocks = attr->cat_blocks;
1058 LTIME_S(inode->i_atime) = atime;
1059 LTIME_S(inode->i_mtime) = mtime;
1060 LTIME_S(inode->i_ctime) = ctime;
1063 ll_inode_size_unlock(inode);
/*
 * Fetch OST attributes for @lsm and copy size/blocks/times into the
 * caller-supplied stat structure (used by the glimpse ioctl path).
 */
1068 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1071 struct obdo obdo = { 0 };
1074 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1076 st->st_size = obdo.o_size;
1077 st->st_blocks = obdo.o_blocks;
1078 st->st_mtime = obdo.o_mtime;
1079 st->st_atime = obdo.o_atime;
1080 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether atime updates should be suppressed for @file, checking
 * the O_NOATIME flag, inode/superblock noatime flags and mount options.
 * Adapted from the kernel's file_accessed()/touch_atime() logic.
 * NOTE(review): the return statements sit on lines elided from this view.
 */
1085 static bool file_is_noatime(const struct file *file)
1087 const struct vfsmount *mnt = file->f_path.mnt;
1088 const struct inode *inode = file->f_path.dentry->d_inode;
1090 /* Adapted from file_accessed() and touch_atime().*/
1091 if (file->f_flags & O_NOATIME)
1094 if (inode->i_flags & S_NOATIME)
1097 if (IS_NOATIME(inode))
1100 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1103 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1106 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialise a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: nonblocking/append/sync flags, lock requirements (never for
 * nolock files, mandatory for O_APPEND) and noatime handling.
 */
1112 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1114 struct inode *inode = file->f_dentry->d_inode;
1116 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1118 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1119 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1120 file->f_flags & O_DIRECT ||
1123 io->ci_obj = ll_i2info(inode)->lli_clob;
1124 io->ci_lockreq = CILR_MAYBE;
1125 if (ll_file_nolock(file)) {
1126 io->ci_lockreq = CILR_NEVER;
1127 io->ci_no_srvlock = 1;
1128 } else if (file->f_flags & O_APPEND) {
1129 io->ci_lockreq = CILR_MANDATORY;
1132 io->ci_noatime = file_is_noatime(file);
/*
 * Common driver for file reads and writes: sets up a cl_io, takes the
 * per-file range lock for non-group-locked writes, runs the cl_io loop
 * (restarting on ci_need_restart when nothing was transferred), and
 * tallies read/write statistics.  Returns bytes transferred or -errno.
 */
1136 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1137 struct file *file, enum cl_io_type iot,
1138 loff_t *ppos, size_t count)
1140 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1141 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1144 struct range_lock range;
1147 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1148 file->f_dentry->d_name.name, iot, *ppos, count);
1151 io = ccc_env_thread_io(env);
1152 ll_io_init(io, file, iot == CIT_WRITE);
1154 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1155 struct vvp_io *vio = vvp_env_io(env);
1156 struct ccc_io *cio = ccc_env_io(env);
1157 bool range_locked = false;
/* O_APPEND may write at EOF regardless of *ppos, so lock to EOF. */
1159 if (file->f_flags & O_APPEND)
1160 range_lock_init(&range, 0, LUSTRE_EOF);
1162 range_lock_init(&range, *ppos, *ppos + count - 1);
1163 cio->cui_fd = LUSTRE_FPRIVATE(file);
1164 vio->cui_io_subtype = args->via_io_subtype;
1166 switch (vio->cui_io_subtype) {
1168 cio->cui_iov = args->u.normal.via_iov;
1169 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1170 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1171 cio->cui_iocb = args->u.normal.via_iocb;
/* Group-locked writes bypass the range lock. */
1172 if ((iot == CIT_WRITE) &&
1173 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1174 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1176 result = range_lock(&lli->lli_write_tree,
1181 range_locked = true;
1183 down_read(&lli->lli_trunc_sem);
1186 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1187 vio->u.splice.cui_flags = args->u.splice.via_flags;
1190 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1194 ll_cl_add(file, env, io);
1195 result = cl_io_loop(env, io);
1196 ll_cl_remove(file, env);
1198 if (args->via_io_subtype == IO_NORMAL)
1199 up_read(&lli->lli_trunc_sem);
1201 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1203 range_unlock(&lli->lli_write_tree, &range);
1206 /* cl_io_rw_init() handled IO */
1207 result = io->ci_result;
1210 if (io->ci_nob > 0) {
1211 result = io->ci_nob;
1212 *ppos = io->u.ci_wr.wr.crw_pos;
1216 cl_io_fini(env, io);
1217 /* If any bit been read/written (result != 0), we just return
1218 * short read/write instead of restart io. */
1219 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1220 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zu\n",
1221 iot == CIT_READ ? "read" : "write",
1222 file->f_dentry->d_name.name, *ppos, count);
1223 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1227 if (iot == CIT_READ) {
1229 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1230 LPROC_LL_READ_BYTES, result);
1231 } else if (iot == CIT_WRITE) {
1233 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1234 LPROC_LL_WRITE_BYTES, result);
1235 fd->fd_write_failed = false;
1236 } else if (result != -ERESTARTSYS) {
1237 fd->fd_write_failed = true;
1240 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1247 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate a user iovec array and total its length into *count.
 * A segment whose length is negative, or whose addition makes the
 * cumulative count wrap negative, is rejected with -EINVAL; on an
 * access_ok() failure the count is cut short at the bad segment.
 * (Copied from the kernel's __generic_file_aio_write_nolock.)
 */
1249 static int ll_file_get_iov_count(const struct iovec *iov,
1250 unsigned long *nr_segs, size_t *count)
1255 for (seg = 0; seg < *nr_segs; seg++) {
1256 const struct iovec *iv = &iov[seg];
1259 * If any segment has a negative length, or the cumulative
1260 * length ever wraps negative then return -EINVAL.
1263 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1265 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1270 cnt -= iv->iov_len; /* This segment is no good */
/*
 * AIO read entry point: validate the iovec, stash it in the per-env
 * vvp_io_args and hand off to ll_file_io_generic() as CIT_READ.
 * The file position is taken from and written back to iocb->ki_pos.
 */
1277 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1278 unsigned long nr_segs, loff_t pos)
1281 struct vvp_io_args *args;
1287 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1291 env = cl_env_get(&refcheck);
1293 RETURN(PTR_ERR(env));
1295 args = vvp_env_args(env, IO_NORMAL);
/* cast drops const: via_iov is not const-qualified — assumed not
 * modified for reads; TODO confirm */
1296 args->u.normal.via_iov = (struct iovec *)iov;
1297 args->u.normal.via_nrsegs = nr_segs;
1298 args->u.normal.via_iocb = iocb;
1300 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1301 &iocb->ki_pos, count);
1302 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) path: wrap the user buffer in a single local
 * iovec plus a sync kiocb kept in the cl environment, then delegate
 * to ll_file_aio_read().  *ppos is updated from the kiocb on return.
 */
1306 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1310 struct iovec *local_iov;
1311 struct kiocb *kiocb;
1316 env = cl_env_get(&refcheck);
1318 RETURN(PTR_ERR(env));
1320 local_iov = &vvp_env_info(env)->vti_local_iov;
1321 kiocb = &vvp_env_info(env)->vti_kiocb;
1322 local_iov->iov_base = (void __user *)buf;
1323 local_iov->iov_len = count;
1324 init_sync_kiocb(kiocb, file);
1325 kiocb->ki_pos = *ppos;
/* the byte-count field was renamed across kernel versions */
1326 #ifdef HAVE_KIOCB_KI_LEFT
1327 kiocb->ki_left = count;
1329 kiocb->ki_nbytes = count;
1332 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
/* propagate the advanced file position back to the caller */
1333 *ppos = kiocb->ki_pos;
1335 cl_env_put(env, &refcheck);
1340 * Write to a file (through the page cache).
/*
 * AIO write entry point: mirror image of ll_file_aio_read() — validate
 * the iovec, fill vvp_io_args and run ll_file_io_generic() as CIT_WRITE
 * against iocb->ki_pos.
 */
1343 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1344 unsigned long nr_segs, loff_t pos)
1347 struct vvp_io_args *args;
1353 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1357 env = cl_env_get(&refcheck);
1359 RETURN(PTR_ERR(env));
1361 args = vvp_env_args(env, IO_NORMAL);
/* cast drops const: via_iov is not const-qualified — assumed not
 * modified; TODO confirm */
1362 args->u.normal.via_iov = (struct iovec *)iov;
1363 args->u.normal.via_nrsegs = nr_segs;
1364 args->u.normal.via_iocb = iocb;
1366 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1367 &iocb->ki_pos, count);
1368 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) path: build a one-segment iovec and a sync
 * kiocb in the cl environment, delegate to ll_file_aio_write(), and
 * write the updated position back through *ppos.
 */
1372 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1373 size_t count, loff_t *ppos)
1376 struct iovec *local_iov;
1377 struct kiocb *kiocb;
1382 env = cl_env_get(&refcheck);
1384 RETURN(PTR_ERR(env));
1386 local_iov = &vvp_env_info(env)->vti_local_iov;
1387 kiocb = &vvp_env_info(env)->vti_kiocb;
/* cast drops const for the shared iovec slot; buffer is only read */
1388 local_iov->iov_base = (void __user *)buf;
1389 local_iov->iov_len = count;
1390 init_sync_kiocb(kiocb, file);
1391 kiocb->ki_pos = *ppos;
/* the byte-count field was renamed across kernel versions */
1392 #ifdef HAVE_KIOCB_KI_LEFT
1393 kiocb->ki_left = count;
1395 kiocb->ki_nbytes = count;
1398 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1399 *ppos = kiocb->ki_pos;
1401 cl_env_put(env, &refcheck);
1406 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry: route pagecache data into a pipe by running
 * ll_file_io_generic() as CIT_READ with IO_SPLICE-style args.
 */
1408 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1409 struct pipe_inode_info *pipe, size_t count,
1413 struct vvp_io_args *args;
1418 env = cl_env_get(&refcheck);
1420 RETURN(PTR_ERR(env));
1422 args = vvp_env_args(env, IO_SPLICE);
1423 args->u.splice.via_pipe = pipe;
1424 args->u.splice.via_flags = flags;
1426 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1427 cl_env_put(env, &refcheck);
/*
 * Install a LOV striping EA on @inode through an intent open.
 * Fails with -EEXIST if the inode already has a layout.  The inode
 * size lock is held across the intent open; the open handle obtained
 * purely for the setstripe is released before returning.
 */
1431 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1432 __u64 flags, struct lov_user_md *lum,
1435 struct lov_stripe_md *lsm = NULL;
1436 struct lookup_intent oit = {
1438 .it_flags = flags | MDS_OPEN_BY_FID,
/* a pre-existing layout means the stripe cannot be (re)set */
1443 lsm = ccc_inode_lsm_get(inode);
1445 ccc_inode_lsm_put(inode, lsm);
1446 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1447 PFID(ll_inode2fid(inode)));
1448 GOTO(out, rc = -EEXIST);
1451 ll_inode_size_lock(inode);
1452 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1454 GOTO(out_unlock, rc);
1456 rc = oit.d.lustre.it_status;
1458 GOTO(out_unlock, rc);
/* we only needed the open for its side effect; drop the handle */
1460 ll_release_openhandle(file->f_dentry, &oit);
1463 ll_inode_size_unlock(inode);
1464 ll_intent_release(&oit);
1465 ccc_inode_lsm_put(inode, lsm);
1467 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA for @filename from the MDS via md_getattr_name().
 * On success *lmmp points into the (still-referenced) reply buffer,
 * *lmm_size is its length and *request carries the request the caller
 * must eventually release.  Only LOV_MAGIC_V1/V3 layouts are accepted;
 * the EA is byte-swapped to host endianness on big-endian hosts.
 */
1472 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1473 struct lov_mds_md **lmmp, int *lmm_size,
1474 struct ptlrpc_request **request)
1476 struct ll_sb_info *sbi = ll_i2sbi(inode);
1477 struct mdt_body *body;
1478 struct lov_mds_md *lmm = NULL;
1479 struct ptlrpc_request *req = NULL;
1480 struct md_op_data *op_data;
1483 rc = ll_get_default_mdsize(sbi, &lmmsize);
1487 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1488 strlen(filename), lmmsize,
1489 LUSTRE_OPC_ANY, NULL);
1490 if (IS_ERR(op_data))
1491 RETURN(PTR_ERR(op_data));
1493 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1494 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1495 ll_finish_md_op_data(op_data);
1497 CDEBUG(D_INFO, "md_getattr_name failed "
1498 "on %s: rc %d\n", filename, rc);
1502 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1503 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1505 lmmsize = body->mbo_eadatasize;
/* no EA present (or zero-length) -> nothing to return */
1507 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1509 GOTO(out, rc = -ENODATA);
1512 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1513 LASSERT(lmm != NULL);
1515 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1516 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1517 GOTO(out, rc = -EPROTO);
1521 * This is coming from the MDS, so is probably in
1522 * little endian. We convert it to host endian before
1523 * passing it to userspace.
/* only swab when host endianness differs from on-wire LE */
1525 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1528 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1529 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1532 /* if the function was called for a directory we must
1533 * avoid swabbing non-existent lsm objects */
1534 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1535 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1536 if (S_ISREG(body->mbo_mode))
1537 lustre_swab_lov_user_md_objects(
1538 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1540 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1541 lustre_swab_lov_user_md_v3(
1542 (struct lov_user_md_v3 *)lmm);
1543 if (S_ISREG(body->mbo_mode))
1544 lustre_swab_lov_user_md_objects(
1545 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1552 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (plus one OST entry)
 * from userspace and apply it via ll_lov_setstripe_ea_info().
 * Restricted to CFS_CAP_SYS_ADMIN.
 */
1557 static int ll_lov_setea(struct inode *inode, struct file *file,
1560 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1561 struct lov_user_md *lump;
1562 int lum_size = sizeof(struct lov_user_md) +
1563 sizeof(struct lov_user_ost_data);
1567 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1570 OBD_ALLOC_LARGE(lump, lum_size);
1574 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1575 OBD_FREE_LARGE(lump, lum_size);
1579 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1581 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the inode's striping information to the userspace @lum buffer
 * by way of cl_object_getstripe() on the cl object.
 */
1585 static int ll_file_getstripe(struct inode *inode,
1586 struct lov_user_md __user *lum)
1593 env = cl_env_get(&refcheck);
1595 RETURN(PTR_ERR(env));
1597 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1598 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user layout in, apply it,
 * refresh the layout generation and echo the resulting striping back
 * to the user buffer.
 */
1602 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1605 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1606 struct lov_user_md *klum;
1608 __u64 flags = FMODE_WRITE;
1611 rc = ll_copy_user_md(lum, &klum);
1616 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
/* zero the user's stripe_count first so a failed getstripe below
 * does not leave stale data — presumed intent; TODO confirm */
1620 put_user(0, &lum->lmm_stripe_count);
1622 ll_layout_refresh(inode, &gen);
1623 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1626 OBD_FREE(klum, lum_size);
/*
 * Take a group lock with group id @arg on behalf of this open file.
 * Only one group lock per file descriptor is allowed; fd_flags is
 * checked under lli_lock both before and after the (blocking) enqueue
 * to catch a racing acquirer.
 */
1631 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1633 struct ll_inode_info *lli = ll_i2info(inode);
1634 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1635 struct ccc_grouplock grouplock;
1640 CWARN("group id for group lock must not be 0\n");
1644 if (ll_file_nolock(file))
1645 RETURN(-EOPNOTSUPP);
1647 spin_lock(&lli->lli_lock);
1648 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1649 CWARN("group lock already existed with gid %lu\n",
1650 fd->fd_grouplock.cg_gid);
1651 spin_unlock(&lli->lli_lock);
1654 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1655 spin_unlock(&lli->lli_lock);
/* the enqueue may block, so it is done outside lli_lock */
1657 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1658 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have won meanwhile */
1662 spin_lock(&lli->lli_lock);
1663 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1664 spin_unlock(&lli->lli_lock);
1665 CERROR("another thread just won the race\n");
1666 cl_put_grouplock(&grouplock);
1670 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1671 fd->fd_grouplock = grouplock;
1672 spin_unlock(&lli->lli_lock);
1674 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock with group id @arg held by this open file.
 * The fd state is detached under lli_lock; the actual lock release
 * (which may block) happens after the spinlock is dropped.
 */
1678 static int ll_put_grouplock(struct inode *inode, struct file *file,
1681 struct ll_inode_info *lli = ll_i2info(inode);
1682 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1683 struct ccc_grouplock grouplock;
1686 spin_lock(&lli->lli_lock);
1687 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1688 spin_unlock(&lli->lli_lock);
1689 CWARN("no group lock held\n");
1692 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* releasing with a mismatched gid is a caller error */
1694 if (fd->fd_grouplock.cg_gid != arg) {
1695 CWARN("group lock %lu doesn't match current id %lu\n",
1696 arg, fd->fd_grouplock.cg_gid);
1697 spin_unlock(&lli->lli_lock);
/* detach under the lock, release outside it */
1701 grouplock = fd->fd_grouplock;
1702 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1703 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1704 spin_unlock(&lli->lli_lock);
1706 cl_put_grouplock(&grouplock);
1707 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1712 * Close inode open handle
1714 * \param dentry [in] dentry which contains the inode
1715 * \param it [in,out] intent which contains open info and result
1718 * \retval <0 failure
/*
 * Close the MDS open handle carried in @it for @dentry's inode.
 * No-op for the filesystem root and for intents that did not actually
 * open anything.  Also drops the DISP_ENQ_OPEN_REF request reference
 * that ll_file_open() would otherwise have consumed.
 */
1720 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1722 struct inode *inode = dentry->d_inode;
1723 struct obd_client_handle *och;
1729 /* Root ? Do nothing. */
1730 if (dentry->d_inode->i_sb->s_root == dentry)
1733 /* No open handle to close? Move away */
1734 if (!it_disposition(it, DISP_OPEN_OPEN))
1737 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1739 OBD_ALLOC(och, sizeof(*och));
1741 GOTO(out, rc = -ENOMEM);
1743 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1745 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1748 /* this one is in place of ll_file_open */
1749 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1750 ptlrpc_req_finished(it->d.lustre.it_data);
1751 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1757 * Get size for inode for which FIEMAP mapping is requested.
1758 * Make the FIEMAP get_info call and returns the result.
/*
 * Run the FIEMAP mapping for @inode: validate flags, optionally flush
 * dirty data for FIEMAP_FLAG_SYNC, then issue the KEY_FIEMAP
 * obd_get_info() against the data export.  @num_bytes bounds both the
 * input key copy and the reply.
 */
1760 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1763 struct obd_export *exp = ll_i2dtexp(inode);
1764 struct lov_stripe_md *lsm = NULL;
1765 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1766 __u32 vallen = num_bytes;
1770 /* Checks for fiemap flags */
1771 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do not support */
1772 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1776 /* Check for FIEMAP_FLAG_SYNC */
1777 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1778 rc = filemap_fdatawrite(inode->i_mapping);
1783 lsm = ccc_inode_lsm_get(inode);
1787 /* If the stripe_count > 1 and the application does not understand
1788 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1790 if (lsm->lsm_stripe_count > 1 &&
1791 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1792 GOTO(out, rc = -EOPNOTSUPP);
1794 fm_key.oa.o_oi = lsm->lsm_oi;
1795 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1797 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1798 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1799 /* If filesize is 0, then there would be no objects for mapping */
1800 if (fm_key.oa.o_size == 0) {
1801 fiemap->fm_mapped_extents = 0;
1805 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1807 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1810 CERROR("obd_get_info failed: rc = %d\n", rc);
1813 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MD export.
 * The user's requested path length (capped at PATH_MAX) sizes the
 * getinfo_fid2path buffer that is copied in, filled by obd_iocontrol()
 * and copied back out.  Access is limited to CAP_DAC_READ_SEARCH
 * unless the mount allows user fid2path.
 */
1817 int ll_fid2path(struct inode *inode, void __user *arg)
1819 struct obd_export *exp = ll_i2mdexp(inode);
1820 const struct getinfo_fid2path __user *gfin = arg;
1822 struct getinfo_fid2path *gfout;
1828 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1829 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1832 /* Only need to get the buflen */
1833 if (get_user(pathlen, &gfin->gf_pathlen))
1836 if (pathlen > PATH_MAX)
1839 outsize = sizeof(*gfout) + pathlen;
1840 OBD_ALLOC(gfout, outsize);
1844 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1845 GOTO(gf_free, rc = -EFAULT);
1847 /* Call mdc_iocontrol */
1848 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1852 if (copy_to_user(arg, gfout, outsize))
1856 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size and allocate a fiemap buffer from
 * the user's fm_extent_count (with overflow check), copy the request
 * (and, when extents were supplied, the first extent) in, run
 * ll_do_fiemap() and copy the header plus mapped extents back out.
 */
1860 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1862 struct ll_user_fiemap *fiemap_s;
1863 size_t num_bytes, ret_bytes;
1864 unsigned int extent_count;
1867 /* Get the extent count so we can calculate the size of
1868 * required fiemap buffer */
1869 if (get_user(extent_count,
1870 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* reject counts that would overflow the allocation size */
1874 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1876 num_bytes = sizeof(*fiemap_s) + (extent_count *
1877 sizeof(struct ll_fiemap_extent));
1879 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1880 if (fiemap_s == NULL)
1883 /* get the fiemap value */
1884 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1886 GOTO(error, rc = -EFAULT);
1888 /* If fm_extent_count is non-zero, read the first extent since
1889 * it is used to calculate end_offset and device from previous
1892 if (copy_from_user(&fiemap_s->fm_extents[0],
1893 (char __user *)arg + sizeof(*fiemap_s),
1894 sizeof(struct ll_fiemap_extent)))
1895 GOTO(error, rc = -EFAULT);
1898 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1902 ret_bytes = sizeof(struct ll_user_fiemap);
1904 if (extent_count != 0)
1905 ret_bytes += (fiemap_s->fm_mapped_extents *
1906 sizeof(struct ll_fiemap_extent));
1908 if (copy_to_user((void __user *)arg, fiemap_s, ret_bytes))
1912 OBD_FREE_LARGE(fiemap_s, num_bytes);
1917 * Read the data_version for inode.
1919 * This value is computed using stripe object version on OST.
1920 * Version is computed using server side locking.
1922 * @param sync whether to perform a sync on the OST side;
1924 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1925 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Fetch the OST-side data version of @inode into *data_version.
 * An inode without stripe objects reports version 0.  @flags selects
 * the flush behaviour on the OSTs (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH).
 */
1927 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1929 struct lov_stripe_md *lsm = NULL;
1930 struct ll_sb_info *sbi = ll_i2sbi(inode);
1931 struct obdo *obdo = NULL;
1935 /* If no stripe, we consider version is 0. */
1936 lsm = ccc_inode_lsm_get(inode);
1937 if (!lsm_has_objects(lsm)) {
1939 CDEBUG(D_INODE, "No object for inode\n");
1943 OBD_ALLOC_PTR(obdo);
1945 GOTO(out, rc = -ENOMEM);
1947 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
/* the OST must confirm it actually returned a data version */
1949 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1952 *data_version = obdo->o_data_version;
1958 ccc_inode_lsm_put(inode, lsm);
1963 * Trigger a HSM release request for the provided inode.
/*
 * Trigger an HSM release for @inode: take a write lease with
 * MDS_OPEN_RELEASE, flush and grab the latest data version, merge
 * attributes, then close the handle with the release flag so the MDT
 * can drop the file's OST objects.
 */
1965 int ll_hsm_release(struct inode *inode)
1967 struct cl_env_nest nest;
1969 struct obd_client_handle *och = NULL;
1970 __u64 data_version = 0;
1974 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1975 ll_get_fsname(inode->i_sb, NULL, 0),
1976 PFID(&ll_i2info(inode)->lli_fid));
1978 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1980 GOTO(out, rc = PTR_ERR(och));
1982 /* Grab latest data_version and [am]time values */
1983 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1987 env = cl_env_nested_get(&nest);
1989 GOTO(out, rc = PTR_ERR(env));
1991 ll_merge_attr(env, inode);
1992 cl_env_nested_put(&nest, env);
1994 /* Release the file.
1995 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1996 * we still need it to pack l_remote_handle to MDT. */
1997 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2003 if (och != NULL && !IS_ERR(och)) /* close the file */
2004 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): the two inodes involved,
 * saved [am]time attrs to restore afterwards, and the data-version
 * check requests for each side. */
2009 struct ll_swap_stack {
2010 struct iattr ia1, ia2;
2012 struct inode *inode1, *inode2;
2013 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * layouts of the two files on the MDT.  Both files must be regular,
 * writable and on the same filesystem.  The pair is ordered by FID to
 * serialize concurrent swaps; optional group locks flush dirty cache,
 * optional data-version checks abort with -EAGAIN on change, and
 * [am]times are restored afterwards if the caller asked to keep them.
 */
2016 static int ll_swap_layouts(struct file *file1, struct file *file2,
2017 struct lustre_swap_layouts *lsl)
2019 struct mdc_swap_layouts msl;
2020 struct md_op_data *op_data;
2023 struct ll_swap_stack *llss = NULL;
2026 OBD_ALLOC_PTR(llss);
2030 llss->inode1 = file1->f_dentry->d_inode;
2031 llss->inode2 = file2->f_dentry->d_inode;
2033 if (!S_ISREG(llss->inode2->i_mode))
2034 GOTO(free, rc = -EINVAL);
2036 if (inode_permission(llss->inode1, MAY_WRITE) ||
2037 inode_permission(llss->inode2, MAY_WRITE))
2038 GOTO(free, rc = -EPERM);
2040 if (llss->inode2->i_sb != llss->inode1->i_sb)
2041 GOTO(free, rc = -EXDEV);
2043 /* we use 2 bool because it is easier to swap than 2 bits */
2044 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2045 llss->check_dv1 = true;
2047 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2048 llss->check_dv2 = true;
2050 /* we cannot use lsl->sl_dvX directly because we may swap them */
2051 llss->dv1 = lsl->sl_dv1;
2052 llss->dv2 = lsl->sl_dv2;
2054 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2055 if (rc == 0) /* same file, done! */
/* order the pair by FID so two racing swaps cannot deadlock */
2058 if (rc < 0) { /* sequentialize it */
2059 swap(llss->inode1, llss->inode2);
2061 swap(llss->dv1, llss->dv2);
2062 swap(llss->check_dv1, llss->check_dv2);
2066 if (gid != 0) { /* application asks to flush dirty cache */
2067 rc = ll_get_grouplock(llss->inode1, file1, gid);
2071 rc = ll_get_grouplock(llss->inode2, file2, gid);
2073 ll_put_grouplock(llss->inode1, file1, gid);
2078 /* to be able to restore mtime and atime after swap
2079 * we need to first save them */
2081 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2082 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2083 llss->ia1.ia_atime = llss->inode1->i_atime;
2084 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2085 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2086 llss->ia2.ia_atime = llss->inode2->i_atime;
2087 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2090 /* ultimate check, before swapping the layouts we check if
2091 * dataversion has changed (if requested) */
2092 if (llss->check_dv1) {
2093 rc = ll_data_version(llss->inode1, &dv, 0);
2096 if (dv != llss->dv1)
2097 GOTO(putgl, rc = -EAGAIN);
2100 if (llss->check_dv2) {
2101 rc = ll_data_version(llss->inode2, &dv, 0);
2104 if (dv != llss->dv2)
2105 GOTO(putgl, rc = -EAGAIN);
2108 /* struct md_op_data is used to send the swap args to the mdt
2109 * only flags is missing, so we use struct mdc_swap_layouts
2110 * through the md_op_data->op_data */
2111 /* flags from user space have to be converted before they are send to
2112 * server, no flag is sent today, they are only used on the client */
2115 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2116 0, LUSTRE_OPC_ANY, &msl);
2117 if (IS_ERR(op_data))
2118 GOTO(free, rc = PTR_ERR(op_data));
2120 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2121 sizeof(*op_data), op_data, NULL);
2122 ll_finish_md_op_data(op_data);
2126 ll_put_grouplock(llss->inode2, file2, gid);
2127 ll_put_grouplock(llss->inode1, file1, gid);
2130 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2134 /* clear useless flags */
2135 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2136 llss->ia1.ia_valid &= ~ATTR_MTIME;
2137 llss->ia2.ia_valid &= ~ATTR_MTIME;
2140 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2141 llss->ia1.ia_valid &= ~ATTR_ATIME;
2142 llss->ia2.ia_valid &= ~ATTR_ATIME;
2145 /* update time if requested */
/* note ia2 is applied to inode1 (and ia1 to inode2): the saved
 * attrs follow the swapped layouts — presumed intent; TODO confirm */
2147 if (llss->ia2.ia_valid != 0) {
2148 mutex_lock(&llss->inode1->i_mutex);
2149 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2150 mutex_unlock(&llss->inode1->i_mutex);
2153 if (llss->ia1.ia_valid != 0) {
2156 mutex_lock(&llss->inode2->i_mutex);
2157 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2158 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Apply an HSM state mask change on @inode by sending
 * LL_IOC_HSM_STATE_SET to the MDT.  Non-root callers may only touch
 * flags within HSM_USER_MASK.
 */
2170 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2172 struct md_op_data *op_data;
2175 /* Non-root users are forbidden to set or clear flags which are
2176 * NOT defined in HSM_USER_MASK. */
2177 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2178 !cfs_capable(CFS_CAP_SYS_ADMIN))
2181 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2182 LUSTRE_OPC_ANY, hss);
2183 if (IS_ERR(op_data))
2184 RETURN(PTR_ERR(op_data));
2186 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2187 sizeof(*op_data), op_data, NULL);
2189 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark the (regular) file archived/exists/released, then
 * restore the archived attributes (mode, owner, size, [am]times) from
 * @hui via ll_setattr_raw() under the inode mutex.
 */
2194 static int ll_hsm_import(struct inode *inode, struct file *file,
2195 struct hsm_user_import *hui)
2197 struct hsm_state_set *hss = NULL;
2198 struct iattr *attr = NULL;
2202 if (!S_ISREG(inode->i_mode))
2208 GOTO(out, rc = -ENOMEM);
2210 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2211 hss->hss_archive_id = hui->hui_archive_id;
2212 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2213 rc = ll_hsm_state_set(inode, hss);
2217 OBD_ALLOC_PTR(attr);
2219 GOTO(out, rc = -ENOMEM);
/* keep only permission bits from the saved mode; force S_IFREG */
2221 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2222 attr->ia_mode |= S_IFREG;
2223 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2224 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2225 attr->ia_size = hui->hui_size;
2226 attr->ia_mtime.tv_sec = hui->hui_mtime;
2227 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2228 attr->ia_atime.tv_sec = hui->hui_atime;
2229 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2231 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2232 ATTR_UID | ATTR_GID |
2233 ATTR_MTIME | ATTR_MTIME_SET |
2234 ATTR_ATIME | ATTR_ATIME_SET;
2236 mutex_lock(&inode->i_mutex);
2238 rc = ll_setattr_raw(file->f_dentry, attr, true);
2242 mutex_unlock(&inode->i_mutex);
2254 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2256 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2257 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main ioctl dispatcher for regular files.  Handles llite-specific
 * commands (flags, striping, layout swap, fiemap, group locks, HSM,
 * leases, fid2path, ...) and falls through to the dynamic
 * ll_iocontrol_call() table and finally obd_iocontrol() on the data
 * export for anything unrecognized.
 */
2261 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2263 struct inode *inode = file->f_dentry->d_inode;
2264 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2268 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2269 PFID(ll_inode2fid(inode)), inode, cmd);
2270 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2272 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2273 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2277 case LL_IOC_GETFLAGS:
2278 /* Get the current value of the file flags */
2279 return put_user(fd->fd_flags, (int __user *)arg);
2280 case LL_IOC_SETFLAGS:
2281 case LL_IOC_CLRFLAGS:
2282 /* Set or clear specific file flags */
2283 /* XXX This probably needs checks to ensure the flags are
2284 * not abused, and to handle any flag side effects.
2286 if (get_user(flags, (int __user *) arg))
2289 if (cmd == LL_IOC_SETFLAGS) {
/* lock bypass is only sane for O_DIRECT files */
2290 if ((flags & LL_FILE_IGNORE_LOCK) &&
2291 !(file->f_flags & O_DIRECT)) {
2292 CERROR("%s: unable to disable locking on "
2293 "non-O_DIRECT file\n", current->comm);
2297 fd->fd_flags |= flags;
2299 fd->fd_flags &= ~flags;
2302 case LL_IOC_LOV_SETSTRIPE:
2303 RETURN(ll_lov_setstripe(inode, file, arg));
2304 case LL_IOC_LOV_SETEA:
2305 RETURN(ll_lov_setea(inode, file, arg));
2306 case LL_IOC_LOV_SWAP_LAYOUTS: {
2308 struct lustre_swap_layouts lsl;
2310 if (copy_from_user(&lsl, (char __user *)arg,
2311 sizeof(struct lustre_swap_layouts)))
/* the target file must have been opened with write access */
2314 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2317 file2 = fget(lsl.sl_fd);
2322 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2323 rc = ll_swap_layouts(file, file2, &lsl);
2327 case LL_IOC_LOV_GETSTRIPE:
2328 RETURN(ll_file_getstripe(inode,
2329 (struct lov_user_md __user *)arg));
2330 case FSFILT_IOC_FIEMAP:
2331 RETURN(ll_ioctl_fiemap(inode, arg));
2332 case FSFILT_IOC_GETFLAGS:
2333 case FSFILT_IOC_SETFLAGS:
2334 RETURN(ll_iocontrol(inode, file, cmd, arg));
2335 case FSFILT_IOC_GETVERSION_OLD:
2336 case FSFILT_IOC_GETVERSION:
2337 RETURN(put_user(inode->i_generation, (int __user *)arg));
2338 case LL_IOC_GROUP_LOCK:
2339 RETURN(ll_get_grouplock(inode, file, arg));
2340 case LL_IOC_GROUP_UNLOCK:
2341 RETURN(ll_put_grouplock(inode, file, arg));
2342 case IOC_OBD_STATFS:
2343 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2345 /* We need to special case any other ioctls we want to handle,
2346 * to send them to the MDS/OST as appropriate and to properly
2347 * network encode the arg field.
2348 case FSFILT_IOC_SETVERSION_OLD:
2349 case FSFILT_IOC_SETVERSION:
2351 case LL_IOC_FLUSHCTX:
2352 RETURN(ll_flush_ctx(inode));
2353 case LL_IOC_PATH2FID: {
2354 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2355 sizeof(struct lu_fid)))
2360 case LL_IOC_GETPARENT:
2361 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2363 case OBD_IOC_FID2PATH:
2364 RETURN(ll_fid2path(inode, (void __user *)arg));
2365 case LL_IOC_DATA_VERSION: {
2366 struct ioc_data_version idv;
2369 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* only the flush-selection bits are honoured from userspace */
2372 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2373 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2376 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2382 case LL_IOC_GET_MDTIDX: {
2385 mdtidx = ll_get_mdt_idx(inode);
2389 if (put_user((int)mdtidx, (int __user *)arg))
2394 case OBD_IOC_GETDTNAME:
2395 case OBD_IOC_GETMDNAME:
2396 RETURN(ll_get_obd_name(inode, cmd, arg));
2397 case LL_IOC_HSM_STATE_GET: {
2398 struct md_op_data *op_data;
2399 struct hsm_user_state *hus;
2406 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2407 LUSTRE_OPC_ANY, hus);
2408 if (IS_ERR(op_data)) {
2410 RETURN(PTR_ERR(op_data));
2413 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2416 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2419 ll_finish_md_op_data(op_data);
2423 case LL_IOC_HSM_STATE_SET: {
2424 struct hsm_state_set *hss;
2431 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2436 rc = ll_hsm_state_set(inode, hss);
2441 case LL_IOC_HSM_ACTION: {
2442 struct md_op_data *op_data;
2443 struct hsm_current_action *hca;
2450 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2451 LUSTRE_OPC_ANY, hca);
2452 if (IS_ERR(op_data)) {
2454 RETURN(PTR_ERR(op_data));
2457 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2460 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2463 ll_finish_md_op_data(op_data);
2467 case LL_IOC_SET_LEASE: {
2468 struct ll_inode_info *lli = ll_i2info(inode);
2469 struct obd_client_handle *och = NULL;
/* requested lease mode must not exceed the file's open mode */
2474 case LL_LEASE_WRLCK:
2475 if (!(file->f_mode & FMODE_WRITE))
2477 fmode = FMODE_WRITE;
2479 case LL_LEASE_RDLCK:
2480 if (!(file->f_mode & FMODE_READ))
2484 case LL_LEASE_UNLCK:
2485 mutex_lock(&lli->lli_och_mutex);
2486 if (fd->fd_lease_och != NULL) {
2487 och = fd->fd_lease_och;
2488 fd->fd_lease_och = NULL;
2490 mutex_unlock(&lli->lli_och_mutex);
2495 fmode = och->och_flags;
2496 rc = ll_lease_close(och, inode, &lease_broken);
2503 RETURN(ll_lease_type_from_fmode(fmode));
2508 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2510 /* apply for lease */
2511 och = ll_lease_open(inode, file, fmode, 0);
2513 RETURN(PTR_ERR(och));
2516 mutex_lock(&lli->lli_och_mutex);
2517 if (fd->fd_lease_och == NULL) {
2518 fd->fd_lease_och = och;
2521 mutex_unlock(&lli->lli_och_mutex);
2523 /* impossible now that only excl is supported for now */
2524 ll_lease_close(och, inode, &lease_broken);
2529 case LL_IOC_GET_LEASE: {
2530 struct ll_inode_info *lli = ll_i2info(inode);
2531 struct ldlm_lock *lock = NULL;
2534 mutex_lock(&lli->lli_och_mutex);
2535 if (fd->fd_lease_och != NULL) {
2536 struct obd_client_handle *och = fd->fd_lease_och;
2538 lock = ldlm_handle2lock(&och->och_lease_handle);
2540 lock_res_and_lock(lock);
/* a lease being cancelled no longer counts as held */
2541 if (!ldlm_is_cancel(lock))
2542 fmode = och->och_flags;
2544 unlock_res_and_lock(lock);
2545 LDLM_LOCK_PUT(lock);
2548 mutex_unlock(&lli->lli_och_mutex);
2550 RETURN(ll_lease_type_from_fmode(fmode));
2552 case LL_IOC_HSM_IMPORT: {
2553 struct hsm_user_import *hui;
2559 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2564 rc = ll_hsm_import(inode, file, hui);
/* unknown command: try the registered handlers, then the OSC */
2574 ll_iocontrol_call(inode, file, cmd, arg, &err))
2577 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2578 (void __user *)arg));
/* Compat helper (kernels without generic_file_llseek_size): validate
 * @offset against sign/maxsize constraints and commit it to f_pos,
 * resetting f_version when the position actually changes. */
2583 #ifndef HAVE_FILE_LLSEEK_SIZE
2584 static inline loff_t
2585 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2587 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2589 if (offset > maxsize)
2592 if (offset != file->f_pos) {
2593 file->f_pos = offset;
2594 file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size(): compute the
 * new position for SEEK_SET/CUR/END (and SEEK_HOLE/DATA against the
 * supplied @eof) bounded by @maxsize, committing via llseek_execute().
 * SEEK_CUR takes the inode mutex to avoid racing concurrent seeks.
 */
2600 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2601 loff_t maxsize, loff_t eof)
2603 struct inode *inode = file->f_dentry->d_inode;
2611 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2612 * position-querying operation. Avoid rewriting the "same"
2613 * f_pos value back to the file because a concurrent read(),
2614 * write() or lseek() might have altered it
2619 * f_lock protects against read/modify/write race with other
2620 * SEEK_CURs. Note that parallel writes and reads behave
2623 mutex_lock(&inode->i_mutex);
2624 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2625 mutex_unlock(&inode->i_mutex);
2629 * In the generic case the entire file is data, so as long as
2630 * offset isn't at the end of the file then the offset is data.
2637 * There is a virtual hole at the end of the file, so as long as
2638 * offset isn't i_size or larger, return i_size.
2646 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: glimpse the current size from the OSTs for
 * origins that depend on EOF (SEEK_END/HOLE/DATA), then delegate to
 * the generic llseek-with-size helper bounded by the fs max bytes.
 */
2650 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2652 struct inode *inode = file->f_dentry->d_inode;
2653 loff_t retval, eof = 0;
/* provisional target, for the debug trace only */
2656 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2657 (origin == SEEK_CUR) ? file->f_pos : 0);
2658 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2659 PFID(ll_inode2fid(inode)), inode, retval, retval,
2661 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2663 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2664 retval = ll_glimpse_size(inode);
2667 eof = i_size_read(inode);
2670 retval = ll_generic_file_llseek_size(file, offset, origin,
2671 ll_file_maxbytes(inode), eof);
/*
 * flush(2) handler: report (and clear) any async writeback error
 * recorded on the inode or its cl object.  The error is suppressed if
 * this fd already reported a write failure to the application.
 */
2675 static int ll_flush(struct file *file, fl_owner_t id)
2677 struct inode *inode = file->f_dentry->d_inode;
2678 struct ll_inode_info *lli = ll_i2info(inode);
2679 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2682 LASSERT(!S_ISDIR(inode->i_mode));
2684 /* catch async errors that were recorded back when async writeback
2685 * failed for pages in this mapping. */
2686 rc = lli->lli_async_rc;
2687 lli->lli_async_rc = 0;
2688 if (lli->lli_clob != NULL) {
2689 err = lov_read_and_clear_async_rc(lli->lli_clob);
2694 /* The application has been told write failure already.
2695 * Do not report failure again. */
2696 if (fd->fd_write_failed)
2698 return rc ? -EIO : 0;
2702 * Called to make sure a portion of file has been written out.
2703 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2705 * Return how many pages have been written.
/*
 * Implementation: builds a CIT_FSYNC cl_io over [start, end] in a nested
 * cl environment and runs it via cl_io_loop().  On success the number of
 * pages written (fio->fi_nr_written) is returned; @ignore_layout lets the
 * sync proceed even while a layout change is in flight.
 */
2707 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2708 enum cl_fsync_mode mode, int ignore_layout)
2710 struct cl_env_nest nest;
2713 struct obd_capa *capa = NULL;
2714 struct cl_fsync_io *fio;
/* Reject any mode this path does not understand. */
2718 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2719 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2722 env = cl_env_nested_get(&nest);
2724 RETURN(PTR_ERR(env));
/* OSS write capability accompanies the sync request (if enabled). */
2726 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2728 io = ccc_env_thread_io(env);
2729 io->ci_obj = ll_i2info(inode)->lli_clob;
2730 io->ci_ignore_layout = ignore_layout;
2732 /* initialize parameters for sync */
2733 fio = &io->u.ci_fsync;
2734 fio->fi_capa = capa;
2735 fio->fi_start = start;
2737 fio->fi_fid = ll_inode2fid(inode);
2738 fio->fi_mode = mode;
2739 fio->fi_nr_written = 0;
2741 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2742 result = cl_io_loop(env, io);
2744 result = io->ci_result;
/* Success: report how many pages were written out. */
2746 result = fio->fi_nr_written;
2747 cl_io_fini(env, io);
2748 cl_env_nested_put(&nest, env);
2756 * When dentry is provided (the 'else' case), *file->f_dentry may be
2757 * null and dentry must be used directly rather than pulled from
2758 * *file->f_dentry as is done otherwise.
/*
 * fsync() handler; the signature varies with the kernel version, selected
 * by the HAVE_FILE_FSYNC_* configure macros.  Sequence:
 *   1. wait for in-flight page writeback (range write-and-wait on newer
 *      kernels, filemap_fdatawait on older ones);
 *   2. fold in any recorded async writeback errors (lli_async_rc and the
 *      lov per-object rc);
 *   3. sync metadata to the MDT via md_fsync();
 *   4. for regular files, sync data to the OSTs with
 *      cl_sync_file_range(CL_FSYNC_ALL) and update fd_write_failed so
 *      ll_flush() will not double-report the error.
 */
2761 #ifdef HAVE_FILE_FSYNC_4ARGS
2762 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2764 struct dentry *dentry = file->f_dentry;
2765 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2766 int ll_fsync(struct file *file, int datasync)
2768 struct dentry *dentry = file->f_dentry;
2770 loff_t end = LLONG_MAX;
2772 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2775 loff_t end = LLONG_MAX;
2777 struct inode *inode = dentry->d_inode;
2778 struct ll_inode_info *lli = ll_i2info(inode);
2779 struct ptlrpc_request *req;
2780 struct obd_capa *oc;
2784 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2785 PFID(ll_inode2fid(inode)), inode);
2786 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2788 #ifdef HAVE_FILE_FSYNC_4ARGS
2789 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2790 mutex_lock(&inode->i_mutex);
2792 /* fsync's caller has already called _fdata{sync,write}, we want
2793 * that IO to finish before calling the osc and mdc sync methods */
2794 rc = filemap_fdatawait(inode->i_mapping);
2797 /* catch async errors that were recorded back when async writeback
2798 * failed for pages in this mapping. */
2799 if (!S_ISDIR(inode->i_mode)) {
2800 err = lli->lli_async_rc;
2801 lli->lli_async_rc = 0;
2804 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Metadata sync to the MDT (with MDS capability if enabled). */
2809 oc = ll_mdscapa_get(inode);
2810 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2816 ptlrpc_req_finished(req);
2818 if (S_ISREG(inode->i_mode)) {
2819 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2821 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2822 if (rc == 0 && err < 0)
/* Remember whether the data sync failed so close() reporting is right. */
2825 fd->fd_write_failed = true;
2827 fd->fd_write_failed = false;
2830 #ifdef HAVE_FILE_FSYNC_4ARGS
2831 mutex_unlock(&inode->i_mutex);
/*
 * fcntl()/flock() lock handler.
 *
 * Translates a VFS struct file_lock into an LDLM_FLOCK enqueue on the MDS:
 * the lock owner, pid and byte range go into the ldlm inodebits/flock
 * policy, the requested type maps to an LDLM mode (read -> LCK_PR,
 * write -> LCK_PW, unlock -> LCK_NL), and non-blocking / test requests map
 * to LDLM_FL_BLOCK_NOWAIT / LDLM_FL_TEST_LOCK.  On success the lock is
 * mirrored into the local VFS lock tables (flock_lock_file_wait /
 * posix_lock_file_wait); if that local step fails, the remote lock is
 * released again by enqueueing LCK_NL.
 */
2837 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2839 struct inode *inode = file->f_dentry->d_inode;
2840 struct ll_sb_info *sbi = ll_i2sbi(inode);
2841 struct ldlm_enqueue_info einfo = {
2842 .ei_type = LDLM_FLOCK,
2843 .ei_cb_cp = ldlm_flock_completion_ast,
2844 .ei_cbdata = file_lock,
2846 struct md_op_data *op_data;
2847 struct lustre_handle lockh = {0};
2848 ldlm_policy_data_t flock = {{0}};
/* Saved because einfo.ei_mode is temporarily written into fl_type below. */
2849 int fl_type = file_lock->fl_type;
2855 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2856 PFID(ll_inode2fid(inode)), file_lock);
2858 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2860 if (file_lock->fl_flags & FL_FLOCK) {
2861 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2862 /* flocks are whole-file locks */
2863 flock.l_flock.end = OFFSET_MAX;
2864 /* For flocks owner is determined by the local file descriptor */
2865 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2866 } else if (file_lock->fl_flags & FL_POSIX) {
2867 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2868 flock.l_flock.start = file_lock->fl_start;
2869 flock.l_flock.end = file_lock->fl_end;
2873 flock.l_flock.pid = file_lock->fl_pid;
2875 /* Somewhat ugly workaround for svc lockd.
2876 * lockd installs custom fl_lmops->lm_compare_owner that checks
2877 * for the fl_owner to be the same (which it always is on local node
2878 * I guess between lockd processes) and then compares pid.
2879 * As such we assign pid to the owner field to make it all work,
2880 * conflict with normal locks is unlikely since pid space and
2881 * pointer space for current->files are not intersecting */
2882 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2883 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the requested lock type to an LDLM mode (switch body is partly
 * elided in this excerpt — read locks take LCK_PR). */
2887 einfo.ei_mode = LCK_PR;
2890 /* An unlock request may or may not have any relation to
2891 * existing locks so we may not be able to pass a lock handle
2892 * via a normal ldlm_lock_cancel() request. The request may even
2893 * unlock a byte range in the middle of an existing lock. In
2894 * order to process an unlock request we need all of the same
2895 * information that is given with a normal read or write record
2896 * lock request. To avoid creating another ldlm unlock (cancel)
2897 * message we'll treat a LCK_NL flock request as an unlock. */
2898 einfo.ei_mode = LCK_NL;
2901 einfo.ei_mode = LCK_PW;
2904 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Non-blocking set: fail immediately instead of waiting on conflicts. */
2919 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style probe: test for conflicts without taking the lock. */
2925 flags = LDLM_FL_TEST_LOCK;
2928 CERROR("unknown fcntl lock command: %d\n", cmd);
2932 /* Save the old mode so that if the mode in the lock changes we
2933 * can decrement the appropriate reader or writer refcount. */
2934 file_lock->fl_type = einfo.ei_mode;
2936 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2937 LUSTRE_OPC_ANY, NULL);
2938 if (IS_ERR(op_data))
2939 RETURN(PTR_ERR(op_data));
2941 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2942 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2943 flock.l_flock.pid, flags, einfo.ei_mode,
2944 flock.l_flock.start, flock.l_flock.end);
2946 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2949 /* Restore the file lock type if not TEST lock. */
2950 if (!(flags & LDLM_FL_TEST_LOCK))
2951 file_lock->fl_type = fl_type;
/* Mirror the server-side result into the local VFS lock bookkeeping. */
2953 if ((file_lock->fl_flags & FL_FLOCK) &&
2954 (rc == 0 || file_lock->fl_type == F_UNLCK))
2955 rc2 = flock_lock_file_wait(file, file_lock);
2956 if ((file_lock->fl_flags & FL_POSIX) &&
2957 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2958 !(flags & LDLM_FL_TEST_LOCK))
2959 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed: drop the server lock to stay consistent. */
2961 if (rc2 && file_lock->fl_type != F_UNLCK) {
2962 einfo.ei_mode = LCK_NL;
2963 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2968 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of entry @name (length @namelen) under @parent.
 *
 * Issues a getattr-by-name RPC to the MDS requesting only OBD_MD_FLID and
 * copies the FID out of the reply body into *fid.  Returns 0 on success,
 * negative errno on failure (-EFAULT if the reply body is missing).
 */
2973 int ll_get_fid_by_name(struct inode *parent, const char *name,
2974 int namelen, struct lu_fid *fid)
2976 struct md_op_data *op_data = NULL;
2977 struct mdt_body *body;
2978 struct ptlrpc_request *req;
2982 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2983 LUSTRE_OPC_ANY, NULL);
2984 if (IS_ERR(op_data))
2985 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the getattr reply. */
2987 op_data->op_valid = OBD_MD_FLID;
2988 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2989 ll_finish_md_op_data(op_data);
2993 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2995 GOTO(out_req, rc = -EFAULT);
2997 *fid = body->mbo_fid1;
2999 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx.
 *
 * The child's FID is resolved either from the dcache (d_lookup) or, on a
 * cache miss, by ll_get_fid_by_name().  If the object is already on the
 * target MDT the call succeeds as a no-op; otherwise the migration is
 * performed as a same-name rename RPC carrying the CLI_MIGRATE flag.
 * Any cached child inode is invalidated (and its nlink cleared) since the
 * object identity moves to another MDT.
 */
3003 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3004 const char *name, int namelen)
3006 struct dentry *dchild = NULL;
3007 struct inode *child_inode = NULL;
3008 struct md_op_data *op_data;
3009 struct ptlrpc_request *request = NULL;
3014 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3015 name, PFID(ll_inode2fid(parent)), mdtidx);
3017 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3018 0, LUSTRE_OPC_ANY, NULL);
3019 if (IS_ERR(op_data))
3020 RETURN(PTR_ERR(op_data));
3022 /* Get child FID first */
3023 qstr.hash = full_name_hash(name, namelen);
/* Fast path: resolve the child through the dentry cache. */
3026 dchild = d_lookup(file->f_dentry, &qstr);
3027 if (dchild != NULL && dchild->d_inode != NULL) {
3028 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
/* NOTE(review): this inner d_inode check duplicates the outer one;
 * lines appear to be missing from this excerpt — verify against the
 * full source before editing. */
3029 if (dchild->d_inode != NULL) {
3030 child_inode = igrab(dchild->d_inode);
3031 ll_invalidate_aliases(child_inode);
/* Slow path: dcache miss, ask the MDS for the child FID. */
3035 rc = ll_get_fid_by_name(parent, name, namelen,
3041 if (!fid_is_sane(&op_data->op_fid3)) {
3042 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3043 ll_get_fsname(parent->i_sb, NULL, 0), name,
3044 PFID(&op_data->op_fid3));
3045 GOTO(out_free, rc = -EINVAL);
3048 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* Already on the target MDT: nothing to do. */
3053 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3054 PFID(&op_data->op_fid3), mdtidx);
3055 GOTO(out_free, rc = 0);
/* Migration is implemented as a rename-to-self with CLI_MIGRATE set. */
3058 op_data->op_mds = mdtidx;
3059 op_data->op_cli_flags = CLI_MIGRATE;
3060 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3061 namelen, name, namelen, &request);
3063 ll_update_times(request, parent);
3065 ptlrpc_req_finished(request);
3070 if (child_inode != NULL) {
/* The old inode no longer represents the object; drop its link count. */
3071 clear_nlink(child_inode);
3075 ll_finish_md_op_data(op_data);
3080 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3088 * test if some locks matching bits and l_req_mode are acquired
3089 * - bits can be in different locks
3090 * - if found clear the common lock bits in *bits
3091 * - the bits not found, are kept in *bits
3093 * \param bits [IN] searched lock bits [IN]
3094 * \param l_req_mode [IN] searched lock mode
3095 * \retval boolean, true iff all bits are found
3097 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3099 struct lustre_handle lockh;
3100 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four regular modes. */
3101 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3102 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3111 fid = &ll_i2info(inode)->lli_fid;
3112 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3113 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, do not take references on matched locks. */
3115 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit individually until all are resolved. */
3116 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3117 policy.l_inodebits.bits = *bits & (1 << i);
3118 if (policy.l_inodebits.bits == 0)
3121 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3122 &policy, mode, &lockh)) {
3123 struct ldlm_lock *lock;
3125 lock = ldlm_handle2lock(&lockh);
3128 ~(lock->l_policy_data.l_inodebits.bits);
3129 LDLM_LOCK_PUT(lock);
3131 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a granted MDS lock covering
 * @bits on @inode.  Returns the matched LDLM mode (0 if none); on
 * success the lock handle is returned through @lockh and must be
 * released by the caller with ldlm_lock_decref().
 */
3138 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3139 struct lustre_handle *lockh, __u64 flags,
3142 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3147 fid = &ll_i2info(inode)->lli_fid;
3148 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3150 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3151 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Common epilogue for inode revalidation RPCs: translate the RPC result.
 * -ENOENT on a non-regular, non-directory inode is treated as a benign
 * "already unlinked" case; other errors are logged (quietly for expected
 * -EACCES/-EIDRM) and propagated.
 */
3156 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3158 /* Already unlinked. Just update nlink and return success */
3159 if (rc == -ENOENT) {
3161 /* This path cannot be hit for regular files unless in
3162 * case of obscure races, so no need to to validate
3164 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3166 } else if (rc != 0) {
3167 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3168 "%s: revalidate FID "DFID" error: rc = %d\n",
3169 ll_get_fsname(inode->i_sb, NULL, 0),
3170 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes covered by lock bits @ibits.
 *
 * Two strategies depending on server support:
 *  - OBD_CONNECT_ATTRFID: intent getattr/lookup by FID (md_intent_lock),
 *    which also refreshes dcache state and may invalidate an unlinked
 *    dentry;
 *  - otherwise: if no suitable MD lock is cached locally, a plain
 *    md_getattr RPC (requesting EA size info for regular files) followed
 *    by ll_prep_inode() to apply the reply.
 */
3176 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3178 struct inode *inode = dentry->d_inode;
3179 struct ptlrpc_request *req = NULL;
3180 struct obd_export *exp;
3184 LASSERT(inode != NULL);
3186 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3187 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3189 exp = ll_i2mdexp(inode);
3191 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3192 * But under CMD case, it caused some lock issues, should be fixed
3193 * with new CMD ibits lock. See bug 12718 */
3194 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3195 struct lookup_intent oit = { .it_op = IT_GETATTR };
3196 struct md_op_data *op_data;
/* A pure LOOKUP-bit revalidation maps to the cheaper IT_LOOKUP intent. */
3198 if (ibits == MDS_INODELOCK_LOOKUP)
3199 oit.it_op = IT_LOOKUP;
3201 /* Call getattr by fid, so do not provide name at all. */
3202 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3203 dentry->d_inode, NULL, 0, 0,
3204 LUSTRE_OPC_ANY, NULL);
3205 if (IS_ERR(op_data))
3206 RETURN(PTR_ERR(op_data));
3208 rc = md_intent_lock(exp, op_data, &oit, &req,
3209 &ll_md_blocking_ast, 0);
3210 ll_finish_md_op_data(op_data);
3212 rc = ll_inode_revalidate_fini(inode, rc);
3216 rc = ll_revalidate_it_finish(req, &oit, dentry);
3218 ll_intent_release(&oit);
3222 /* Unlinked? Unhash dentry, so it is not picked up later by
3223 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3224 here to preserve get_cwd functionality on 2.6.
3226 if (!dentry->d_inode->i_nlink)
3227 d_lustre_invalidate(dentry, 0);
3229 ll_lookup_finish_locks(&oit, dentry);
3230 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3231 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3232 obd_valid valid = OBD_MD_FLGETATTR;
3233 struct md_op_data *op_data;
/* Regular files also need striping EA info sized into the reply. */
3236 if (S_ISREG(inode->i_mode)) {
3237 rc = ll_get_default_mdsize(sbi, &ealen);
3240 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3243 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3244 0, ealen, LUSTRE_OPC_ANY,
3246 if (IS_ERR(op_data))
3247 RETURN(PTR_ERR(op_data));
3249 op_data->op_valid = valid;
3250 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3251 * capa for this inode. Because we only keep capas of dirs
3253 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3254 ll_finish_md_op_data(op_data);
3256 rc = ll_inode_revalidate_fini(inode, rc);
/* Apply the fresh attributes from the reply to the inode. */
3260 rc = ll_prep_inode(&inode, req, NULL, NULL);
3263 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge per-stripe attributes from all MDTs
 * (via md_merge_attr on the LMV stripe metadata) and cache the combined
 * size/nlink/times in the ll_inode_info.
 */
3267 static int ll_merge_md_attr(struct inode *inode)
3269 struct cl_attr attr = { 0 };
/* Only meaningful for striped directories, which carry lsm_md. */
3272 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3273 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3278 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3279 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3281 ll_i2info(inode)->lli_atime = attr.cat_atime;
3282 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3283 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MD attributes via __ll_inode_revalidate(),
 * then refresh the size.  Striped directories merge per-stripe attrs;
 * other non-regular inodes just copy cached times; regular files glimpse
 * the size from the OSTs unless a layout restore is in progress (in which
 * case the MDT-provided size is already authoritative and a glimpse would
 * block until the restore completes).
 */
3289 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3291 struct inode *inode = dentry->d_inode;
3295 rc = __ll_inode_revalidate(dentry, ibits);
3299 /* if object isn't regular file, don't validate size */
3300 if (!S_ISREG(inode->i_mode)) {
3301 if (S_ISDIR(inode->i_mode) &&
3302 ll_i2info(inode)->lli_lsm_md != NULL) {
3303 rc = ll_merge_md_attr(inode);
3308 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3309 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3310 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3312 /* In case of restore, the MDT has the right size and has
3313 * already send it back without granting the layout lock,
3314 * inode is up-to-date so glimpse is useless.
3315 * Also to glimpse we need the layout, in case of a running
3316 * restore the MDT holds the layout lock so the glimpse will
3317 * block up to the end of restore (getattr will block)
3319 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3320 rc = ll_glimpse_size(inode);
/*
 * getattr() handler: revalidate UPDATE|LOOKUP bits, then fill *stat from
 * the (now fresh) inode.  Striped directories report the merged
 * size/nlink cached by ll_merge_md_attr(); 32-bit userspace gets an ino
 * squashed from the FID.
 */
3325 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3327 struct inode *inode = de->d_inode;
3328 struct ll_sb_info *sbi = ll_i2sbi(inode);
3329 struct ll_inode_info *lli = ll_i2info(inode);
3332 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3333 MDS_INODELOCK_LOOKUP);
3334 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3339 stat->dev = inode->i_sb->s_dev;
/* 32-bit callers need an inode number that fits in 32 bits. */
3340 if (ll_need_32bit_api(sbi))
3341 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3343 stat->ino = inode->i_ino;
3344 stat->mode = inode->i_mode;
3345 stat->uid = inode->i_uid;
3346 stat->gid = inode->i_gid;
3347 stat->rdev = inode->i_rdev;
3348 stat->atime = inode->i_atime;
3349 stat->mtime = inode->i_mtime;
3350 stat->ctime = inode->i_ctime;
3351 stat->blksize = 1 << inode->i_blkbits;
3352 stat->blocks = inode->i_blocks;
/* Striped dirs: use the merged cross-MDT nlink/size. */
3354 if (S_ISDIR(inode->i_mode) &&
3355 ll_i2info(inode)->lli_lsm_md != NULL) {
3356 stat->nlink = lli->lli_stripe_dir_nlink;
3357 stat->size = lli->lli_stripe_dir_size;
3359 stat->nlink = inode->i_nlink;
3360 stat->size = i_size_read(inode);
/*
 * FIEMAP inode operation: marshal the kernel's fiemap_extent_info into a
 * Lustre ll_user_fiemap buffer, run the mapping via ll_do_fiemap(), and
 * copy flags/extents back on success.  The temporary buffer is sized for
 * fi_extents_max extents and freed on all paths.
 */
3366 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3367 __u64 start, __u64 len)
3371 struct ll_user_fiemap *fiemap;
3372 unsigned int extent_count = fieinfo->fi_extents_max;
3374 num_bytes = sizeof(*fiemap) + (extent_count *
3375 sizeof(struct ll_fiemap_extent));
3376 OBD_ALLOC_LARGE(fiemap, num_bytes);
3381 fiemap->fm_flags = fieinfo->fi_flags;
3382 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3383 fiemap->fm_start = start;
3384 fiemap->fm_length = len;
/* Seed only the first extent slot from the caller (continuation key). */
3385 if (extent_count > 0)
3386 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3387 sizeof(struct ll_fiemap_extent));
3389 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3391 fieinfo->fi_flags = fiemap->fm_flags;
3392 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3393 if (extent_count > 0)
3394 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3395 fiemap->fm_mapped_extents *
3396 sizeof(struct ll_fiemap_extent));
3398 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode.
 * The lli_lock guards lli_posix_acl; posix_acl_dup() takes the reference
 * that the VFS permission-check path will later release.
 */
3402 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3404 struct ll_inode_info *lli = ll_i2info(inode);
3405 struct posix_acl *acl = NULL;
3408 spin_lock(&lli->lli_lock);
3409 /* VFS' acl_permission_check->check_acl will release the refcount */
3410 acl = posix_acl_dup(lli->lli_posix_acl);
3411 spin_unlock(&lli->lli_lock);
/*
 * ACL check callback for kernels whose generic_permission() takes a
 * check_acl hook (signature varies with HAVE_GENERIC_PERMISSION_4ARGS).
 * Fetches the cached ACL, evaluates it with posix_acl_permission(), and
 * releases the reference.  RCU-walk lookups bail out early (the ACL path
 * may block).  Without CONFIG_FS_POSIX_ACL this compiles to a stub.
 */
3416 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3418 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3419 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3421 ll_check_acl(struct inode *inode, int mask)
3424 # ifdef CONFIG_FS_POSIX_ACL
3425 struct posix_acl *acl;
3429 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3430 if (flags & IPERM_FLAG_RCU)
3433 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3438 rc = posix_acl_permission(inode, acl, mask);
3439 posix_acl_release(acl);
3442 # else /* !CONFIG_FS_POSIX_ACL */
3444 # endif /* CONFIG_FS_POSIX_ACL */
3446 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission() handler (signature varies with kernel version).
 *
 * Steps: refuse RCU-walk (may block); revalidate the root inode on first
 * touch; apply root squash by overriding credentials (fsuid/fsgid set to
 * the squash ids, filesystem capabilities lowered) for processes running
 * as root when squash is configured; then delegate to either the remote
 * permission check (LL_SBI_RMT_CLIENT) or ll_generic_permission() with
 * the ACL callback.  Overridden creds are reverted afterwards.
 */
3448 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3449 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3451 # ifdef HAVE_INODE_PERMISION_2ARGS
3452 int ll_inode_permission(struct inode *inode, int mask)
3454 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3459 struct ll_sb_info *sbi;
3460 struct root_squash_info *squash;
3461 struct cred *cred = NULL;
3462 const struct cred *old_cred = NULL;
3464 bool squash_id = false;
/* RCU path walk cannot block; ask the VFS to retry in ref-walk mode. */
3467 #ifdef MAY_NOT_BLOCK
3468 if (mask & MAY_NOT_BLOCK)
3470 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3471 if (flags & IPERM_FLAG_RCU)
3475 /* as root inode are NOT getting validated in lookup operation,
3476 * need to do it before permission check. */
3478 if (inode == inode->i_sb->s_root->d_inode) {
3479 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3480 MDS_INODELOCK_LOOKUP);
3485 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3486 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3488 /* squash fsuid/fsgid if needed */
3489 sbi = ll_i2sbi(inode);
3490 squash = &sbi->ll_squash;
3491 if (unlikely(squash->rsi_uid != 0 &&
3492 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3493 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3497 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3498 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3499 squash->rsi_uid, squash->rsi_gid);
3501 /* update current process's credentials
3502 * and FS capability */
3503 cred = prepare_creds();
3507 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3508 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability from the squashed creds. */
3509 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3510 if ((1 << cap) & CFS_CAP_FS_MASK)
3511 cap_lower(cred->cap_effective, cap);
3513 old_cred = override_creds(cred);
3516 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3518 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3519 rc = lustre_check_remote_perm(inode, mask);
3521 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3523 /* restore current process's credentials and FS capability */
3525 revert_creds(old_cred);
3532 /* -o localflock - only provides locally consistent flock locks */
/* Default file operations: no .flock/.lock entries, so the VFS handles
 * advisory locking locally (localflock semantics). */
3533 struct file_operations ll_file_operations = {
3534 .read = ll_file_read,
3535 .aio_read = ll_file_aio_read,
3536 .write = ll_file_write,
3537 .aio_write = ll_file_aio_write,
3538 .unlocked_ioctl = ll_file_ioctl,
3539 .open = ll_file_open,
3540 .release = ll_file_release,
3541 .mmap = ll_file_mmap,
3542 .llseek = ll_file_seek,
3543 .splice_read = ll_file_splice_read,
/* File operations for -o flock mounts: cluster-coherent advisory locks,
 * routed through ll_file_flock (LDLM flock enqueue on the MDS). */
3548 struct file_operations ll_file_operations_flock = {
3549 .read = ll_file_read,
3550 .aio_read = ll_file_aio_read,
3551 .write = ll_file_write,
3552 .aio_write = ll_file_aio_write,
3553 .unlocked_ioctl = ll_file_ioctl,
3554 .open = ll_file_open,
3555 .release = ll_file_release,
3556 .mmap = ll_file_mmap,
3557 .llseek = ll_file_seek,
3558 .splice_read = ll_file_splice_read,
3561 .flock = ll_file_flock,
3562 .lock = ll_file_flock
3565 /* These are for -o noflock - to return ENOSYS on flock calls */
3566 struct file_operations ll_file_operations_noflock = {
3567 .read = ll_file_read,
3568 .aio_read = ll_file_aio_read,
3569 .write = ll_file_write,
3570 .aio_write = ll_file_aio_write,
3571 .unlocked_ioctl = ll_file_ioctl,
3572 .open = ll_file_open,
3573 .release = ll_file_release,
3574 .mmap = ll_file_mmap,
3575 .llseek = ll_file_seek,
3576 .splice_read = ll_file_splice_read,
/* Stub handlers that reject all lock requests. */
3579 .flock = ll_file_noflock,
3580 .lock = ll_file_noflock
/* Inode operations for regular Lustre files. */
3583 struct inode_operations ll_file_inode_operations = {
3584 .setattr = ll_setattr,
3585 .getattr = ll_getattr,
3586 .permission = ll_inode_permission,
3587 .setxattr = ll_setxattr,
3588 .getxattr = ll_getxattr,
3589 .listxattr = ll_listxattr,
3590 .removexattr = ll_removexattr,
3591 .fiemap = ll_fiemap,
/* .get_acl only exists on kernels that grew the iop. */
3592 #ifdef HAVE_IOP_GET_ACL
3593 .get_acl = ll_get_acl,
3597 /* dynamic ioctl number support routins */
/* Registry of dynamically registered ioctl handlers: an rwsem-protected
 * list of llioc_data entries (defined just below).
 * NOTE(review): the struct llioc_data header line is missing from this
 * excerpt; the fields from iocd_list onward belong to it. */
3598 static struct llioc_ctl_data {
3599 struct rw_semaphore ioc_sem;
3600 struct list_head ioc_head;
3602 __RWSEM_INITIALIZER(llioc.ioc_sem),
3603 LIST_HEAD_INIT(llioc.ioc_head)
3608 struct list_head iocd_list;
/* total allocated size of this entry, for OBD_FREE on unregister */
3609 unsigned int iocd_size;
3610 llioc_callback_t iocd_cb;
/* number of valid entries in the trailing iocd_cmd[] array */
3611 unsigned int iocd_count;
3612 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler: callback @cb serving @count command
 * numbers from @cmd.  Allocates an llioc_data entry (with the command
 * array inline at its tail), links it onto the global llioc list under
 * the write rwsem, and returns the entry pointer as an opaque cookie for
 * ll_iocontrol_unregister().  Returns NULL on bad arguments or ENOMEM.
 */
3615 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3618 struct llioc_data *in_data = NULL;
3621 if (cb == NULL || cmd == NULL ||
3622 count > LLIOC_MAX_CMD || count < 0)
3625 size = sizeof(*in_data) + count * sizeof(unsigned int);
3626 OBD_ALLOC(in_data, size);
3627 if (in_data == NULL)
3630 memset(in_data, 0, sizeof(*in_data));
3631 in_data->iocd_size = size;
3632 in_data->iocd_cb = cb;
3633 in_data->iocd_count = count;
3634 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3636 down_write(&llioc.ioc_sem);
3637 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3638 up_write(&llioc.ioc_sem);
/*
 * Unregister a dynamic ioctl handler previously returned by
 * ll_iocontrol_register().  Walks the registry under the write rwsem,
 * unlinks the matching entry and frees it; warns if @magic is unknown.
 */
3643 void ll_iocontrol_unregister(void *magic)
3645 struct llioc_data *tmp;
3650 down_write(&llioc.ioc_sem);
3651 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* iocd_size was recorded at registration for exactly this free. */
3653 unsigned int size = tmp->iocd_size;
3655 list_del(&tmp->iocd_list);
3656 up_write(&llioc.ioc_sem);
3658 OBD_FREE(tmp, size);
3662 up_write(&llioc.ioc_sem);
3664 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3667 EXPORT_SYMBOL(ll_iocontrol_register);
3668 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl to registered dynamic handlers.
 * Under the read rwsem, scan each registered entry's command table; on a
 * match invoke its callback, stopping the scan if it returns LLIOC_STOP.
 * The handler's status is passed back through *rcp.
 */
3670 static enum llioc_iter
3671 ll_iocontrol_call(struct inode *inode, struct file *file,
3672 unsigned int cmd, unsigned long arg, int *rcp)
3674 enum llioc_iter ret = LLIOC_CONT;
3675 struct llioc_data *data;
3676 int rc = -EINVAL, i;
3678 down_read(&llioc.ioc_sem);
3679 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3680 for (i = 0; i < data->iocd_count; i++) {
3681 if (cmd != data->iocd_cmd[i])
3684 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3688 if (ret == LLIOC_STOP)
3691 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack for @inode via
 * cl_conf_set().  For OBJECT_CONF_SET with a layout lock: once the layout
 * is applied, allow the lock to be matched (ldlm_lock_allow_match), cache
 * whether the file has stripe objects, and bump the recorded layout
 * generation.
 */
3698 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3700 struct ll_inode_info *lli = ll_i2info(inode);
3701 struct cl_env_nest nest;
/* Nothing to configure without a cl_object (e.g. special files). */
3706 if (lli->lli_clob == NULL)
3709 env = cl_env_nested_get(&nest);
3711 RETURN(PTR_ERR(env));
3713 result = cl_conf_set(env, lli->lli_clob, conf);
3714 cl_env_nested_put(&nest, env);
3716 if (conf->coc_opc == OBJECT_CONF_SET) {
3717 struct ldlm_lock *lock = conf->coc_lock;
3719 LASSERT(lock != NULL);
3720 LASSERT(ldlm_has_layout(lock));
3722 struct lustre_md *md = conf->u.coc_md;
3723 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3725 /* it can only be allowed to match after layout is
3726 * applied to inode otherwise false layout would be
3727 * seen. Applying layout shoud happen before dropping
3728 * the intent lock. */
3729 ldlm_lock_allow_match(lock);
3731 lli->lli_has_smd = lsm_has_objects(md->lsm);
3732 if (md->lsm != NULL)
3733 gen = md->lsm->lsm_layout_gen;
3736 DFID ": layout version change: %u -> %u\n",
3737 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3739 ll_layout_version_set(lli, gen);
3745 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If the layout lock was granted via a completion AST (rather than right
 * away), its LVB buffer does not carry the layout; fetch the LOV EA with
 * a getxattr RPC and install a copy as the lock's l_lvb_data under the
 * resource lock, freeing any previous buffer.
 */
3746 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3749 struct ll_sb_info *sbi = ll_i2sbi(inode);
3750 struct obd_capa *oc;
3751 struct ptlrpc_request *req;
3752 struct mdt_body *body;
3759 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3760 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3761 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already delivered in the DLM reply LVB: nothing to fetch. */
3763 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3766 /* if layout lock was granted right away, the layout is returned
3767 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3768 * blocked and then granted via completion ast, we have to fetch
3769 * layout here. Please note that we can't use the LVB buffer in
3770 * completion AST because it doesn't have a large enough buffer */
3771 oc = ll_mdscapa_get(inode);
3772 rc = ll_get_default_mdsize(sbi, &lmmsize);
3774 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3775 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3781 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3783 GOTO(out, rc = -EPROTO);
3785 lmmsize = body->mbo_eadatasize;
3786 if (lmmsize == 0) /* empty layout */
3789 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3791 GOTO(out, rc = -EFAULT);
3793 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3794 if (lvbdata == NULL)
3795 GOTO(out, rc = -ENOMEM);
/* Swap the fetched layout in as the lock's LVB under the res lock. */
3797 memcpy(lvbdata, lmm, lmmsize);
3798 lock_res_and_lock(lock);
3799 if (lock->l_lvb_data != NULL)
3800 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3802 lock->l_lvb_data = lvbdata;
3803 lock->l_lvb_len = lmmsize;
3804 unlock_res_and_lock(lock);
3809 ptlrpc_req_finished(req);
3814 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Caller holds a reference on the layout lock identified by @lockh/@mode;
 * this function always drops it before returning.  Flow: fetch the layout
 * into the lock's LVB if needed (ll_layout_fetch), unpack it into an lsm,
 * configure the cl_object with OBJECT_CONF_SET (ll_layout_conf), and
 * report the resulting layout generation through *gen.  If the object is
 * still busy with the old layout (-EBUSY), wait via OBJECT_CONF_WAIT
 * after releasing the lock.  @reconf selects whether an already-ready LVB
 * short-circuits to just reading the current generation.
 */
3817 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3818 struct inode *inode, __u32 *gen, bool reconf)
3820 struct ll_inode_info *lli = ll_i2info(inode);
3821 struct ll_sb_info *sbi = ll_i2sbi(inode);
3822 struct ldlm_lock *lock;
3823 struct lustre_md md = { NULL };
3824 struct cl_object_conf conf;
3827 bool wait_layout = false;
3830 LASSERT(lustre_handle_is_used(lockh));
3832 lock = ldlm_handle2lock(lockh);
3833 LASSERT(lock != NULL);
3834 LASSERT(ldlm_has_layout(lock));
3836 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3837 PFID(&lli->lli_fid), inode, reconf);
3839 /* in case this is a caching lock and reinstate with new inode */
3840 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3842 lock_res_and_lock(lock);
3843 lvb_ready = ldlm_is_lvb_ready(lock);
3844 unlock_res_and_lock(lock);
3845 /* checking lvb_ready is racy but this is okay. The worst case is
3846 * that multi processes may configure the file on the same time. */
3848 if (lvb_ready || !reconf) {
3851 /* layout_gen must be valid if layout lock is not
3852 * cancelled and stripe has already set */
3853 *gen = ll_layout_version_get(lli);
3859 rc = ll_layout_fetch(inode, lock);
3863 /* for layout lock, lmm is returned in lock's lvb.
3864 * lvb_data is immutable if the lock is held so it's safe to access it
3865 * without res lock. See the description in ldlm_lock_decref_internal()
3866 * for the condition to free lvb_data of layout lock */
3867 if (lock->l_lvb_data != NULL) {
3868 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3869 lock->l_lvb_data, lock->l_lvb_len);
3871 *gen = LL_LAYOUT_GEN_EMPTY;
3873 *gen = md.lsm->lsm_layout_gen;
3876 CERROR("%s: file "DFID" unpackmd error: %d\n",
3877 ll_get_fsname(inode->i_sb, NULL, 0),
3878 PFID(&lli->lli_fid), rc);
3884 /* set layout to file. Unlikely this will fail as old layout was
3885 * surely eliminated */
3886 memset(&conf, 0, sizeof conf);
3887 conf.coc_opc = OBJECT_CONF_SET;
3888 conf.coc_inode = inode;
3889 conf.coc_lock = lock;
3890 conf.u.coc_md = &md;
3891 rc = ll_layout_conf(inode, &conf);
3894 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3896 /* refresh layout failed, need to wait */
3897 wait_layout = rc == -EBUSY;
/* Drop our reference and the enqueue reference on the layout lock. */
3901 LDLM_LOCK_PUT(lock);
3902 ldlm_lock_decref(lockh, mode);
3904 /* wait for IO to complete if it's still being used. */
3906 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3907 ll_get_fsname(inode->i_sb, NULL, 0),
3908 PFID(&lli->lli_fid), inode);
3910 memset(&conf, 0, sizeof conf);
3911 conf.coc_opc = OBJECT_CONF_WAIT;
3912 conf.coc_inode = inode;
3913 rc = ll_layout_conf(inode, &conf);
3917 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3918 ll_get_fsname(inode->i_sb, NULL, 0),
3919 PFID(&lli->lli_fid), rc);
3925 * This function checks if there exists a LAYOUT lock on the client side,
3926 * or enqueues it if it doesn't have one in cache.
3928 * This function will not hold layout lock so it may be revoked any time after
3929 * this function returns. Any operations depend on layout should be redone
3932 * This function should be called before lov_io_init() to get an uptodate
3933 * layout version, the caller should save the version number and after IO
3934 * is finished, this function should be called again to verify that layout
3935 * is not changed during IO time.
/*
 * Implementation: fast-exit when layout locking is disabled or a valid
 * generation is already cached.  Otherwise, under lli_layout_mutex (so
 * only one thread enqueues at a time): first try to match a cached layout
 * lock via ll_take_md_lock(); on a miss, enqueue an IT_LAYOUT intent lock
 * on the MDS and apply the result with ll_layout_lock_set(), which also
 * releases the lock reference.
 */
3937 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3939 struct ll_inode_info *lli = ll_i2info(inode);
3940 struct ll_sb_info *sbi = ll_i2sbi(inode);
3941 struct md_op_data *op_data;
3942 struct lookup_intent it;
3943 struct lustre_handle lockh;
3945 struct ldlm_enqueue_info einfo = {
3946 .ei_type = LDLM_IBITS,
3948 .ei_cb_bl = &ll_md_blocking_ast,
3949 .ei_cb_cp = &ldlm_completion_ast,
3954 *gen = ll_layout_version_get(lli);
3955 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
/* Layout locks only apply to sane-FID regular files. */
3959 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3960 LASSERT(S_ISREG(inode->i_mode));
3962 /* take layout lock mutex to enqueue layout lock exclusively. */
3963 mutex_lock(&lli->lli_layout_mutex);
3966 /* mostly layout lock is caching on the local side, so try to match
3967 * it before grabbing layout lock mutex. */
3968 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3969 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3970 if (mode != 0) { /* hit cached lock */
3971 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3975 mutex_unlock(&lli->lli_layout_mutex);
3979 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3980 0, 0, LUSTRE_OPC_ANY, NULL);
3981 if (IS_ERR(op_data)) {
3982 mutex_unlock(&lli->lli_layout_mutex);
3983 RETURN(PTR_ERR(op_data));
3986 /* have to enqueue one */
3987 memset(&it, 0, sizeof(it));
3988 it.it_op = IT_LAYOUT;
3989 lockh.cookie = 0ULL;
3991 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3992 ll_get_fsname(inode->i_sb, NULL, 0),
3993 PFID(&lli->lli_fid), inode);
3995 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* The intent's request is not needed once the lock handle is known. */
3996 if (it.d.lustre.it_data != NULL)
3997 ptlrpc_req_finished(it.d.lustre.it_data);
3998 it.d.lustre.it_data = NULL;
4000 ll_finish_md_op_data(op_data);
/* Take ownership of the granted mode; the intent no longer holds it. */
4002 mode = it.d.lustre.it_lock_mode;
4003 it.d.lustre.it_lock_mode = 0;
4004 ll_intent_drop_lock(&it);
4007 /* set lock data in case this is a new lock */
4008 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4009 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4013 mutex_unlock(&lli->lli_layout_mutex);
4019 * This function send a restore request to the MDT
4021 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4023 struct hsm_user_request *hur;
4027 len = sizeof(struct hsm_user_request) +
4028 sizeof(struct hsm_user_item);
4029 OBD_ALLOC(hur, len);
4033 hur->hur_request.hr_action = HUA_RESTORE;
4034 hur->hur_request.hr_archive_id = 0;
4035 hur->hur_request.hr_flags = 0;
4036 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4037 sizeof(hur->hur_user_item[0].hui_fid));
4038 hur->hur_user_item[0].hui_extent.offset = offset;
4039 hur->hur_user_item[0].hui_extent.length = length;
4040 hur->hur_request.hr_itemcount = 1;
4041 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,