4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2014, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
52 #include "cl_object.h"
55 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
57 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
60 static enum llioc_iter
61 ll_iocontrol_call(struct inode *inode, struct file *file,
62 unsigned int cmd, unsigned long arg, int *rcp);
64 static struct ll_file_data *ll_file_data_get(void)
66 struct ll_file_data *fd;
68 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
72 fd->fd_write_failed = false;
77 static void ll_file_data_put(struct ll_file_data *fd)
80 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
83 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
84 struct lustre_handle *fh)
86 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
87 op_data->op_attr.ia_mode = inode->i_mode;
88 op_data->op_attr.ia_atime = inode->i_atime;
89 op_data->op_attr.ia_mtime = inode->i_mtime;
90 op_data->op_attr.ia_ctime = inode->i_ctime;
91 op_data->op_attr.ia_size = i_size_read(inode);
92 op_data->op_attr_blocks = inode->i_blocks;
93 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
94 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
96 op_data->op_handle = *fh;
97 op_data->op_capa1 = ll_mdscapa_get(inode);
99 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
100 op_data->op_bias |= MDS_DATA_MODIFIED;
104 * Closes the IO epoch and packs all the attributes into @op_data for
107 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
108 struct obd_client_handle *och)
112 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
113 ATTR_MTIME | ATTR_MTIME_SET |
114 ATTR_CTIME | ATTR_CTIME_SET;
116 if (!(och->och_flags & FMODE_WRITE))
119 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
120 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
122 ll_ioepoch_close(inode, op_data, &och, 0);
125 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
126 ll_prep_md_op_data(op_data, inode, NULL, NULL,
127 0, 0, LUSTRE_OPC_ANY, NULL);
131 static int ll_close_inode_openhandle(struct obd_export *md_exp,
133 struct obd_client_handle *och,
134 const __u64 *data_version)
136 struct obd_export *exp = ll_i2mdexp(inode);
137 struct md_op_data *op_data;
138 struct ptlrpc_request *req = NULL;
139 struct obd_device *obd = class_exp2obd(exp);
146 * XXX: in case of LMV, is this correct to access
149 CERROR("Invalid MDC connection handle "LPX64"\n",
150 ll_i2mdexp(inode)->exp_handle.h_cookie);
154 OBD_ALLOC_PTR(op_data);
156 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
158 ll_prepare_close(inode, op_data, och);
159 if (data_version != NULL) {
160 /* Pass in data_version implies release. */
161 op_data->op_bias |= MDS_HSM_RELEASE;
162 op_data->op_data_version = *data_version;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
166 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
167 rc = md_close(md_exp, op_data, och->och_mod, &req);
169 /* This close must have the epoch closed. */
170 LASSERT(epoch_close);
171 /* MDS has instructed us to obtain Size-on-MDS attribute from
172 * OSTs and send setattr to back to MDS. */
173 rc = ll_som_update(inode, op_data);
175 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
176 " failed: rc = %d\n",
177 ll_i2mdexp(inode)->exp_obd->obd_name,
178 PFID(ll_inode2fid(inode)), rc);
182 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
183 ll_i2mdexp(inode)->exp_obd->obd_name,
184 PFID(ll_inode2fid(inode)), rc);
187 /* DATA_MODIFIED flag was successfully sent on close, cancel data
188 * modification flag. */
189 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
190 struct ll_inode_info *lli = ll_i2info(inode);
192 spin_lock(&lli->lli_lock);
193 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
194 spin_unlock(&lli->lli_lock);
197 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
198 struct mdt_body *body;
199 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
200 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
204 ll_finish_md_op_data(op_data);
208 if (exp_connect_som(exp) && !epoch_close &&
209 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
210 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
212 md_clear_open_replay_data(md_exp, och);
213 /* Free @och if it is not waiting for DONE_WRITING. */
214 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
217 if (req) /* This is close request */
218 ptlrpc_req_finished(req);
222 int ll_md_real_close(struct inode *inode, fmode_t fmode)
224 struct ll_inode_info *lli = ll_i2info(inode);
225 struct obd_client_handle **och_p;
226 struct obd_client_handle *och;
231 if (fmode & FMODE_WRITE) {
232 och_p = &lli->lli_mds_write_och;
233 och_usecount = &lli->lli_open_fd_write_count;
234 } else if (fmode & FMODE_EXEC) {
235 och_p = &lli->lli_mds_exec_och;
236 och_usecount = &lli->lli_open_fd_exec_count;
238 LASSERT(fmode & FMODE_READ);
239 och_p = &lli->lli_mds_read_och;
240 och_usecount = &lli->lli_open_fd_read_count;
243 mutex_lock(&lli->lli_och_mutex);
244 if (*och_usecount > 0) {
245 /* There are still users of this handle, so skip
247 mutex_unlock(&lli->lli_och_mutex);
253 mutex_unlock(&lli->lli_och_mutex);
256 /* There might be a race and this handle may already
258 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
265 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
268 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
269 struct ll_inode_info *lli = ll_i2info(inode);
273 /* clear group lock, if present */
274 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
275 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
277 if (fd->fd_lease_och != NULL) {
280 /* Usually the lease is not released when the
281 * application crashed, we need to release here. */
282 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
283 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
284 PFID(&lli->lli_fid), rc, lease_broken);
286 fd->fd_lease_och = NULL;
289 if (fd->fd_och != NULL) {
290 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
295 /* Let's see if we have good enough OPEN lock on the file and if
296 we can skip talking to MDS */
297 if (file->f_dentry->d_inode) { /* Can this ever be false? */
299 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
300 struct lustre_handle lockh;
301 struct inode *inode = file->f_dentry->d_inode;
302 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
304 mutex_lock(&lli->lli_och_mutex);
305 if (fd->fd_omode & FMODE_WRITE) {
307 LASSERT(lli->lli_open_fd_write_count);
308 lli->lli_open_fd_write_count--;
309 } else if (fd->fd_omode & FMODE_EXEC) {
311 LASSERT(lli->lli_open_fd_exec_count);
312 lli->lli_open_fd_exec_count--;
315 LASSERT(lli->lli_open_fd_read_count);
316 lli->lli_open_fd_read_count--;
318 mutex_unlock(&lli->lli_och_mutex);
320 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
321 LDLM_IBITS, &policy, lockmode,
323 rc = ll_md_real_close(file->f_dentry->d_inode,
327 CERROR("released file has negative dentry: file = %p, "
328 "dentry = %p, name = %s\n",
329 file, file->f_dentry, file->f_dentry->d_name.name);
333 LUSTRE_FPRIVATE(file) = NULL;
334 ll_file_data_put(fd);
335 ll_capa_close(inode);
340 /* While this returns an error code, fput() the caller does not, so we need
341 * to make every effort to clean up all of our state here. Also, applications
342 * rarely check close errors and even if an error is returned they will not
343 * re-try the close call.
345 int ll_file_release(struct inode *inode, struct file *file)
347 struct ll_file_data *fd;
348 struct ll_sb_info *sbi = ll_i2sbi(inode);
349 struct ll_inode_info *lli = ll_i2info(inode);
353 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
354 PFID(ll_inode2fid(inode)), inode);
356 #ifdef CONFIG_FS_POSIX_ACL
357 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
358 inode == inode->i_sb->s_root->d_inode) {
359 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
362 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
363 fd->fd_flags &= ~LL_FILE_RMTACL;
364 rct_del(&sbi->ll_rct, current_pid());
365 et_search_free(&sbi->ll_et, current_pid());
370 if (inode->i_sb->s_root != file->f_dentry)
371 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
372 fd = LUSTRE_FPRIVATE(file);
375 /* The last ref on @file, maybe not the the owner pid of statahead,
376 * because parent and child process can share the same file handle. */
377 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
378 ll_deauthorize_statahead(inode, fd);
380 if (inode->i_sb->s_root == file->f_dentry) {
381 LUSTRE_FPRIVATE(file) = NULL;
382 ll_file_data_put(fd);
386 if (!S_ISDIR(inode->i_mode)) {
387 if (lli->lli_clob != NULL)
388 lov_read_and_clear_async_rc(lli->lli_clob);
389 lli->lli_async_rc = 0;
392 rc = ll_md_close(sbi->ll_md_exp, inode, file);
394 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
395 libcfs_debug_dumplog();
400 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
401 struct lookup_intent *itp)
403 struct dentry *de = file->f_dentry;
404 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
405 struct dentry *parent = de->d_parent;
406 const char *name = NULL;
408 struct md_op_data *op_data;
409 struct ptlrpc_request *req = NULL;
413 LASSERT(parent != NULL);
414 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
416 /* if server supports open-by-fid, or file name is invalid, don't pack
417 * name in open request */
418 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
419 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
420 name = de->d_name.name;
421 len = de->d_name.len;
424 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
425 name, len, 0, LUSTRE_OPC_ANY, NULL);
427 RETURN(PTR_ERR(op_data));
428 op_data->op_data = lmm;
429 op_data->op_data_size = lmmsize;
431 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
432 &ll_md_blocking_ast, 0);
433 ll_finish_md_op_data(op_data);
435 /* reason for keep own exit path - don`t flood log
436 * with messages with -ESTALE errors.
438 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
439 it_open_error(DISP_OPEN_OPEN, itp))
441 ll_release_openhandle(de, itp);
445 if (it_disposition(itp, DISP_LOOKUP_NEG))
446 GOTO(out, rc = -ENOENT);
448 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
449 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
450 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
454 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
455 if (!rc && itp->d.lustre.it_lock_mode)
456 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
459 ptlrpc_req_finished(req);
460 ll_intent_drop_lock(itp);
466 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
467 * not believe attributes if a few ioepoch holders exist. Attributes for
468 * previous ioepoch if new one is opened are also skipped by MDS.
470 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
472 if (ioepoch && lli->lli_ioepoch != ioepoch) {
473 lli->lli_ioepoch = ioepoch;
474 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
475 ioepoch, PFID(&lli->lli_fid));
479 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
480 struct obd_client_handle *och)
482 struct ptlrpc_request *req = it->d.lustre.it_data;
483 struct mdt_body *body;
485 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
486 och->och_fh = body->mbo_handle;
487 och->och_fid = body->mbo_fid1;
488 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
489 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
490 och->och_flags = it->it_flags;
492 return md_set_open_replay_data(md_exp, och, it);
495 static int ll_local_open(struct file *file, struct lookup_intent *it,
496 struct ll_file_data *fd, struct obd_client_handle *och)
498 struct inode *inode = file->f_dentry->d_inode;
499 struct ll_inode_info *lli = ll_i2info(inode);
502 LASSERT(!LUSTRE_FPRIVATE(file));
507 struct ptlrpc_request *req = it->d.lustre.it_data;
508 struct mdt_body *body;
511 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
515 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
516 ll_ioepoch_open(lli, body->mbo_ioepoch);
519 LUSTRE_FPRIVATE(file) = fd;
520 ll_readahead_init(inode, &fd->fd_ras);
521 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
523 /* ll_cl_context initialize */
524 rwlock_init(&fd->fd_lock);
525 INIT_LIST_HEAD(&fd->fd_lccs);
530 /* Open a file, and (for the very first open) create objects on the OSTs at
531 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
532 * creation or open until ll_lov_setstripe() ioctl is called.
534 * If we already have the stripe MD locally then we don't request it in
535 * md_open(), by passing a lmm_size = 0.
537 * It is up to the application to ensure no other processes open this file
538 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
539 * used. We might be able to avoid races of that sort by getting lli_open_sem
540 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
541 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
543 int ll_file_open(struct inode *inode, struct file *file)
545 struct ll_inode_info *lli = ll_i2info(inode);
546 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
547 .it_flags = file->f_flags };
548 struct obd_client_handle **och_p = NULL;
549 __u64 *och_usecount = NULL;
550 struct ll_file_data *fd;
554 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
555 PFID(ll_inode2fid(inode)), inode, file->f_flags);
557 it = file->private_data; /* XXX: compat macro */
558 file->private_data = NULL; /* prevent ll_local_open assertion */
560 fd = ll_file_data_get();
562 GOTO(out_openerr, rc = -ENOMEM);
565 if (S_ISDIR(inode->i_mode))
566 ll_authorize_statahead(inode, fd);
568 if (inode->i_sb->s_root == file->f_dentry) {
569 LUSTRE_FPRIVATE(file) = fd;
573 if (!it || !it->d.lustre.it_disposition) {
574 /* Convert f_flags into access mode. We cannot use file->f_mode,
575 * because everything but O_ACCMODE mask was stripped from
577 if ((oit.it_flags + 1) & O_ACCMODE)
579 if (file->f_flags & O_TRUNC)
580 oit.it_flags |= FMODE_WRITE;
582 /* kernel only call f_op->open in dentry_open. filp_open calls
583 * dentry_open after call to open_namei that checks permissions.
584 * Only nfsd_open call dentry_open directly without checking
585 * permissions and because of that this code below is safe. */
586 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
587 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
589 /* We do not want O_EXCL here, presumably we opened the file
590 * already? XXX - NFS implications? */
591 oit.it_flags &= ~O_EXCL;
593 /* bug20584, if "it_flags" contains O_CREAT, the file will be
594 * created if necessary, then "IT_CREAT" should be set to keep
595 * consistent with it */
596 if (oit.it_flags & O_CREAT)
597 oit.it_op |= IT_CREAT;
603 /* Let's see if we have file open on MDS already. */
604 if (it->it_flags & FMODE_WRITE) {
605 och_p = &lli->lli_mds_write_och;
606 och_usecount = &lli->lli_open_fd_write_count;
607 } else if (it->it_flags & FMODE_EXEC) {
608 och_p = &lli->lli_mds_exec_och;
609 och_usecount = &lli->lli_open_fd_exec_count;
611 och_p = &lli->lli_mds_read_och;
612 och_usecount = &lli->lli_open_fd_read_count;
615 mutex_lock(&lli->lli_och_mutex);
616 if (*och_p) { /* Open handle is present */
617 if (it_disposition(it, DISP_OPEN_OPEN)) {
618 /* Well, there's extra open request that we do not need,
619 let's close it somehow. This will decref request. */
620 rc = it_open_error(DISP_OPEN_OPEN, it);
622 mutex_unlock(&lli->lli_och_mutex);
623 GOTO(out_openerr, rc);
626 ll_release_openhandle(file->f_dentry, it);
630 rc = ll_local_open(file, it, fd, NULL);
633 mutex_unlock(&lli->lli_och_mutex);
634 GOTO(out_openerr, rc);
637 LASSERT(*och_usecount == 0);
638 if (!it->d.lustre.it_disposition) {
639 /* We cannot just request lock handle now, new ELC code
640 means that one of other OPEN locks for this file
641 could be cancelled, and since blocking ast handler
642 would attempt to grab och_mutex as well, that would
643 result in a deadlock */
644 mutex_unlock(&lli->lli_och_mutex);
646 * Normally called under two situations:
648 * 2. A race/condition on MDS resulting in no open
649 * handle to be returned from LOOKUP|OPEN request,
650 * for example if the target entry was a symlink.
652 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
654 * Always specify MDS_OPEN_BY_FID because we don't want
655 * to get file with different fid.
657 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
658 rc = ll_intent_file_open(file, NULL, 0, it);
660 GOTO(out_openerr, rc);
664 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
666 GOTO(out_och_free, rc = -ENOMEM);
670 /* md_intent_lock() didn't get a request ref if there was an
671 * open error, so don't do cleanup on the request here
673 /* XXX (green): Should not we bail out on any error here, not
674 * just open error? */
675 rc = it_open_error(DISP_OPEN_OPEN, it);
677 GOTO(out_och_free, rc);
679 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
680 "inode %p: disposition %x, status %d\n", inode,
681 it_disposition(it, ~0), it->d.lustre.it_status);
683 rc = ll_local_open(file, it, fd, *och_p);
685 GOTO(out_och_free, rc);
687 mutex_unlock(&lli->lli_och_mutex);
690 /* Must do this outside lli_och_mutex lock to prevent deadlock where
691 different kind of OPEN lock for this same inode gets cancelled
692 by ldlm_cancel_lru */
693 if (!S_ISREG(inode->i_mode))
694 GOTO(out_och_free, rc);
698 if (!lli->lli_has_smd &&
699 (cl_is_lov_delay_create(file->f_flags) ||
700 (file->f_mode & FMODE_WRITE) == 0)) {
701 CDEBUG(D_INODE, "object creation was delayed\n");
702 GOTO(out_och_free, rc);
704 cl_lov_delay_create_clear(&file->f_flags);
705 GOTO(out_och_free, rc);
709 if (och_p && *och_p) {
710 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
711 *och_p = NULL; /* OBD_FREE writes some magic there */
714 mutex_unlock(&lli->lli_och_mutex);
717 if (lli->lli_opendir_key == fd)
718 ll_deauthorize_statahead(inode, fd);
720 ll_file_data_put(fd);
722 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
725 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
726 ptlrpc_req_finished(it->d.lustre.it_data);
727 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
733 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
734 struct ldlm_lock_desc *desc, void *data, int flag)
737 struct lustre_handle lockh;
741 case LDLM_CB_BLOCKING:
742 ldlm_lock2handle(lock, &lockh);
743 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
745 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
749 case LDLM_CB_CANCELING:
757 * Acquire a lease and open the file.
759 static struct obd_client_handle *
760 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
763 struct lookup_intent it = { .it_op = IT_OPEN };
764 struct ll_sb_info *sbi = ll_i2sbi(inode);
765 struct md_op_data *op_data;
766 struct ptlrpc_request *req = NULL;
767 struct lustre_handle old_handle = { 0 };
768 struct obd_client_handle *och = NULL;
773 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
774 RETURN(ERR_PTR(-EINVAL));
777 struct ll_inode_info *lli = ll_i2info(inode);
778 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
779 struct obd_client_handle **och_p;
782 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
783 RETURN(ERR_PTR(-EPERM));
785 /* Get the openhandle of the file */
787 mutex_lock(&lli->lli_och_mutex);
788 if (fd->fd_lease_och != NULL) {
789 mutex_unlock(&lli->lli_och_mutex);
793 if (fd->fd_och == NULL) {
794 if (file->f_mode & FMODE_WRITE) {
795 LASSERT(lli->lli_mds_write_och != NULL);
796 och_p = &lli->lli_mds_write_och;
797 och_usecount = &lli->lli_open_fd_write_count;
799 LASSERT(lli->lli_mds_read_och != NULL);
800 och_p = &lli->lli_mds_read_och;
801 och_usecount = &lli->lli_open_fd_read_count;
803 if (*och_usecount == 1) {
810 mutex_unlock(&lli->lli_och_mutex);
811 if (rc < 0) /* more than 1 opener */
814 LASSERT(fd->fd_och != NULL);
815 old_handle = fd->fd_och->och_fh;
820 RETURN(ERR_PTR(-ENOMEM));
822 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
823 LUSTRE_OPC_ANY, NULL);
825 GOTO(out, rc = PTR_ERR(op_data));
827 /* To tell the MDT this openhandle is from the same owner */
828 op_data->op_handle = old_handle;
830 it.it_flags = fmode | open_flags;
831 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
832 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
833 &ll_md_blocking_lease_ast,
834 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
835 * it can be cancelled which may mislead applications that the lease is
837 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
838 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
839 * doesn't deal with openhandle, so normal openhandle will be leaked. */
840 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
841 ll_finish_md_op_data(op_data);
842 ptlrpc_req_finished(req);
844 GOTO(out_release_it, rc);
846 if (it_disposition(&it, DISP_LOOKUP_NEG))
847 GOTO(out_release_it, rc = -ENOENT);
849 rc = it_open_error(DISP_OPEN_OPEN, &it);
851 GOTO(out_release_it, rc);
853 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
854 ll_och_fill(sbi->ll_md_exp, &it, och);
856 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
857 GOTO(out_close, rc = -EOPNOTSUPP);
859 /* already get lease, handle lease lock */
860 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
861 if (it.d.lustre.it_lock_mode == 0 ||
862 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
863 /* open lock must return for lease */
864 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
865 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
866 it.d.lustre.it_lock_bits);
867 GOTO(out_close, rc = -EPROTO);
870 ll_intent_release(&it);
874 /* Cancel open lock */
875 if (it.d.lustre.it_lock_mode != 0) {
876 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
877 it.d.lustre.it_lock_mode);
878 it.d.lustre.it_lock_mode = 0;
879 och->och_lease_handle.cookie = 0ULL;
881 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
883 CERROR("%s: error closing file "DFID": %d\n",
884 ll_get_fsname(inode->i_sb, NULL, 0),
885 PFID(&ll_i2info(inode)->lli_fid), rc2);
886 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
888 ll_intent_release(&it);
896 * Release lease and close the file.
897 * It will check if the lease has ever broken.
899 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
902 struct ldlm_lock *lock;
903 bool cancelled = true;
907 lock = ldlm_handle2lock(&och->och_lease_handle);
909 lock_res_and_lock(lock);
910 cancelled = ldlm_is_cancel(lock);
911 unlock_res_and_lock(lock);
915 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
916 PFID(&ll_i2info(inode)->lli_fid), cancelled);
919 ldlm_cli_cancel(&och->och_lease_handle, 0);
920 if (lease_broken != NULL)
921 *lease_broken = cancelled;
923 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
928 /* Fills the obdo with the attributes for the lsm */
929 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
930 struct obd_capa *capa, struct obdo *obdo,
931 __u64 ioepoch, int dv_flags)
933 struct ptlrpc_request_set *set;
934 struct obd_info oinfo = { { { 0 } } };
939 LASSERT(lsm != NULL);
943 oinfo.oi_oa->o_oi = lsm->lsm_oi;
944 oinfo.oi_oa->o_mode = S_IFREG;
945 oinfo.oi_oa->o_ioepoch = ioepoch;
946 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
947 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
948 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
949 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
950 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
951 OBD_MD_FLDATAVERSION;
952 oinfo.oi_capa = capa;
953 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
954 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
955 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
956 if (dv_flags & LL_DV_WR_FLUSH)
957 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
960 set = ptlrpc_prep_set();
962 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
965 rc = obd_getattr_async(exp, &oinfo, set);
967 rc = ptlrpc_set_wait(set);
968 ptlrpc_set_destroy(set);
971 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
972 OBD_MD_FLATIME | OBD_MD_FLMTIME |
973 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
974 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
975 if (dv_flags & LL_DV_WR_FLUSH &&
976 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
977 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
984 * Performs the getattr on the inode and updates its fields.
985 * If @sync != 0, perform the getattr under the server-side lock.
987 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
988 __u64 ioepoch, int sync)
990 struct obd_capa *capa = ll_mdscapa_get(inode);
991 struct lov_stripe_md *lsm;
995 lsm = ccc_inode_lsm_get(inode);
996 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
997 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1000 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
1002 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1003 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1004 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1005 (unsigned long long)inode->i_blocks,
1006 1UL << inode->i_blkbits);
1008 ccc_inode_lsm_put(inode, lsm);
1012 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1014 struct ll_inode_info *lli = ll_i2info(inode);
1015 struct cl_object *obj = lli->lli_clob;
1016 struct cl_attr *attr = ccc_env_thread_attr(env);
1024 ll_inode_size_lock(inode);
1026 /* merge timestamps the most recently obtained from mds with
1027 timestamps obtained from osts */
1028 LTIME_S(inode->i_atime) = lli->lli_atime;
1029 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1030 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1032 atime = LTIME_S(inode->i_atime);
1033 mtime = LTIME_S(inode->i_mtime);
1034 ctime = LTIME_S(inode->i_ctime);
1036 cl_object_attr_lock(obj);
1037 rc = cl_object_attr_get(env, obj, attr);
1038 cl_object_attr_unlock(obj);
1041 GOTO(out_size_unlock, rc);
1043 if (atime < attr->cat_atime)
1044 atime = attr->cat_atime;
1046 if (ctime < attr->cat_ctime)
1047 ctime = attr->cat_ctime;
1049 if (mtime < attr->cat_mtime)
1050 mtime = attr->cat_mtime;
1052 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1053 PFID(&lli->lli_fid), attr->cat_size);
1055 i_size_write(inode, attr->cat_size);
1056 inode->i_blocks = attr->cat_blocks;
1058 LTIME_S(inode->i_atime) = atime;
1059 LTIME_S(inode->i_mtime) = mtime;
1060 LTIME_S(inode->i_ctime) = ctime;
1063 ll_inode_size_unlock(inode);
1068 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1071 struct obdo obdo = { 0 };
1074 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1076 st->st_size = obdo.o_size;
1077 st->st_blocks = obdo.o_blocks;
1078 st->st_mtime = obdo.o_mtime;
1079 st->st_atime = obdo.o_atime;
1080 st->st_ctime = obdo.o_ctime;
1085 static bool file_is_noatime(const struct file *file)
1087 const struct vfsmount *mnt = file->f_path.mnt;
1088 const struct inode *inode = file->f_path.dentry->d_inode;
1090 /* Adapted from file_accessed() and touch_atime().*/
1091 if (file->f_flags & O_NOATIME)
1094 if (inode->i_flags & S_NOATIME)
1097 if (IS_NOATIME(inode))
1100 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1103 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1106 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1112 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1114 struct inode *inode = file->f_dentry->d_inode;
1116 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1118 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1119 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1120 file->f_flags & O_DIRECT ||
1123 io->ci_obj = ll_i2info(inode)->lli_clob;
1124 io->ci_lockreq = CILR_MAYBE;
1125 if (ll_file_nolock(file)) {
1126 io->ci_lockreq = CILR_NEVER;
1127 io->ci_no_srvlock = 1;
1128 } else if (file->f_flags & O_APPEND) {
1129 io->ci_lockreq = CILR_MANDATORY;
1132 io->ci_noatime = file_is_noatime(file);
1136 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1137 struct file *file, enum cl_io_type iot,
1138 loff_t *ppos, size_t count)
1140 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1141 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1144 struct range_lock range;
1147 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1148 file->f_dentry->d_name.name, iot, *ppos, count);
1151 io = ccc_env_thread_io(env);
1152 ll_io_init(io, file, iot == CIT_WRITE);
1154 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1155 struct vvp_io *cio = vvp_env_io(env);
1156 bool range_locked = false;
1158 if (file->f_flags & O_APPEND)
1159 range_lock_init(&range, 0, LUSTRE_EOF);
1161 range_lock_init(&range, *ppos, *ppos + count - 1);
1163 cio->cui_fd = LUSTRE_FPRIVATE(file);
1164 cio->cui_io_subtype = args->via_io_subtype;
1166 switch (cio->cui_io_subtype) {
1168 cio->cui_iov = args->u.normal.via_iov;
1169 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1170 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1171 cio->cui_iocb = args->u.normal.via_iocb;
1172 if ((iot == CIT_WRITE) &&
1173 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1174 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1176 result = range_lock(&lli->lli_write_tree,
1181 range_locked = true;
1183 down_read(&lli->lli_trunc_sem);
1186 cio->u.splice.cui_pipe = args->u.splice.via_pipe;
1187 cio->u.splice.cui_flags = args->u.splice.via_flags;
1190 CERROR("unknown IO subtype %u\n", cio->cui_io_subtype);
1194 ll_cl_add(file, env, io);
1195 result = cl_io_loop(env, io);
1196 ll_cl_remove(file, env);
1198 if (args->via_io_subtype == IO_NORMAL)
1199 up_read(&lli->lli_trunc_sem);
1201 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1203 range_unlock(&lli->lli_write_tree, &range);
1206 /* cl_io_rw_init() handled IO */
1207 result = io->ci_result;
1210 if (io->ci_nob > 0) {
1211 result = io->ci_nob;
1212 *ppos = io->u.ci_wr.wr.crw_pos;
1216 cl_io_fini(env, io);
1217 /* If any bit been read/written (result != 0), we just return
1218 * short read/write instead of restart io. */
1219 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1220 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zu\n",
1221 iot == CIT_READ ? "read" : "write",
1222 file->f_dentry->d_name.name, *ppos, count);
1223 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1227 if (iot == CIT_READ) {
1229 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1230 LPROC_LL_READ_BYTES, result);
1231 } else if (iot == CIT_WRITE) {
1233 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1234 LPROC_LL_WRITE_BYTES, result);
1235 fd->fd_write_failed = false;
1236 } else if (result != -ERESTARTSYS) {
1237 fd->fd_write_failed = true;
1240 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
/*
 * Validate an iovec array and compute the total transfer byte count.
 * Rejects any segment with a negative length or a cumulative length
 * that wraps negative (-EINVAL per visible comment).  NOTE(review):
 * interior lines are elided in this view; the exact truncation of
 * *nr_segs/*count at the first segment failing access_ok() is
 * inferred from the surviving fragment — confirm against upstream.
 */
1247  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1249 static int ll_file_get_iov_count(const struct iovec *iov,
1250                                 unsigned long *nr_segs, size_t *count)
1255 	for (seg = 0; seg < *nr_segs; seg++) {
1256 		const struct iovec *iv = &iov[seg];
1259 		 * If any segment has a negative length, or the cumulative
1260 		 * length ever wraps negative then return -EINVAL.
1263 		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1265 		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1270 		cnt -= iv->iov_len;	/* This segment is no good */
/*
 * Vector read entry point (aio_read).  Validates the user iovec via
 * ll_file_get_iov_count(), stores the iovec/iocb into the per-env
 * vvp_io_args (IO_NORMAL), and drives the common client IO path
 * (ll_file_io_generic) with CIT_READ at iocb->ki_pos.
 */
1277 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1278                                 unsigned long nr_segs, loff_t pos)
1281 	struct vvp_io_args *args;
1287 	result = ll_file_get_iov_count(iov, &nr_segs, &count);
1291 	env = cl_env_get(&refcheck);
1293 		RETURN(PTR_ERR(env));
1295 	args = vvp_env_args(env, IO_NORMAL);
1296 	args->u.normal.via_iov = (struct iovec *)iov;
1297 	args->u.normal.via_nrsegs = nr_segs;
1298 	args->u.normal.via_iocb = iocb;
1300 	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1301 				    &iocb->ki_pos, count);
1302 	cl_env_put(env, &refcheck);
/*
 * Plain read(2) entry: wraps the user buffer in a single-segment iovec
 * and a synchronous kiocb (both taken from per-env scratch storage),
 * then delegates to ll_file_aio_read() and propagates the updated
 * file position back to *ppos.
 */
1306 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1310 	struct iovec *local_iov;
1311 	struct kiocb *kiocb;
1316 	env = cl_env_get(&refcheck);
1318 		RETURN(PTR_ERR(env));
1320 	local_iov = &vvp_env_info(env)->vti_local_iov;
1321 	kiocb = &vvp_env_info(env)->vti_kiocb;
1322 	local_iov->iov_base = (void __user *)buf;
1323 	local_iov->iov_len = count;
1324 	init_sync_kiocb(kiocb, file);
1325 	kiocb->ki_pos = *ppos;
/* kernel-version compat: field holding the remaining byte count was renamed */
1326 #ifdef HAVE_KIOCB_KI_LEFT
1327 	kiocb->ki_left = count;
1329 	kiocb->ki_nbytes = count;
1332 	result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1333 	*ppos = kiocb->ki_pos;
1335 	cl_env_put(env, &refcheck);
/*
 * Vector write entry point (aio_write) — mirror of ll_file_aio_read():
 * validate the iovec, fill vvp_io_args (IO_NORMAL) and run the common
 * IO path with CIT_WRITE at iocb->ki_pos.
 */
1340  * Write to a file (through the page cache).
1343 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1344                                  unsigned long nr_segs, loff_t pos)
1347 	struct vvp_io_args *args;
1353 	result = ll_file_get_iov_count(iov, &nr_segs, &count);
1357 	env = cl_env_get(&refcheck);
1359 		RETURN(PTR_ERR(env));
1361 	args = vvp_env_args(env, IO_NORMAL);
1362 	args->u.normal.via_iov = (struct iovec *)iov;
1363 	args->u.normal.via_nrsegs = nr_segs;
1364 	args->u.normal.via_iocb = iocb;
1366 	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1367 				    &iocb->ki_pos, count);
1368 	cl_env_put(env, &refcheck);
/*
 * Plain write(2) entry: single-segment iovec + sync kiocb built from
 * per-env scratch storage, then delegates to ll_file_aio_write() and
 * writes the resulting position back to *ppos.  Mirror of
 * ll_file_read().
 */
1372 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1373                              size_t count, loff_t *ppos)
1376 	struct iovec *local_iov;
1377 	struct kiocb *kiocb;
1382 	env = cl_env_get(&refcheck);
1384 		RETURN(PTR_ERR(env));
1386 	local_iov = &vvp_env_info(env)->vti_local_iov;
1387 	kiocb = &vvp_env_info(env)->vti_kiocb;
1388 	local_iov->iov_base = (void __user *)buf;
1389 	local_iov->iov_len = count;
1390 	init_sync_kiocb(kiocb, file);
1391 	kiocb->ki_pos = *ppos;
/* kernel-version compat: field holding the remaining byte count was renamed */
1392 #ifdef HAVE_KIOCB_KI_LEFT
1393 	kiocb->ki_left = count;
1395 	kiocb->ki_nbytes = count;
1398 	result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1399 	*ppos = kiocb->ki_pos;
1401 	cl_env_put(env, &refcheck);
/*
 * splice_read entry: route page-cache content into a pipe.  Uses the
 * IO_SPLICE variant of vvp_io_args (pipe + splice flags) and runs the
 * common IO path with CIT_READ.
 */
1406  * Send file content (through pagecache) somewhere with helper
1408 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1409                                    struct pipe_inode_info *pipe, size_t count,
1413 	struct vvp_io_args *args;
1418 	env = cl_env_get(&refcheck);
1420 		RETURN(PTR_ERR(env));
1422 	args = vvp_env_args(env, IO_SPLICE);
1423 	args->u.splice.via_pipe = pipe;
1424 	args->u.splice.via_flags = flags;
1426 	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1427 	cl_env_put(env, &refcheck);
/*
 * Set striping (LOV EA) on a file via an MDS open intent.
 * Fails with -EEXIST if the inode already has a layout; otherwise
 * performs the intent open under the inode size lock, releases the
 * resulting open handle, and clears the delayed-layout-create flag
 * on the way out.  NOTE(review): error paths are partially elided
 * here — verify cleanup ordering against the full source.
 */
1431 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1432                              __u64 flags, struct lov_user_md *lum,
1435 	struct lov_stripe_md *lsm = NULL;
1436 	struct lookup_intent oit = {
1438 		.it_flags = flags | MDS_OPEN_BY_FID,
1443 	lsm = ccc_inode_lsm_get(inode);
1445 		ccc_inode_lsm_put(inode, lsm);
1446 		CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1447 		       PFID(ll_inode2fid(inode)));
1448 		GOTO(out, rc = -EEXIST);
1451 	ll_inode_size_lock(inode);
1452 	rc = ll_intent_file_open(file, lum, lum_size, &oit);
1454 		GOTO(out_unlock, rc);
1456 	rc = oit.d.lustre.it_status;
1458 		GOTO(out_unlock, rc);
1460 	ll_release_openhandle(file->f_dentry, &oit);
1463 	ll_inode_size_unlock(inode);
1464 	ll_intent_release(&oit);
1465 	ccc_inode_lsm_put(inode, lsm);
1467 	cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV metadata (striping EA) for @filename from the MDS via
 * md_getattr_name().  Validates the magic (LOV_MAGIC_V1/V3 only),
 * then — on big-endian hosts — swabs the structure to host order
 * before it is handed to userspace.  On success *lmmp/*lmm_size are
 * set and *request holds the reply buffer the lmm points into (caller
 * must keep/free it).
 */
1472 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1473                              struct lov_mds_md **lmmp, int *lmm_size,
1474                              struct ptlrpc_request **request)
1476 	struct ll_sb_info *sbi = ll_i2sbi(inode);
1477 	struct mdt_body  *body;
1478 	struct lov_mds_md *lmm = NULL;
1479 	struct ptlrpc_request *req = NULL;
1480 	struct md_op_data *op_data;
1483 	rc = ll_get_default_mdsize(sbi, &lmmsize);
1487 	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1488 				     strlen(filename), lmmsize,
1489 				     LUSTRE_OPC_ANY, NULL);
1490 	if (IS_ERR(op_data))
1491 		RETURN(PTR_ERR(op_data));
1493 	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1494 	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1495 	ll_finish_md_op_data(op_data);
1497 		CDEBUG(D_INFO, "md_getattr_name failed "
1498 		       "on %s: rc %d\n", filename, rc);
1502 	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1503 	LASSERT(body != NULL); /* checked by mdc_getattr_name */
1505 	lmmsize = body->mbo_eadatasize;
1507 	if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1509 		GOTO(out, rc = -ENODATA);
1512 	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1513 	LASSERT(lmm != NULL);
1515 	if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1516 	    (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1517 		GOTO(out, rc = -EPROTO);
1521 	 * This is coming from the MDS, so is probably in
1522 	 * little endian.  We convert it to host endian before
1523 	 * passing it to userspace.
/* the condition below is true only on big-endian hosts (LE constant differs) */
1525 	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1528 		stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1529 		if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1532 		/* if function called for directory - we should
1533 		 * avoid swab not existent lsm objects */
1534 		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1535 			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1536 			if (S_ISREG(body->mbo_mode))
1537 				lustre_swab_lov_user_md_objects(
1538 				    ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1540 		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1541 			lustre_swab_lov_user_md_v3(
1542 				(struct lov_user_md_v3 *)lmm);
1543 			if (S_ISREG(body->mbo_mode))
1544 				lustre_swab_lov_user_md_objects(
1545 				    ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1552 	*lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: privileged (CFS_CAP_SYS_ADMIN required)
 * path to set a raw striping EA.  Copies a fixed-size lov_user_md
 * (plus one ost_data entry) from userspace and forwards it to
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 */
1557 static int ll_lov_setea(struct inode *inode, struct file *file,
1560 	__u64			 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1561 	struct lov_user_md	*lump;
1562 	int			 lum_size = sizeof(struct lov_user_md) +
1563 					    sizeof(struct lov_user_ost_data);
1567 	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1570 	OBD_ALLOC_LARGE(lump, lum_size);
1574 	if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1575 		OBD_FREE_LARGE(lump, lum_size);
1579 	rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1581 	OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping description to the userspace @lum buffer
 * by delegating to cl_object_getstripe() under a cl environment.
 */
1585 static int ll_file_getstripe(struct inode *inode,
1586 			     struct lov_user_md __user *lum)
1593 	env = cl_env_get(&refcheck);
1595 		RETURN(PTR_ERR(env));
1597 	rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1598 	cl_env_put(env, &refcheck);
1602 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1605 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1606 struct lov_user_md *klum;
1608 __u64 flags = FMODE_WRITE;
1611 rc = ll_copy_user_md(lum, &klum);
1616 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1620 put_user(0, &lum->lmm_stripe_count);
1622 ll_layout_refresh(inode, &gen);
1623 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1626 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK: acquire a group (GID-keyed) lock on the file.
 * Rejects gid 0 and no-lock mounts.  The fd's flags are checked under
 * lli_lock both before and after the (sleeping) cl_get_grouplock()
 * call — the second check handles the race where another thread took
 * the group lock while we slept.
 */
1631 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1633 	struct ll_inode_info	*lli = ll_i2info(inode);
1634 	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
1635 	struct ccc_grouplock	 grouplock;
1640 		CWARN("group id for group lock must not be 0\n");
1644 	if (ll_file_nolock(file))
1645 		RETURN(-EOPNOTSUPP);
1647 	spin_lock(&lli->lli_lock);
1648 	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1649 		CWARN("group lock already existed with gid %lu\n",
1650 		      fd->fd_grouplock.cg_gid);
1651 		spin_unlock(&lli->lli_lock);
1654 	LASSERT(fd->fd_grouplock.cg_lock == NULL);
1655 	spin_unlock(&lli->lli_lock);
/* may block; lli_lock must not be held across this call */
1657 	rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1658 			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
1662 	spin_lock(&lli->lli_lock);
1663 	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1664 		spin_unlock(&lli->lli_lock);
1665 		CERROR("another thread just won the race\n");
1666 		cl_put_grouplock(&grouplock);
1670 	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1671 	fd->fd_grouplock = grouplock;
1672 	spin_unlock(&lli->lli_lock);
1674 	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock held on this fd.
 * Verifies under lli_lock that a group lock is held and that @arg
 * matches its gid; the fd state is cleared while still locked and
 * the actual cl_put_grouplock() happens after dropping the spinlock.
 */
1678 static int ll_put_grouplock(struct inode *inode, struct file *file,
1681 	struct ll_inode_info   *lli = ll_i2info(inode);
1682 	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
1683 	struct ccc_grouplock	 grouplock;
1686 	spin_lock(&lli->lli_lock);
1687 	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1688 		spin_unlock(&lli->lli_lock);
1689 		CWARN("no group lock held\n");
1692 	LASSERT(fd->fd_grouplock.cg_lock != NULL);
1694 	if (fd->fd_grouplock.cg_gid != arg) {
1695 		CWARN("group lock %lu doesn't match current id %lu\n",
1696 		      arg, fd->fd_grouplock.cg_gid);
1697 		spin_unlock(&lli->lli_lock);
/* detach state under the lock, drop the lock, then release the dlm lock */
1701 	grouplock = fd->fd_grouplock;
1702 	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1703 	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1704 	spin_unlock(&lli->lli_lock);
1706 	cl_put_grouplock(&grouplock);
1707 	CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * Close the MDS open handle carried in an open intent: no-op for the
 * filesystem root or when the intent holds no DISP_OPEN_OPEN
 * disposition; otherwise fill an obd_client_handle from the intent
 * and close it, dropping the DISP_ENQ_OPEN_REF request reference that
 * stood in for ll_file_open().
 */
1712  * Close inode open handle
1714  * \param dentry [in]     dentry which contains the inode
1715  * \param it     [in,out] intent which contains open info and result
1718  * \retval <0    failure
1720 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1722 	struct inode *inode = dentry->d_inode;
1723 	struct obd_client_handle *och;
1729 	/* Root ? Do nothing. */
1730 	if (dentry->d_inode->i_sb->s_root == dentry)
1733 	/* No open handle to close? Move away */
1734 	if (!it_disposition(it, DISP_OPEN_OPEN))
1737 	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1739 	OBD_ALLOC(och, sizeof(*och));
1741 		GOTO(out, rc = -ENOMEM);
1743 	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1745 	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1748 	/* this one is in place of ll_file_open */
1749 	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1750 		ptlrpc_req_finished(it->d.lustre.it_data);
1751 		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Core FIEMAP worker: validate flags, optionally flush dirty pages
 * (FIEMAP_FLAG_SYNC), then build an ll_fiemap_info_key from the
 * stripe md and obdo and ask the data export for the extent mapping
 * via obd_get_info(KEY_FIEMAP).  Striped (>1) files require the
 * caller to understand FIEMAP_FLAG_DEVICE_ORDER; a zero-size file
 * short-circuits with zero mapped extents.
 */
1757  * Get size for inode for which FIEMAP mapping is requested.
1758  * Make the FIEMAP get_info call and returns the result.
1760 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1763 	struct obd_export *exp = ll_i2dtexp(inode);
1764 	struct lov_stripe_md *lsm = NULL;
1765 	struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1766 	__u32 vallen = num_bytes;
1770 	/* Checks for fiemap flags */
1771 	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do support, then fail (inferred; tail elided) */
1772 		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1776 	/* Check for FIEMAP_FLAG_SYNC */
1777 	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1778 		rc = filemap_fdatawrite(inode->i_mapping);
1783 	lsm = ccc_inode_lsm_get(inode);
1787 	/* If the stripe_count > 1 and the application does not understand
1788 	 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1790 	if (lsm->lsm_stripe_count > 1 &&
1791 	    !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1792 		GOTO(out, rc = -EOPNOTSUPP);
1794 	fm_key.oa.o_oi = lsm->lsm_oi;
1795 	fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1797 	if (i_size_read(inode) == 0) {
1798 		rc = ll_glimpse_size(inode);
1803 	obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1804 	obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1805 	/* If filesize is 0, then there would be no objects for mapping */
1806 	if (fm_key.oa.o_size == 0) {
1807 		fiemap->fm_mapped_extents = 0;
1811 	memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1813 	rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1816 		CERROR("obd_get_info failed: rc = %d\n", rc);
1819 	ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Permission: CAP_DAC_READ_SEARCH, or the mount must allow user
 * fid2path (LL_SBI_USER_FID2PATH).  Reads the caller's requested
 * path buffer length (bounded by PATH_MAX), allocates a matching
 * getinfo_fid2path and round-trips it through obd_iocontrol().
 */
1823 int ll_fid2path(struct inode *inode, void __user *arg)
1825 	struct obd_export	*exp = ll_i2mdexp(inode);
1826 	const struct getinfo_fid2path __user *gfin = arg;
1828 	struct getinfo_fid2path	*gfout;
1834 	if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1835 	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1838 	/* Only need to get the buflen */
1839 	if (get_user(pathlen, &gfin->gf_pathlen))
1842 	if (pathlen > PATH_MAX)
1845 	outsize = sizeof(*gfout) + pathlen;
1846 	OBD_ALLOC(gfout, outsize);
1850 	if (copy_from_user(gfout, arg, sizeof(*gfout)))
1851 		GOTO(gf_free, rc = -EFAULT);
1853 	/* Call mdc_iocontrol */
1854 	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1858 	if (copy_to_user(arg, gfout, outsize))
1862 	OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size the kernel fiemap buffer from the
 * user-supplied fm_extent_count (with overflow check against
 * SIZE_MAX), copy the request (and, when extent_count != 0, the
 * first extent used as a continuation cursor) from userspace, run
 * ll_do_fiemap(), and copy the header plus mapped extents back.
 */
1866 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1868 	struct ll_user_fiemap *fiemap_s;
1869 	size_t num_bytes, ret_bytes;
1870 	unsigned int extent_count;
1873 	/* Get the extent count so we can calculate the size of
1874 	 * required fiemap buffer */
1875 	if (get_user(extent_count,
1876 		     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1880 	    (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1882 	num_bytes = sizeof(*fiemap_s) + (extent_count *
1883 					 sizeof(struct ll_fiemap_extent));
1885 	OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1886 	if (fiemap_s == NULL)
1889 	/* get the fiemap value */
1890 	if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1892 		GOTO(error, rc = -EFAULT);
1894 	/* If fm_extent_count is non-zero, read the first extent since
1895 	 * it is used to calculate end_offset and device from previous
1898 	if (copy_from_user(&fiemap_s->fm_extents[0],
1899 			   (char __user *)arg + sizeof(*fiemap_s),
1900 			   sizeof(struct ll_fiemap_extent)))
1901 		GOTO(error, rc = -EFAULT);
1904 	rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1908 	ret_bytes = sizeof(struct ll_user_fiemap);
1910 	if (extent_count != 0)
1911 		ret_bytes += (fiemap_s->fm_mapped_extents *
1912 			      sizeof(struct ll_fiemap_extent));
1914 	if (copy_to_user((void __user *)arg, fiemap_s, ret_bytes))
1918 	OBD_FREE_LARGE(fiemap_s, num_bytes);
/*
 * Fetch the OST-computed data version for the inode: a file with no
 * stripe objects reports version 0; otherwise ll_lsm_getattr() is
 * asked (with optional read/write flush flags) and the version is
 * taken from obdo->o_data_version when OBD_MD_FLDATAVERSION is valid.
 */
1923  * Read the data_version for inode.
1925  * This value is computed using stripe object version on OST.
1926  * Version is computed using server side locking.
1928  * @param sync if do sync on the OST side;
1930  *		LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1931  *		LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1933 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1935 	struct lov_stripe_md	*lsm = NULL;
1936 	struct ll_sb_info	*sbi = ll_i2sbi(inode);
1937 	struct obdo		*obdo = NULL;
1941 	/* If no stripe, we consider version is 0. */
1942 	lsm = ccc_inode_lsm_get(inode);
1943 	if (!lsm_has_objects(lsm)) {
1945 		CDEBUG(D_INODE, "No object for inode\n");
1949 	OBD_ALLOC_PTR(obdo);
1951 		GOTO(out, rc = -ENOMEM);
1953 	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
1955 		if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1958 			*data_version = obdo->o_data_version;
1964 	ccc_inode_lsm_put(inode, lsm);
/*
 * HSM release: take a write lease with MDS_OPEN_RELEASE, flush and
 * grab the latest data_version (LL_DV_WR_FLUSH), merge attributes
 * into the inode, then close the open handle with the release intent.
 * The lease lock handle is intentionally NOT released here — it is
 * packed into the MDT close request (see comment at line 2001).
 */
1969  * Trigger a HSM release request for the provided inode.
1971 int ll_hsm_release(struct inode *inode)
1973 	struct cl_env_nest nest;
1975 	struct obd_client_handle *och = NULL;
1976 	__u64 data_version = 0;
1980 	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1981 	       ll_get_fsname(inode->i_sb, NULL, 0),
1982 	       PFID(&ll_i2info(inode)->lli_fid));
1984 	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1986 		GOTO(out, rc = PTR_ERR(och));
1988 	/* Grab latest data_version and [am]time values */
1989 	rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1993 	env = cl_env_nested_get(&nest);
1995 		GOTO(out, rc = PTR_ERR(env));
1997 	ll_merge_attr(env, inode);
1998 	cl_env_nested_put(&nest, env);
2000 	/* Release the file.
2001 	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2002 	 * we still need it to pack l_remote_handle to MDT. */
2003 	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2009 	if (och != NULL && !IS_ERR(och)) /* close the file */
2010 		ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped,
 * saved [am]time attrs to restore afterwards, and the per-file
 * data-version values/checks (kept locally because the pair may be
 * reordered by FID before the swap).
 */
2015 struct ll_swap_stack {
2016 	struct iattr		 ia1, ia2;
2018 	struct inode		*inode1, *inode2;
2019 	bool			 check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS worker: atomically exchange the layouts of
 * two regular files on the same filesystem.  Steps: permission and
 * same-sb checks; canonical ordering of the pair by FID (to avoid
 * lock-order deadlock); optional group locks to flush dirty cache;
 * optional data-version guard (-EAGAIN when the file changed); the
 * actual swap through obd_iocontrol() on the MDC; then restoration of
 * saved mtime/atime when requested.  NOTE(review): many error-path
 * lines are elided in this view — confirm cleanup against full source.
 */
2022 static int ll_swap_layouts(struct file *file1, struct file *file2,
2023 			   struct lustre_swap_layouts *lsl)
2025 	struct mdc_swap_layouts	 msl;
2026 	struct md_op_data	*op_data;
2029 	struct ll_swap_stack	*llss = NULL;
2032 	OBD_ALLOC_PTR(llss);
2036 	llss->inode1 = file1->f_dentry->d_inode;
2037 	llss->inode2 = file2->f_dentry->d_inode;
2039 	if (!S_ISREG(llss->inode2->i_mode))
2040 		GOTO(free, rc = -EINVAL);
2042 	if (inode_permission(llss->inode1, MAY_WRITE) ||
2043 	    inode_permission(llss->inode2, MAY_WRITE))
2044 		GOTO(free, rc = -EPERM);
2046 	if (llss->inode2->i_sb != llss->inode1->i_sb)
2047 		GOTO(free, rc = -EXDEV);
2049 	/* we use 2 bool because it is easier to swap than 2 bits */
2050 	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2051 		llss->check_dv1 = true;
2053 	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2054 		llss->check_dv2 = true;
2056 	/* we cannot use lsl->sl_dvX directly because we may swap them */
2057 	llss->dv1 = lsl->sl_dv1;
2058 	llss->dv2 = lsl->sl_dv2;
2060 	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2061 	if (rc == 0) /* same file, done! */
/* order the pair by FID so locks are always taken in the same order */
2064 	if (rc < 0) { /* sequentialize it */
2065 		swap(llss->inode1, llss->inode2);
2067 		swap(llss->dv1, llss->dv2);
2068 		swap(llss->check_dv1, llss->check_dv2);
2072 	if (gid != 0) { /* application asks to flush dirty cache */
2073 		rc = ll_get_grouplock(llss->inode1, file1, gid);
2077 		rc = ll_get_grouplock(llss->inode2, file2, gid);
2079 			ll_put_grouplock(llss->inode1, file1, gid);
2084 	/* to be able to restore mtime and atime after swap
2085 	 * we need to first save them */
2087 	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2088 		llss->ia1.ia_mtime = llss->inode1->i_mtime;
2089 		llss->ia1.ia_atime = llss->inode1->i_atime;
2090 		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2091 		llss->ia2.ia_mtime = llss->inode2->i_mtime;
2092 		llss->ia2.ia_atime = llss->inode2->i_atime;
2093 		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2096 	/* ultimate check, before swaping the layouts we check if
2097 	 * dataversion has changed (if requested) */
2098 	if (llss->check_dv1) {
2099 		rc = ll_data_version(llss->inode1, &dv, 0);
2102 		if (dv != llss->dv1)
2103 			GOTO(putgl, rc = -EAGAIN);
2106 	if (llss->check_dv2) {
2107 		rc = ll_data_version(llss->inode2, &dv, 0);
2110 		if (dv != llss->dv2)
2111 			GOTO(putgl, rc = -EAGAIN);
2114 	/* struct md_op_data is used to send the swap args to the mdt
2115 	 * only flags is missing, so we use struct mdc_swap_layouts
2116 	 * through the md_op_data->op_data */
2117 	/* flags from user space have to be converted before they are send to
2118 	 * server, no flag is sent today, they are only used on the client */
2121 	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2122 				     0, LUSTRE_OPC_ANY, &msl);
2123 	if (IS_ERR(op_data))
2124 		GOTO(free, rc = PTR_ERR(op_data));
2126 	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2127 			   sizeof(*op_data), op_data, NULL);
2128 	ll_finish_md_op_data(op_data);
2132 		ll_put_grouplock(llss->inode2, file2, gid);
2133 		ll_put_grouplock(llss->inode1, file1, gid);
2136 	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2140 	/* clear useless flags */
2141 	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2142 		llss->ia1.ia_valid &= ~ATTR_MTIME;
2143 		llss->ia2.ia_valid &= ~ATTR_MTIME;
2146 	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2147 		llss->ia1.ia_valid &= ~ATTR_ATIME;
2148 		llss->ia2.ia_valid &= ~ATTR_ATIME;
2151 	/* update time if requested */
/* note: ia2 is applied to inode1 and vice versa — times follow the layouts */
2153 	if (llss->ia2.ia_valid != 0) {
2154 		mutex_lock(&llss->inode1->i_mutex);
2155 		rc = ll_setattr(file1->f_dentry, &llss->ia2);
2156 		mutex_unlock(&llss->inode1->i_mutex);
2159 	if (llss->ia1.ia_valid != 0) {
2162 		mutex_lock(&llss->inode2->i_mutex);
2163 		rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2164 		mutex_unlock(&llss->inode2->i_mutex);
/*
 * Apply an HSM state mask change on the MDT.  Non-root callers may
 * only touch bits within HSM_USER_MASK; the request is carried in
 * md_op_data->op_data and sent via obd_iocontrol() to the MDC.
 */
2176 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2178 	struct md_op_data	*op_data;
2181 	/* Non-root users are forbidden to set or clear flags which are
2182 	 * NOT defined in HSM_USER_MASK. */
2183 	if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2184 	    !cfs_capable(CFS_CAP_SYS_ADMIN))
2187 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2188 				     LUSTRE_OPC_ANY, hss);
2189 	if (IS_ERR(op_data))
2190 		RETURN(PTR_ERR(op_data));
2192 	rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2193 			   sizeof(*op_data), op_data, NULL);
2195 	ll_finish_md_op_data(op_data);
/*
 * HSM import: register an already-archived file.  Marks the file
 * HS_ARCHIVED|HS_EXISTS|HS_RELEASED with the given archive id via
 * ll_hsm_state_set(), then restores the saved mode/uid/gid/size and
 * timestamps from the hsm_user_import record with ll_setattr_raw()
 * under the inode mutex.  Regular files only.
 */
2200 static int ll_hsm_import(struct inode *inode, struct file *file,
2201 			 struct hsm_user_import *hui)
2203 	struct hsm_state_set	*hss = NULL;
2204 	struct iattr		*attr = NULL;
2208 	if (!S_ISREG(inode->i_mode))
2214 		GOTO(out, rc = -ENOMEM);
2216 	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2217 	hss->hss_archive_id = hui->hui_archive_id;
2218 	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2219 	rc = ll_hsm_state_set(inode, hss);
2223 	OBD_ALLOC_PTR(attr);
2225 		GOTO(out, rc = -ENOMEM);
2227 	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2228 	attr->ia_mode |= S_IFREG;
2229 	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2230 	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2231 	attr->ia_size = hui->hui_size;
2232 	attr->ia_mtime.tv_sec = hui->hui_mtime;
2233 	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2234 	attr->ia_atime.tv_sec = hui->hui_atime;
2235 	attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2237 	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2238 			 ATTR_UID | ATTR_GID |
2239 			 ATTR_MTIME | ATTR_MTIME_SET |
2240 			 ATTR_ATIME | ATTR_ATIME_SET;
2242 	mutex_lock(&inode->i_mutex);
2244 	rc = ll_setattr_raw(file->f_dentry, attr, true);
2248 	mutex_unlock(&inode->i_mutex);
/* Map FMODE_READ/FMODE_WRITE bits to the LL_LEASE_RDLCK/WRLCK flags
 * returned to userspace by the lease ioctls. */
2260 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2262 	return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2263 	       ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * unlocked_ioctl dispatcher for regular files.  Handles client-side
 * flag get/set, striping (setstripe/setea/getstripe/swap-layouts),
 * fiemap, group locks, FID/path translation, data-version, HSM
 * state/action/import, and lease get/set ioctls; anything unrecognized
 * falls through to the dynamic llioc handlers and finally to
 * obd_iocontrol() on the data export.  NOTE(review): large parts of
 * this switch are elided in this view — per-case control flow is only
 * partially visible.
 */
2267 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2269 	struct inode		*inode = file->f_dentry->d_inode;
2270 	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
2274 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2275 	       PFID(ll_inode2fid(inode)), inode, cmd);
2276 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2278 	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2279 	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2283 	case LL_IOC_GETFLAGS:
2284 		/* Get the current value of the file flags */
2285 		return put_user(fd->fd_flags, (int __user *)arg);
2286 	case LL_IOC_SETFLAGS:
2287 	case LL_IOC_CLRFLAGS:
2288 		/* Set or clear specific file flags */
2289 		/* XXX This probably needs checks to ensure the flags are
2290 		 *     not abused, and to handle any flag side effects.
2292 		if (get_user(flags, (int __user *) arg))
2295 		if (cmd == LL_IOC_SETFLAGS) {
2296 			if ((flags & LL_FILE_IGNORE_LOCK) &&
2297 			    !(file->f_flags & O_DIRECT)) {
2298 				CERROR("%s: unable to disable locking on "
2299 				       "non-O_DIRECT file\n", current->comm);
2303 			fd->fd_flags |= flags;
2305 			fd->fd_flags &= ~flags;
2308 	case LL_IOC_LOV_SETSTRIPE:
2309 		RETURN(ll_lov_setstripe(inode, file, arg));
2310 	case LL_IOC_LOV_SETEA:
2311 		RETURN(ll_lov_setea(inode, file, arg));
2312 	case LL_IOC_LOV_SWAP_LAYOUTS: {
2314 		struct lustre_swap_layouts lsl;
2316 		if (copy_from_user(&lsl, (char __user *)arg,
2317 				   sizeof(struct lustre_swap_layouts)))
/* both fds must be writable: this one and the one named by sl_fd */
2320 		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2323 		file2 = fget(lsl.sl_fd);
2328 		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2329 			rc = ll_swap_layouts(file, file2, &lsl);
2333 	case LL_IOC_LOV_GETSTRIPE:
2334 		RETURN(ll_file_getstripe(inode,
2335 					 (struct lov_user_md __user *)arg));
2336 	case FSFILT_IOC_FIEMAP:
2337 		RETURN(ll_ioctl_fiemap(inode, arg));
2338 	case FSFILT_IOC_GETFLAGS:
2339 	case FSFILT_IOC_SETFLAGS:
2340 		RETURN(ll_iocontrol(inode, file, cmd, arg));
2341 	case FSFILT_IOC_GETVERSION_OLD:
2342 	case FSFILT_IOC_GETVERSION:
2343 		RETURN(put_user(inode->i_generation, (int __user *)arg));
2344 	case LL_IOC_GROUP_LOCK:
2345 		RETURN(ll_get_grouplock(inode, file, arg));
2346 	case LL_IOC_GROUP_UNLOCK:
2347 		RETURN(ll_put_grouplock(inode, file, arg));
2348 	case IOC_OBD_STATFS:
2349 		RETURN(ll_obd_statfs(inode, (void __user *)arg));
2351 	/* We need to special case any other ioctls we want to handle,
2352 	 * to send them to the MDS/OST as appropriate and to properly
2353 	 * network encode the arg field.
2354 	case FSFILT_IOC_SETVERSION_OLD:
2355 	case FSFILT_IOC_SETVERSION:
2357 	case LL_IOC_FLUSHCTX:
2358 		RETURN(ll_flush_ctx(inode));
2359 	case LL_IOC_PATH2FID: {
2360 		if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2361 				 sizeof(struct lu_fid)))
2366 	case LL_IOC_GETPARENT:
2367 		RETURN(ll_getparent(file, (struct getparent __user *)arg));
2369 	case OBD_IOC_FID2PATH:
2370 		RETURN(ll_fid2path(inode, (void __user *)arg));
2371 	case LL_IOC_DATA_VERSION: {
2372 		struct ioc_data_version	idv;
2375 		if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2378 		idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2379 		rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2382 		    copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2388 	case LL_IOC_GET_MDTIDX: {
2391 		mdtidx = ll_get_mdt_idx(inode);
2395 		if (put_user((int)mdtidx, (int __user *)arg))
2400 	case OBD_IOC_GETDTNAME:
2401 	case OBD_IOC_GETMDNAME:
2402 		RETURN(ll_get_obd_name(inode, cmd, arg));
2403 	case LL_IOC_HSM_STATE_GET: {
2404 		struct md_op_data	*op_data;
2405 		struct hsm_user_state	*hus;
2412 		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2413 					     LUSTRE_OPC_ANY, hus);
2414 		if (IS_ERR(op_data)) {
2416 			RETURN(PTR_ERR(op_data));
2419 		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2422 		if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2425 		ll_finish_md_op_data(op_data);
2429 	case LL_IOC_HSM_STATE_SET: {
2430 		struct hsm_state_set	*hss;
2437 		if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2442 		rc = ll_hsm_state_set(inode, hss);
2447 	case LL_IOC_HSM_ACTION: {
2448 		struct md_op_data		*op_data;
2449 		struct hsm_current_action	*hca;
2456 		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2457 					     LUSTRE_OPC_ANY, hca);
2458 		if (IS_ERR(op_data)) {
2460 			RETURN(PTR_ERR(op_data));
2463 		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2466 		if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2469 		ll_finish_md_op_data(op_data);
2473 	case LL_IOC_SET_LEASE: {
2474 		struct ll_inode_info *lli = ll_i2info(inode);
2475 		struct obd_client_handle *och = NULL;
2480 		case LL_LEASE_WRLCK:
2481 			if (!(file->f_mode & FMODE_WRITE))
2483 			fmode = FMODE_WRITE;
2485 		case LL_LEASE_RDLCK:
2486 			if (!(file->f_mode & FMODE_READ))
2490 		case LL_LEASE_UNLCK:
2491 			mutex_lock(&lli->lli_och_mutex);
2492 			if (fd->fd_lease_och != NULL) {
2493 				och = fd->fd_lease_och;
2494 				fd->fd_lease_och = NULL;
2496 			mutex_unlock(&lli->lli_och_mutex);
2501 			fmode = och->och_flags;
2502 			rc = ll_lease_close(och, inode, &lease_broken);
2509 			RETURN(ll_lease_type_from_fmode(fmode));
2514 		CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2516 		/* apply for lease */
2517 		och = ll_lease_open(inode, file, fmode, 0);
2519 			RETURN(PTR_ERR(och));
2522 		mutex_lock(&lli->lli_och_mutex);
2523 		if (fd->fd_lease_och == NULL) {
2524 			fd->fd_lease_och = och;
2527 		mutex_unlock(&lli->lli_och_mutex);
2529 			/* impossible now that only excl is supported for now */
2530 			ll_lease_close(och, inode, &lease_broken);
2535 	case LL_IOC_GET_LEASE: {
2536 		struct ll_inode_info *lli = ll_i2info(inode);
2537 		struct ldlm_lock *lock = NULL;
2540 		mutex_lock(&lli->lli_och_mutex);
2541 		if (fd->fd_lease_och != NULL) {
2542 			struct obd_client_handle *och = fd->fd_lease_och;
2544 			lock = ldlm_handle2lock(&och->och_lease_handle);
2546 				lock_res_and_lock(lock);
2547 				if (!ldlm_is_cancel(lock))
2548 					fmode = och->och_flags;
2550 				unlock_res_and_lock(lock);
2551 				LDLM_LOCK_PUT(lock);
2554 		mutex_unlock(&lli->lli_och_mutex);
2556 		RETURN(ll_lease_type_from_fmode(fmode));
2558 	case LL_IOC_HSM_IMPORT: {
2559 		struct hsm_user_import *hui;
2565 		if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2570 		rc = ll_hsm_import(inode, file, hui);
/* unrecognized cmd: try dynamically registered handlers, then the OSC */
2580 		     ll_iocontrol_call(inode, file, cmd, arg, &err))
2583 		RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2584 				     (void __user *)arg));
/*
 * Compat fallback (kernels without generic_file_llseek_size): apply a
 * validated offset to file->f_pos, rejecting negative offsets (unless
 * FMODE_UNSIGNED_OFFSET) and offsets beyond @maxsize, and resetting
 * f_version when the position actually changes.
 */
2589 #ifndef HAVE_FILE_LLSEEK_SIZE
2590 static inline loff_t
2591 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2593 	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2595 	if (offset > maxsize)
2598 	if (offset != file->f_pos) {
2599 		file->f_pos = offset;
2600 		file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size(): computes the
 * target offset for SEEK_SET/CUR/END (and, per the visible comments,
 * SEEK_DATA/SEEK_HOLE against the virtual hole at EOF), with the
 * lseek(fd, 0, SEEK_CUR) fast path that avoids rewriting f_pos, and
 * i_mutex protecting the read-modify-write of f_pos for SEEK_CUR.
 */
2606 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2607 			 loff_t maxsize, loff_t eof)
2609 	struct inode *inode = file->f_dentry->d_inode;
2617 		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2618 		 * position-querying operation.  Avoid rewriting the "same"
2619 		 * f_pos value back to the file because a concurrent read(),
2620 		 * write() or lseek() might have altered it
2625 		 * f_lock protects against read/modify/write race with other
2626 		 * SEEK_CURs. Note that parallel writes and reads behave
2629 		mutex_lock(&inode->i_mutex);
2630 		offset = llseek_execute(file, file->f_pos + offset, maxsize);
2631 		mutex_unlock(&inode->i_mutex);
2635 		 * In the generic case the entire file is data, so as long as
2636 		 * offset isn't at the end of the file then the offset is data.
2643 		 * There is a virtual hole at the end of the file, so as long as
2644 		 * offset isn't i_size or larger, return i_size.
2652 	return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA the file size
 * must be current, so glimpse it from the OSTs first; then delegate
 * to the (possibly compat) generic_file_llseek_size with the
 * filesystem's max byte limit and the glimpsed EOF.
 */
2656 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2658 	struct inode *inode = file->f_dentry->d_inode;
2659 	loff_t retval, eof = 0;
2662 	retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2663 			   (origin == SEEK_CUR) ? file->f_pos : 0);
2664 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2665 	       PFID(ll_inode2fid(inode)), inode, retval, retval,
2667 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2669 	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2670 		retval = ll_glimpse_size(inode);
2673 		eof = i_size_read(inode);
2676 	retval = ll_generic_file_llseek_size(file, offset, origin,
2677 					  ll_file_maxbytes(inode), eof);
/*
 * flush (close(2)-time) handler: report any async writeback error
 * recorded on the inode or its cl object, but do not report a failure
 * twice — if this fd already returned a write error (fd_write_failed)
 * the flush succeeds silently.  Returns -EIO on a pending error.
 */
2681 static int ll_flush(struct file *file, fl_owner_t id)
2683 	struct inode *inode = file->f_dentry->d_inode;
2684 	struct ll_inode_info *lli = ll_i2info(inode);
2685 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2688 	LASSERT(!S_ISDIR(inode->i_mode));
2690 	/* catch async errors that were recorded back when async writeback
2691 	 * failed for pages in this mapping. */
2692 	rc = lli->lli_async_rc;
2693 	lli->lli_async_rc = 0;
2694 	if (lli->lli_clob != NULL) {
2695 		err = lov_read_and_clear_async_rc(lli->lli_clob);
2700 	/* The application has been told write failure already.
2701 	 * Do not report failure again. */
2702 	if (fd->fd_write_failed)
2704 	return rc ? -EIO : 0;
/*
 * Run a CIT_FSYNC cl_io over [start, end] of the inode.  @mode selects
 * the flush semantics (NONE/LOCAL/DISCARD/ALL; anything else is
 * rejected); unless mode is CL_FSYNC_LOCAL, OST_SYNC RPCs are issued.
 * On success returns the number of pages written (fi_nr_written).
 */
2708  * Called to make sure a portion of file has been written out.
2709  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2711  * Return how many pages have been written.
2713 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2714 		       enum cl_fsync_mode mode, int ignore_layout)
2716 	struct cl_env_nest nest;
2719 	struct obd_capa *capa = NULL;
2720 	struct cl_fsync_io *fio;
2724 	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2725 	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2728 	env = cl_env_nested_get(&nest);
2730 		RETURN(PTR_ERR(env));
2732 	capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2734 	io = ccc_env_thread_io(env);
2735 	io->ci_obj = ll_i2info(inode)->lli_clob;
2736 	io->ci_ignore_layout = ignore_layout;
2738 	/* initialize parameters for sync */
2739 	fio = &io->u.ci_fsync;
2740 	fio->fi_capa = capa;
2741 	fio->fi_start = start;
2743 	fio->fi_fid = ll_inode2fid(inode);
2744 	fio->fi_mode = mode;
2745 	fio->fi_nr_written = 0;
2747 	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2748 		result = cl_io_loop(env, io);
2750 		result = io->ci_result;
2752 		result = fio->fi_nr_written;
2753 	cl_io_fini(env, io);
2754 	cl_env_nested_put(&nest, env);
/*
 * fsync(2) handler; the signature varies with the kernel API
 * (4-arg/2-arg/dentry variants selected by HAVE_FILE_FSYNC_* macros).
2762 * When dentry is provided (the 'else' case), *file->f_dentry may be
2763 * null and dentry must be used directly rather than pulled from
2764 * *file->f_dentry as is done otherwise.
 *
 * Flow: flush/wait dirty pages, collect recorded async writeback errors,
 * md_fsync() to the MDT, then cl_sync_file_range(CL_FSYNC_ALL) for regular
 * files, updating fd_write_failed to reflect the outcome.
 * NOTE(review): extraction gaps — rc/err declarations, capa_put and the
 * final RETURN are elided; code kept byte-identical.
 */
2767 #ifdef HAVE_FILE_FSYNC_4ARGS
2768 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2770 struct dentry *dentry = file->f_dentry;
2771 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2772 int ll_fsync(struct file *file, int datasync)
2774 struct dentry *dentry = file->f_dentry;
2776 loff_t end = LLONG_MAX;
2778 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2781 loff_t end = LLONG_MAX;
2783 struct inode *inode = dentry->d_inode;
2784 struct ll_inode_info *lli = ll_i2info(inode);
2785 struct ptlrpc_request *req;
2786 struct obd_capa *oc;
2790 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2791 PFID(ll_inode2fid(inode)), inode);
2792 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2794 #ifdef HAVE_FILE_FSYNC_4ARGS
2795 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2796 mutex_lock(&inode->i_mutex);
2798 /* fsync's caller has already called _fdata{sync,write}, we want
2799 * that IO to finish before calling the osc and mdc sync methods */
2800 rc = filemap_fdatawait(inode->i_mapping);
2803 /* catch async errors that were recorded back when async writeback
2804 * failed for pages in this mapping. */
2805 if (!S_ISDIR(inode->i_mode)) {
2806 err = lli->lli_async_rc;
2807 lli->lli_async_rc = 0;
2810 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDT */
2815 oc = ll_mdscapa_get(inode);
2816 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2822 ptlrpc_req_finished(req);
2824 if (S_ISREG(inode->i_mode)) {
2825 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2827 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2828 if (rc == 0 && err < 0)
2831 fd->fd_write_failed = true;
2833 fd->fd_write_failed = false;
2836 #ifdef HAVE_FILE_FSYNC_4ARGS
2837 mutex_unlock(&inode->i_mutex);
/*
 * flock(2)/fcntl(2) advisory locking handler.
 * Translates a kernel struct file_lock into an LDLM_FLOCK enqueue on the
 * MDT, then mirrors the result into the local VFS lock lists
 * (flock_lock_file_wait / posix_lock_file_wait) so local and cluster
 * state stay consistent; on local failure the remote lock is rolled back
 * with an LCK_NL (unlock) enqueue.
 * NOTE(review): extraction gaps — return type line, flock.start init,
 * the fl_type/cmd switch bodies and RETURN are elided; code byte-identical.
 */
2843 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2845 struct inode *inode = file->f_dentry->d_inode;
2846 struct ll_sb_info *sbi = ll_i2sbi(inode);
2847 struct ldlm_enqueue_info einfo = {
2848 .ei_type = LDLM_FLOCK,
2849 .ei_cb_cp = ldlm_flock_completion_ast,
2850 .ei_cbdata = file_lock,
2852 struct md_op_data *op_data;
2853 struct lustre_handle lockh = {0};
2854 ldlm_policy_data_t flock = {{0}};
2855 int fl_type = file_lock->fl_type;
2861 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2862 PFID(ll_inode2fid(inode)), file_lock);
2864 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2866 if (file_lock->fl_flags & FL_FLOCK) {
2867 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2868 /* flocks are whole-file locks */
2869 flock.l_flock.end = OFFSET_MAX;
2870 /* For flocks owner is determined by the local file descriptor */
2871 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2872 } else if (file_lock->fl_flags & FL_POSIX) {
2873 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2874 flock.l_flock.start = file_lock->fl_start;
2875 flock.l_flock.end = file_lock->fl_end;
2879 flock.l_flock.pid = file_lock->fl_pid;
2881 /* Somewhat ugly workaround for svc lockd.
2882 * lockd installs custom fl_lmops->lm_compare_owner that checks
2883 * for the fl_owner to be the same (which it always is on local node
2884 * I guess between lockd processes) and then compares pid.
2885 * As such we assign pid to the owner field to make it all work,
2886 * conflict with normal locks is unlikely since pid space and
2887 * pointer space for current->files are not intersecting */
2888 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2889 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fl_type to an LDLM mode: read -> PR, unlock -> NL, write -> PW */
2893 einfo.ei_mode = LCK_PR;
2896 /* An unlock request may or may not have any relation to
2897 * existing locks so we may not be able to pass a lock handle
2898 * via a normal ldlm_lock_cancel() request. The request may even
2899 * unlock a byte range in the middle of an existing lock. In
2900 * order to process an unlock request we need all of the same
2901 * information that is given with a normal read or write record
2902 * lock request. To avoid creating another ldlm unlock (cancel)
2903 * message we'll treat a LCK_NL flock request as an unlock. */
2904 einfo.ei_mode = LCK_NL;
2907 einfo.ei_mode = LCK_PW;
2910 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
2925 flags = LDLM_FL_BLOCK_NOWAIT;
2931 flags = LDLM_FL_TEST_LOCK;
2934 CERROR("unknown fcntl lock command: %d\n", cmd);
2938 /* Save the old mode so that if the mode in the lock changes we
2939 * can decrement the appropriate reader or writer refcount. */
2940 file_lock->fl_type = einfo.ei_mode;
2942 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2943 LUSTRE_OPC_ANY, NULL);
2944 if (IS_ERR(op_data))
2945 RETURN(PTR_ERR(op_data));
2947 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2948 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2949 flock.l_flock.pid, flags, einfo.ei_mode,
2950 flock.l_flock.start, flock.l_flock.end);
2952 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2955 /* Restore the file lock type if not TEST lock. */
2956 if (!(flags & LDLM_FL_TEST_LOCK))
2957 file_lock->fl_type = fl_type;
/* mirror a granted/released cluster lock into the local VFS lock state */
2959 if ((file_lock->fl_flags & FL_FLOCK) &&
2960 (rc == 0 || file_lock->fl_type == F_UNLCK))
2961 rc2 = flock_lock_file_wait(file, file_lock);
2962 if ((file_lock->fl_flags & FL_POSIX) &&
2963 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2964 !(flags & LDLM_FL_TEST_LOCK))
2965 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed: undo the server-side lock with an unlock */
2967 if (rc2 && file_lock->fl_type != F_UNLCK) {
2968 einfo.ei_mode = LCK_NL;
2969 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2974 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under directory @parent via an MDT
 * getattr-by-name RPC; on success *fid is filled from the reply body.
 * NOTE(review): extraction gaps — rc declaration, error checks and
 * RETURN are elided; code kept byte-identical.
 */
2979 int ll_get_fid_by_name(struct inode *parent, const char *name,
2980 int namelen, struct lu_fid *fid)
2982 struct md_op_data *op_data = NULL;
2983 struct mdt_body *body;
2984 struct ptlrpc_request *req;
2988 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2989 LUSTRE_OPC_ANY, NULL);
2990 if (IS_ERR(op_data))
2991 RETURN(PTR_ERR(op_data));
/* only the FID is needed from the reply */
2993 op_data->op_valid = OBD_MD_FLID;
2994 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2995 ll_finish_md_op_data(op_data);
2999 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3001 GOTO(out_req, rc = -EFAULT);
3003 *fid = body->mbo_fid1;
3005 ptlrpc_req_finished(req);
/*
 * Migrate entry @name under @parent to MDT @mdtidx.
 * Resolves the child FID (from the dcache if possible, else by an
 * getattr-by-name RPC), validates it, skips the rename if the object is
 * already on the target MDT, then issues md_rename() with CLI_MIGRATE.
 * NOTE(review): extraction gaps — qstr setup, dput/iput cleanup and
 * RETURN are elided; code kept byte-identical.
 */
3009 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3010 const char *name, int namelen)
3012 struct dentry *dchild = NULL;
3013 struct inode *child_inode = NULL;
3014 struct md_op_data *op_data;
3015 struct ptlrpc_request *request = NULL;
3020 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3021 name, PFID(ll_inode2fid(parent)), mdtidx);
3023 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3024 0, LUSTRE_OPC_ANY, NULL);
3025 if (IS_ERR(op_data))
3026 RETURN(PTR_ERR(op_data));
3028 /* Get child FID first */
3029 qstr.hash = full_name_hash(name, namelen);
3032 dchild = d_lookup(file->f_dentry, &qstr);
3033 if (dchild != NULL && dchild->d_inode != NULL) {
3034 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3035 if (dchild->d_inode != NULL) {
/* hold the inode and drop cached aliases before the rename */
3036 child_inode = igrab(dchild->d_inode);
3037 ll_invalidate_aliases(child_inode);
/* not in dcache: resolve the FID with an RPC */
3041 rc = ll_get_fid_by_name(parent, name, namelen,
3047 if (!fid_is_sane(&op_data->op_fid3)) {
3048 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3049 ll_get_fsname(parent->i_sb, NULL, 0), name,
3050 PFID(&op_data->op_fid3));
3051 GOTO(out_free, rc = -EINVAL);
3054 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* already on the target MDT: nothing to do */
3059 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3060 PFID(&op_data->op_fid3), mdtidx);
3061 GOTO(out_free, rc = 0);
3064 op_data->op_mds = mdtidx;
3065 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is implemented as a same-name rename with CLI_MIGRATE set */
3066 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3067 namelen, name, namelen, &request);
3069 ll_update_times(request, parent);
3071 ptlrpc_req_finished(request);
3076 if (child_inode != NULL) {
3077 clear_nlink(child_inode);
3081 ll_finish_md_op_data(op_data);
/* Stub lock handler installed for -o noflock mounts; body elided by
 * extraction — presumably returns an error (see the noflock table below
 * in the original file) — TODO confirm against full source. */
3086 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3094 * test if some locks matching bits and l_req_mode are acquired
3095 * - bits can be in different locks
3096 * - if found clear the common lock bits in *bits
3097 * - the bits not found, are kept in *bits
3099 * \param bits [IN] searched lock bits
3100 * \param l_req_mode [IN] searched lock mode
3101 * \retval boolean, true iff all bits are found
/*
 * Test which of the requested inodebits are covered by locally cached MDT
 * locks, probing one bit at a time with a non-blocking LDLM_FL_TEST_LOCK
 * match; matched bits are cleared from *bits.
 * NOTE(review): extraction gaps — fid/flags/i declarations, continue,
 * lock-NULL branch and RETURN are elided; code kept byte-identical.
 */
3103 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3105 struct lustre_handle lockh;
3106 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": widen to all four read/write modes */
3107 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3108 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3117 fid = &ll_i2info(inode)->lli_fid;
3118 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3119 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the found lock */
3121 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3122 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3123 policy.l_inodebits.bits = *bits & (1 << i);
3124 if (policy.l_inodebits.bits == 0)
3127 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3128 &policy, mode, &lockh)) {
3129 struct ldlm_lock *lock;
3131 lock = ldlm_handle2lock(&lockh);
3134 ~(lock->l_policy_data.l_inodebits.bits);
3135 LDLM_LOCK_PUT(lock);
3137 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a cached MDT lock covering the
 * given inodebits on @inode; returns the matched LDLM mode (0 if none)
 * and fills *lockh on success.
 * NOTE(review): extraction gaps — fid/rc declarations and RETURN elided.
 */
3144 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3145 struct lustre_handle *lockh, __u64 flags,
3148 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3153 fid = &ll_i2info(inode)->lli_fid;
3154 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3156 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3157 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process a revalidation RPC result: -ENOENT on an already-unlinked
 * inode is converted to success (nlink handling elided by extraction);
 * other errors are logged, with EACCES/EIDRM demoted to D_INFO.
 */
3162 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3164 /* Already unlinked. Just update nlink and return success */
3165 if (rc == -ENOENT) {
3167 /* This path cannot be hit for regular files unless in
3168 * case of obscure races, so no need to validate
3170 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3172 } else if (rc != 0) {
3173 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3174 "%s: revalidate FID "DFID" error: rc = %d\n",
3175 ll_get_fsname(inode->i_sb, NULL, 0),
3176 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes against the MDT for the given
 * ibits. Two strategies:
 *  - server supports OBD_CONNECT_ATTRFID: intent getattr/lookup by FID
 *    (md_intent_lock), invalidating the dentry if the file was unlinked;
 *  - otherwise: plain md_getattr, but only when no matching MDT lock is
 *    already cached locally (ll_have_md_lock).
 * NOTE(review): extraction gaps — rc/ealen declarations, several error
 * branches and RETURNs are elided; code kept byte-identical.
 */
3182 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3184 struct inode *inode = dentry->d_inode;
3185 struct ptlrpc_request *req = NULL;
3186 struct obd_export *exp;
3190 LASSERT(inode != NULL);
3192 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3193 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3195 exp = ll_i2mdexp(inode);
3197 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3198 * But under CMD case, it caused some lock issues, should be fixed
3199 * with new CMD ibits lock. See bug 12718 */
3200 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3201 struct lookup_intent oit = { .it_op = IT_GETATTR };
3202 struct md_op_data *op_data;
3204 if (ibits == MDS_INODELOCK_LOOKUP)
3205 oit.it_op = IT_LOOKUP;
3207 /* Call getattr by fid, so do not provide name at all. */
3208 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3209 dentry->d_inode, NULL, 0, 0,
3210 LUSTRE_OPC_ANY, NULL);
3211 if (IS_ERR(op_data))
3212 RETURN(PTR_ERR(op_data));
3214 rc = md_intent_lock(exp, op_data, &oit, &req,
3215 &ll_md_blocking_ast, 0);
3216 ll_finish_md_op_data(op_data);
3218 rc = ll_inode_revalidate_fini(inode, rc);
3222 rc = ll_revalidate_it_finish(req, &oit, dentry);
3224 ll_intent_release(&oit);
3228 /* Unlinked? Unhash dentry, so it is not picked up later by
3229 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3230 here to preserve get_cwd functionality on 2.6.
3232 if (!dentry->d_inode->i_nlink)
3233 d_lustre_invalidate(dentry, 0);
3235 ll_lookup_finish_locks(&oit, dentry);
3236 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3237 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3238 u64 valid = OBD_MD_FLGETATTR;
3239 struct md_op_data *op_data;
/* regular files also need striping EA data in the reply */
3242 if (S_ISREG(inode->i_mode)) {
3243 rc = ll_get_default_mdsize(sbi, &ealen);
3246 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3249 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3250 0, ealen, LUSTRE_OPC_ANY,
3252 if (IS_ERR(op_data))
3253 RETURN(PTR_ERR(op_data));
3255 op_data->op_valid = valid;
3256 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3257 * capa for this inode. Because we only keep capas of dirs
3259 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3260 ll_finish_md_op_data(op_data);
3262 rc = ll_inode_revalidate_fini(inode, rc);
3266 rc = ll_prep_inode(&inode, req, NULL, NULL);
3269 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge per-stripe attributes from all MDTs
 * (md_merge_attr) and cache the combined size/nlink/times in the
 * ll_inode_info. Requires lli_lsm_md to be set.
 */
3273 static int ll_merge_md_attr(struct inode *inode)
3275 struct cl_attr attr = { 0 };
3278 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3279 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3284 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3285 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3287 ll_i2info(inode)->lli_atime = attr.cat_atime;
3288 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3289 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MDT attributes via __ll_inode_revalidate(),
 * then for non-regular inodes copy cached times into the inode (merging
 * striped-dir attrs first), and for regular files glimpse the size from
 * the OSTs — unless a restore is in progress, when the MDT-provided size
 * is already authoritative and a glimpse would block on the layout lock.
 * NOTE(review): extraction gaps — return type, rc checks and RETURN elided.
 */
3295 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3297 struct inode *inode = dentry->d_inode;
3301 rc = __ll_inode_revalidate(dentry, ibits);
3305 /* if object isn't regular file, don't validate size */
3306 if (!S_ISREG(inode->i_mode)) {
3307 if (S_ISDIR(inode->i_mode) &&
3308 ll_i2info(inode)->lli_lsm_md != NULL) {
3309 rc = ll_merge_md_attr(inode);
3314 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3315 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3316 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3318 /* In case of restore, the MDT has the right size and has
3319 * already send it back without granting the layout lock,
3320 * inode is up-to-date so glimpse is useless.
3321 * Also to glimpse we need the layout, in case of a running
3322 * restore the MDT holds the layout lock so the glimpse will
3323 * block up to the end of restore (getattr will block)
3325 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3326 rc = ll_glimpse_size(inode);
/*
 * getattr(2)/stat(2) handler: revalidate UPDATE|LOOKUP ibits, then fill
 * *stat from the inode, using a 32-bit-safe ino when the client needs a
 * 32-bit API, and stripe-merged nlink/size for striped directories.
 * NOTE(review): extraction gaps — res error check and final return elided.
 */
3331 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3333 struct inode *inode = de->d_inode;
3334 struct ll_sb_info *sbi = ll_i2sbi(inode);
3335 struct ll_inode_info *lli = ll_i2info(inode);
3338 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3339 MDS_INODELOCK_LOOKUP);
3340 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3345 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland needs an ino that fits in 32 bits */
3346 if (ll_need_32bit_api(sbi))
3347 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3349 stat->ino = inode->i_ino;
3350 stat->mode = inode->i_mode;
3351 stat->uid = inode->i_uid;
3352 stat->gid = inode->i_gid;
3353 stat->rdev = inode->i_rdev;
3354 stat->atime = inode->i_atime;
3355 stat->mtime = inode->i_mtime;
3356 stat->ctime = inode->i_ctime;
3357 stat->blksize = 1 << inode->i_blkbits;
3358 stat->blocks = inode->i_blocks;
/* striped dirs report the merged per-stripe nlink/size cached earlier */
3360 if (S_ISDIR(inode->i_mode) &&
3361 ll_i2info(inode)->lli_lsm_md != NULL) {
3362 stat->nlink = lli->lli_stripe_dir_nlink;
3363 stat->size = lli->lli_stripe_dir_size;
3365 stat->nlink = inode->i_nlink;
3366 stat->size = i_size_read(inode);
/*
 * FIEMAP ioctl handler: marshal the kernel fiemap_extent_info into a
 * Lustre ll_user_fiemap buffer, run ll_do_fiemap(), and copy flags and
 * mapped extents back to userspace's extent array.
 * NOTE(review): extraction gaps — num_bytes/rc declarations, ENOMEM
 * check and RETURN elided; code kept byte-identical.
 */
3372 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3373 __u64 start, __u64 len)
3377 struct ll_user_fiemap *fiemap;
3378 unsigned int extent_count = fieinfo->fi_extents_max;
3380 num_bytes = sizeof(*fiemap) + (extent_count *
3381 sizeof(struct ll_fiemap_extent));
3382 OBD_ALLOC_LARGE(fiemap, num_bytes);
3387 fiemap->fm_flags = fieinfo->fi_flags;
3388 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3389 fiemap->fm_start = start;
3390 fiemap->fm_length = len;
/* only the first extent is seeded from the caller (continuation case) */
3391 if (extent_count > 0)
3392 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3393 sizeof(struct ll_fiemap_extent));
3395 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3397 fieinfo->fi_flags = fiemap->fm_flags;
3398 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3399 if (extent_count > 0)
3400 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3401 fiemap->fm_mapped_extents *
3402 sizeof(struct ll_fiemap_extent));
3404 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode, taken
 * under lli_lock. The caller (VFS permission checking) releases the ref.
 */
3408 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3410 struct ll_inode_info *lli = ll_i2info(inode);
3411 struct posix_acl *acl = NULL;
3414 spin_lock(&lli->lli_lock);
3415 /* VFS' acl_permission_check->check_acl will release the refcount */
3416 acl = posix_acl_dup(lli->lli_posix_acl);
3417 spin_unlock(&lli->lli_lock);
/*
 * ACL permission callback for kernels without 2-arg generic_permission;
 * the 4-arg variant bails out under RCU walk (IPERM_FLAG_RCU) since
 * ll_get_acl may block. Compiled out entirely without CONFIG_FS_POSIX_ACL.
 * NOTE(review): extraction gaps — return type, NULL-acl check and the
 * no-ACL fallback return are elided; code kept byte-identical.
 */
3422 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3424 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3425 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3427 ll_check_acl(struct inode *inode, int mask)
3430 # ifdef CONFIG_FS_POSIX_ACL
3431 struct posix_acl *acl;
3435 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3436 if (flags & IPERM_FLAG_RCU)
3439 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3444 rc = posix_acl_permission(inode, acl, mask);
3445 posix_acl_release(acl);
3448 # else /* !CONFIG_FS_POSIX_ACL */
3450 # endif /* CONFIG_FS_POSIX_ACL */
3452 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission(2) handler (signature varies with kernel API).
 * Revalidates the root inode on first access, applies root-squash by
 * temporarily overriding the process credentials (fsuid/fsgid set to the
 * squash ids and FS capabilities dropped), then delegates to either
 * remote-permission checking or generic_permission with ll_check_acl.
 * NOTE(review): extraction gaps — rc declaration, RCU early-return
 * bodies, prepare_creds failure path and RETURN are elided.
 */
3454 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3455 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3457 # ifdef HAVE_INODE_PERMISION_2ARGS
3458 int ll_inode_permission(struct inode *inode, int mask)
3460 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3465 struct ll_sb_info *sbi;
3466 struct root_squash_info *squash;
3467 struct cred *cred = NULL;
3468 const struct cred *old_cred = NULL;
3470 bool squash_id = false;
/* cannot block during RCU-walk: let the VFS retry in ref-walk mode */
3473 #ifdef MAY_NOT_BLOCK
3474 if (mask & MAY_NOT_BLOCK)
3476 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3477 if (flags & IPERM_FLAG_RCU)
3481 /* as root inode are NOT getting validated in lookup operation,
3482 * need to do it before permission check. */
3484 if (inode == inode->i_sb->s_root->d_inode) {
3485 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3486 MDS_INODELOCK_LOOKUP);
3491 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3492 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3494 /* squash fsuid/fsgid if needed */
3495 sbi = ll_i2sbi(inode);
3496 squash = &sbi->ll_squash;
3497 if (unlikely(squash->rsi_uid != 0 &&
3498 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3499 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3503 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3504 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3505 squash->rsi_uid, squash->rsi_gid);
3507 /* update current process's credentials
3508 * and FS capability */
3509 cred = prepare_creds();
3513 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3514 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability bit for the squashed creds */
3515 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3516 if ((1 << cap) & CFS_CAP_FS_MASK)
3517 cap_lower(cred->cap_effective, cap);
3519 old_cred = override_creds(cred);
3522 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3524 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3525 rc = lustre_check_remote_perm(inode, mask);
3527 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3529 /* restore current process's credentials and FS capability */
3531 revert_creds(old_cred);
3538 /* -o localflock - only provides locally consistent flock locks */
/* Default file ops: no .flock/.lock entries, so the VFS handles
 * advisory locks locally only (per the -o localflock comment above). */
3539 struct file_operations ll_file_operations = {
3540 .read = ll_file_read,
3541 .aio_read = ll_file_aio_read,
3542 .write = ll_file_write,
3543 .aio_write = ll_file_aio_write,
3544 .unlocked_ioctl = ll_file_ioctl,
3545 .open = ll_file_open,
3546 .release = ll_file_release,
3547 .mmap = ll_file_mmap,
3548 .llseek = ll_file_seek,
3549 .splice_read = ll_file_splice_read,
/* File ops for -o flock mounts: identical to the default table, but
 * routes both flock() and fcntl locks through ll_file_flock for
 * cluster-wide consistency. */
3554 struct file_operations ll_file_operations_flock = {
3555 .read = ll_file_read,
3556 .aio_read = ll_file_aio_read,
3557 .write = ll_file_write,
3558 .aio_write = ll_file_aio_write,
3559 .unlocked_ioctl = ll_file_ioctl,
3560 .open = ll_file_open,
3561 .release = ll_file_release,
3562 .mmap = ll_file_mmap,
3563 .llseek = ll_file_seek,
3564 .splice_read = ll_file_splice_read,
3567 .flock = ll_file_flock,
3568 .lock = ll_file_flock
3571 /* These are for -o noflock - to return ENOSYS on flock calls */
/* Same table again, with both lock entry points wired to the
 * ll_file_noflock stub. */
3572 struct file_operations ll_file_operations_noflock = {
3573 .read = ll_file_read,
3574 .aio_read = ll_file_aio_read,
3575 .write = ll_file_write,
3576 .aio_write = ll_file_aio_write,
3577 .unlocked_ioctl = ll_file_ioctl,
3578 .open = ll_file_open,
3579 .release = ll_file_release,
3580 .mmap = ll_file_mmap,
3581 .llseek = ll_file_seek,
3582 .splice_read = ll_file_splice_read,
3585 .flock = ll_file_noflock,
3586 .lock = ll_file_noflock
/* Inode ops for regular files; .get_acl only on kernels providing the
 * inode_operations hook. */
3589 struct inode_operations ll_file_inode_operations = {
3590 .setattr = ll_setattr,
3591 .getattr = ll_getattr,
3592 .permission = ll_inode_permission,
3593 .setxattr = ll_setxattr,
3594 .getxattr = ll_getxattr,
3595 .listxattr = ll_listxattr,
3596 .removexattr = ll_removexattr,
3597 .fiemap = ll_fiemap,
3598 #ifdef HAVE_IOP_GET_ACL
3599 .get_acl = ll_get_acl,
3603 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data records guarded by an rwsem. llioc_data carries the
 * callback plus a flexible array of the cmd numbers it handles. */
3604 static struct llioc_ctl_data {
3605 struct rw_semaphore ioc_sem;
3606 struct list_head ioc_head;
3608 __RWSEM_INITIALIZER(llioc.ioc_sem),
3609 LIST_HEAD_INIT(llioc.ioc_head)
3614 struct list_head iocd_list;
3615 unsigned int iocd_size;
3616 llioc_callback_t iocd_cb;
3617 unsigned int iocd_count;
3618 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler for @count cmd numbers; returns an
 * opaque cookie (the allocated llioc_data) used for unregistration, or
 * NULL on bad args / allocation failure (returns elided by extraction).
 */
3621 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3624 struct llioc_data *in_data = NULL;
3627 if (cb == NULL || cmd == NULL ||
3628 count > LLIOC_MAX_CMD || count < 0)
3631 size = sizeof(*in_data) + count * sizeof(unsigned int);
3632 OBD_ALLOC(in_data, size);
3633 if (in_data == NULL)
3636 memset(in_data, 0, sizeof(*in_data));
3637 in_data->iocd_size = size;
3638 in_data->iocd_cb = cb;
3639 in_data->iocd_count = count;
3640 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3642 down_write(&llioc.ioc_sem);
3643 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3644 up_write(&llioc.ioc_sem);
/*
 * Unregister and free the handler identified by @magic (the cookie from
 * ll_iocontrol_register); warns if the cookie is unknown.
 * NOTE(review): extraction gaps — the magic comparison and early return
 * inside the loop are elided; code kept byte-identical.
 */
3649 void ll_iocontrol_unregister(void *magic)
3651 struct llioc_data *tmp;
3656 down_write(&llioc.ioc_sem);
3657 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3659 unsigned int size = tmp->iocd_size;
3661 list_del(&tmp->iocd_list);
3662 up_write(&llioc.ioc_sem);
3664 OBD_FREE(tmp, size);
3668 up_write(&llioc.ioc_sem);
3670 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3673 EXPORT_SYMBOL(ll_iocontrol_register);
3674 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch @cmd to the first registered dynamic handler claiming it,
 * under the registry read lock; stops iterating once a callback returns
 * LLIOC_STOP. *rcp receives the handler's result (default -EINVAL).
 */
3676 static enum llioc_iter
3677 ll_iocontrol_call(struct inode *inode, struct file *file,
3678 unsigned int cmd, unsigned long arg, int *rcp)
3680 enum llioc_iter ret = LLIOC_CONT;
3681 struct llioc_data *data;
3682 int rc = -EINVAL, i;
3684 down_read(&llioc.ioc_sem);
3685 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3686 for (i = 0; i < data->iocd_count; i++) {
3687 if (cmd != data->iocd_cmd[i])
3690 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3694 if (ret == LLIOC_STOP)
3697 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack (cl_conf_set in
 * a nested env). For OBJECT_CONF_SET, additionally allow the layout lock
 * to be matched and record the new layout generation on the inode.
 * NOTE(review): extraction gaps — result checks and RETURN elided.
 */
3704 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3706 struct ll_inode_info *lli = ll_i2info(inode);
3707 struct cl_env_nest nest;
3712 if (lli->lli_clob == NULL)
3715 env = cl_env_nested_get(&nest);
3717 RETURN(PTR_ERR(env));
3719 result = cl_conf_set(env, lli->lli_clob, conf);
3720 cl_env_nested_put(&nest, env);
3722 if (conf->coc_opc == OBJECT_CONF_SET) {
3723 struct ldlm_lock *lock = conf->coc_lock;
3725 LASSERT(lock != NULL);
3726 LASSERT(ldlm_has_layout(lock));
3728 struct lustre_md *md = conf->u.coc_md;
3729 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3731 /* it can only be allowed to match after layout is
3732 * applied to inode otherwise false layout would be
3733 * seen. Applying layout should happen before dropping
3734 * the intent lock. */
3735 ldlm_lock_allow_match(lock);
3737 lli->lli_has_smd = lsm_has_objects(md->lsm);
3738 if (md->lsm != NULL)
3739 gen = md->lsm->lsm_layout_gen;
3742 DFID ": layout version change: %u -> %u\n",
3743 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3745 ll_layout_version_set(lli, gen);
3751 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If the layout lock's LVB is not already populated and ready, fetch the
 * LOV EA (trusted.lov) from the MDT via md_getxattr and install a copy
 * as the lock's l_lvb_data under the resource lock.
 * NOTE(review): extraction gaps — lmm/lvbdata/rc declarations, several
 * early returns and the capa_put/EXIT path are elided; code byte-identical.
 */
3752 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3755 struct ll_sb_info *sbi = ll_i2sbi(inode);
3756 struct obd_capa *oc;
3757 struct ptlrpc_request *req;
3758 struct mdt_body *body;
3765 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3766 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3767 lock->l_lvb_data, lock->l_lvb_len);
3769 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3772 /* if layout lock was granted right away, the layout is returned
3773 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3774 * blocked and then granted via completion ast, we have to fetch
3775 * layout here. Please note that we can't use the LVB buffer in
3776 * completion AST because it doesn't have a large enough buffer */
3777 oc = ll_mdscapa_get(inode);
3778 rc = ll_get_default_mdsize(sbi, &lmmsize);
3780 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3781 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3787 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3789 GOTO(out, rc = -EPROTO);
3791 lmmsize = body->mbo_eadatasize;
3792 if (lmmsize == 0) /* empty layout */
3795 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3797 GOTO(out, rc = -EFAULT);
3799 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3800 if (lvbdata == NULL)
3801 GOTO(out, rc = -ENOMEM);
/* swap the freshly fetched EA in as the lock's LVB, freeing any old one */
3803 memcpy(lvbdata, lmm, lmmsize);
3804 lock_res_and_lock(lock);
3805 if (lock->l_lvb_data != NULL)
3806 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3808 lock->l_lvb_data = lvbdata;
3809 lock->l_lvb_len = lmmsize;
3810 unlock_res_and_lock(lock);
3815 ptlrpc_req_finished(req);
/*
3820 * Apply the layout to the inode. Layout lock is held and will be released
 * before return. Fetches the layout into the lock's LVB if needed,
 * unpacks it into an lsm, configures the cl_object (OBJECT_CONF_SET) and
 * records the layout generation in *gen; if the reconfiguration hits
 * -EBUSY it waits for in-flight IO via OBJECT_CONF_WAIT and retries
 * (retry loop partially elided by extraction).
 */
3823 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3824 struct inode *inode, __u32 *gen, bool reconf)
3826 struct ll_inode_info *lli = ll_i2info(inode);
3827 struct ll_sb_info *sbi = ll_i2sbi(inode);
3828 struct ldlm_lock *lock;
3829 struct lustre_md md = { NULL };
3830 struct cl_object_conf conf;
3833 bool wait_layout = false;
3836 LASSERT(lustre_handle_is_used(lockh));
3838 lock = ldlm_handle2lock(lockh);
3839 LASSERT(lock != NULL);
3840 LASSERT(ldlm_has_layout(lock));
3842 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3843 PFID(&lli->lli_fid), inode, reconf);
3845 /* in case this is a caching lock and reinstate with new inode */
3846 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3848 lock_res_and_lock(lock);
3849 lvb_ready = ldlm_is_lvb_ready(lock);
3850 unlock_res_and_lock(lock);
3851 /* checking lvb_ready is racy but this is okay. The worst case is
3852 * that multi processes may configure the file on the same time. */
3854 if (lvb_ready || !reconf) {
3857 /* layout_gen must be valid if layout lock is not
3858 * cancelled and stripe has already set */
3859 *gen = ll_layout_version_get(lli);
3865 rc = ll_layout_fetch(inode, lock);
3869 /* for layout lock, lmm is returned in lock's lvb.
3870 * lvb_data is immutable if the lock is held so it's safe to access it
3871 * without res lock. See the description in ldlm_lock_decref_internal()
3872 * for the condition to free lvb_data of layout lock */
3873 if (lock->l_lvb_data != NULL) {
3874 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3875 lock->l_lvb_data, lock->l_lvb_len);
3877 *gen = LL_LAYOUT_GEN_EMPTY;
3879 *gen = md.lsm->lsm_layout_gen;
3882 CERROR("%s: file "DFID" unpackmd error: %d\n",
3883 ll_get_fsname(inode->i_sb, NULL, 0),
3884 PFID(&lli->lli_fid), rc);
3890 /* set layout to file. Unlikely this will fail as old layout was
3891 * surely eliminated */
3892 memset(&conf, 0, sizeof conf);
3893 conf.coc_opc = OBJECT_CONF_SET;
3894 conf.coc_inode = inode;
3895 conf.coc_lock = lock;
3896 conf.u.coc_md = &md;
3897 rc = ll_layout_conf(inode, &conf);
3900 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3902 /* refresh layout failed, need to wait */
3903 wait_layout = rc == -EBUSY;
3907 LDLM_LOCK_PUT(lock);
3908 ldlm_lock_decref(lockh, mode);
3910 /* wait for IO to complete if it's still being used. */
3912 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3913 ll_get_fsname(inode->i_sb, NULL, 0),
3914 PFID(&lli->lli_fid), inode);
3916 memset(&conf, 0, sizeof conf);
3917 conf.coc_opc = OBJECT_CONF_WAIT;
3918 conf.coc_inode = inode;
3919 rc = ll_layout_conf(inode, &conf);
3923 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3924 ll_get_fsname(inode->i_sb, NULL, 0),
3925 PFID(&lli->lli_fid), rc);
/*
3931 * This function checks if there exists a LAYOUT lock on the client side,
3932 * or enqueues it if it doesn't have one in cache.
3934 * This function will not hold layout lock so it may be revoked any time after
3935 * this function returns. Any operations depend on layout should be redone
3938 * This function should be called before lov_io_init() to get an uptodate
3939 * layout version, the caller should save the version number and after IO
3940 * is finished, this function should be called again to verify that layout
3941 * is not changed during IO time.
 *
 * Implementation: fast-path match of a cached MDS_INODELOCK_LAYOUT lock
 * under lli_layout_mutex; otherwise enqueue an IT_LAYOUT intent lock on
 * the MDT and apply it via ll_layout_lock_set (retry on -EAGAIN elided
 * by extraction).
 */
3943 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3945 struct ll_inode_info *lli = ll_i2info(inode);
3946 struct ll_sb_info *sbi = ll_i2sbi(inode);
3947 struct md_op_data *op_data;
3948 struct lookup_intent it;
3949 struct lustre_handle lockh;
3951 struct ldlm_enqueue_info einfo = {
3952 .ei_type = LDLM_IBITS,
3954 .ei_cb_bl = &ll_md_blocking_ast,
3955 .ei_cb_cp = &ldlm_completion_ast,
3960 *gen = ll_layout_version_get(lli);
/* no work if layout locks are disabled or a valid generation is cached */
3961 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3965 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3966 LASSERT(S_ISREG(inode->i_mode));
3968 /* take layout lock mutex to enqueue layout lock exclusively. */
3969 mutex_lock(&lli->lli_layout_mutex);
3972 /* mostly layout lock is caching on the local side, so try to match
3973 * it before grabbing layout lock mutex. */
3974 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3975 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3976 if (mode != 0) { /* hit cached lock */
3977 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3981 mutex_unlock(&lli->lli_layout_mutex);
3985 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3986 0, 0, LUSTRE_OPC_ANY, NULL);
3987 if (IS_ERR(op_data)) {
3988 mutex_unlock(&lli->lli_layout_mutex);
3989 RETURN(PTR_ERR(op_data));
3992 /* have to enqueue one */
3993 memset(&it, 0, sizeof(it));
3994 it.it_op = IT_LAYOUT;
3995 lockh.cookie = 0ULL;
3997 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3998 ll_get_fsname(inode->i_sb, NULL, 0),
3999 PFID(&lli->lli_fid), inode);
4001 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
4002 if (it.d.lustre.it_data != NULL)
4003 ptlrpc_req_finished(it.d.lustre.it_data);
4004 it.d.lustre.it_data = NULL;
4006 ll_finish_md_op_data(op_data);
/* take ownership of the granted mode; drop the intent's reference */
4008 mode = it.d.lustre.it_lock_mode;
4009 it.d.lustre.it_lock_mode = 0;
4010 ll_intent_drop_lock(&it);
4013 /* set lock data in case this is a new lock */
4014 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4015 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4019 mutex_unlock(&lli->lli_layout_mutex);
4025 * This function send a restore request to the MDT
4027 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4029 struct hsm_user_request *hur;
4033 len = sizeof(struct hsm_user_request) +
4034 sizeof(struct hsm_user_item);
4035 OBD_ALLOC(hur, len);
4039 hur->hur_request.hr_action = HUA_RESTORE;
4040 hur->hur_request.hr_archive_id = 0;
4041 hur->hur_request.hr_flags = 0;
4042 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4043 sizeof(hur->hur_user_item[0].hui_fid));
4044 hur->hur_user_item[0].hui_extent.offset = offset;
4045 hur->hur_user_item[0].hui_extent.length = length;
4046 hur->hur_request.hr_itemcount = 1;
4047 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,