4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
52 #include "cl_object.h"
55 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
57 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
60 static enum llioc_iter
61 ll_iocontrol_call(struct inode *inode, struct file *file,
62 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate and initialize the per-open-file private data (ll_file_data).
 * NOTE(review): the NULL check after allocation and the return statement
 * are elided from this truncated view -- presumably returns NULL on OOM;
 * confirm against the full source. */
64 static struct ll_file_data *ll_file_data_get(void)
66 struct ll_file_data *fd;
/* GFP_NOFS: allocation may happen on a filesystem path; avoid FS reentry. */
68 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
72 fd->fd_write_failed = false;
/* Release per-open-file private data back to the ll_file_data slab. */
77 static void ll_file_data_put(struct ll_file_data *fd)
80 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current attributes (times, size, blocks, flags), the
 * current IO epoch, the open file handle @fh and the MDS capability into
 * @op_data in preparation for an MDS close/setattr-style request. */
83 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
84 struct lustre_handle *fh)
86 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
87 op_data->op_attr.ia_mode = inode->i_mode;
88 op_data->op_attr.ia_atime = inode->i_atime;
89 op_data->op_attr.ia_mtime = inode->i_mtime;
90 op_data->op_attr.ia_ctime = inode->i_ctime;
91 op_data->op_attr.ia_size = i_size_read(inode);
92 op_data->op_attr_blocks = inode->i_blocks;
93 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
94 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
96 op_data->op_handle = *fh;
97 op_data->op_capa1 = ll_mdscapa_get(inode);
/* If local data was modified, bias the RPC so the MDS clears the flag. */
99 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
100 op_data->op_bias |= MDS_DATA_MODIFIED;
104 * Closes the IO epoch and packs all the attributes into @op_data for
/* ... (comment continues in elided lines) ...
 * Prepare @op_data for the close of openhandle @och on @inode. */
107 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
108 struct obd_client_handle *och)
112 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
113 ATTR_MTIME | ATTR_MTIME_SET |
114 ATTR_CTIME | ATTR_CTIME_SET;
116 if (!(och->och_flags & FMODE_WRITE))
/* Without SoM support (or for non-regular files), send size/blocks too. */
119 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
120 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
122 ll_ioepoch_close(inode, op_data, &och, 0);
125 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
126 ll_prep_md_op_data(op_data, inode, NULL, NULL,
127 0, 0, LUSTRE_OPC_ANY, NULL);
/* Close the MDS open handle @och for @inode via an md_close RPC.
 * If @data_version is non-NULL the close is an HSM release.
 * NOTE(review): several control-flow lines (checks, GOTO targets, returns)
 * are elided from this truncated view; the annotations below describe only
 * what the visible statements establish. */
131 static int ll_close_inode_openhandle(struct obd_export *md_exp,
133 struct obd_client_handle *och,
134 const __u64 *data_version)
136 struct obd_export *exp = ll_i2mdexp(inode);
137 struct md_op_data *op_data;
138 struct ptlrpc_request *req = NULL;
139 struct obd_device *obd = class_exp2obd(exp);
146 * XXX: in case of LMV, is this correct to access
149 CERROR("Invalid MDC connection handle "LPX64"\n",
150 ll_i2mdexp(inode)->exp_handle.h_cookie);
154 OBD_ALLOC_PTR(op_data);
156 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
158 ll_prepare_close(inode, op_data, och);
159 if (data_version != NULL) {
160 /* Pass in data_version implies release. */
161 op_data->op_bias |= MDS_HSM_RELEASE;
162 op_data->op_data_version = *data_version;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
166 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
167 rc = md_close(md_exp, op_data, och->och_mod, &req);
169 /* This close must have the epoch closed. */
170 LASSERT(epoch_close);
171 /* MDS has instructed us to obtain Size-on-MDS attribute from
172 * OSTs and send setattr back to MDS. */
173 rc = ll_som_update(inode, op_data);
175 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
176 " failed: rc = %d\n",
177 ll_i2mdexp(inode)->exp_obd->obd_name,
178 PFID(ll_inode2fid(inode)), rc);
182 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
183 ll_i2mdexp(inode)->exp_obd->obd_name,
184 PFID(ll_inode2fid(inode)), rc);
187 /* DATA_MODIFIED flag was successfully sent on close, cancel data
188 * modification flag. */
189 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
190 struct ll_inode_info *lli = ll_i2info(inode);
192 spin_lock(&lli->lli_lock);
193 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
194 spin_unlock(&lli->lli_lock);
/* HSM release: verify the server actually released the file's data. */
197 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
198 struct mdt_body *body;
199 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
200 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
204 ll_finish_md_op_data(op_data);
/* If the epoch is still open under SoM, defer DONE_WRITING processing. */
208 if (exp_connect_som(exp) && !epoch_close &&
209 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
210 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
212 md_clear_open_replay_data(md_exp, och);
213 /* Free @och if it is not waiting for DONE_WRITING. */
214 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
217 if (req) /* This is close request */
218 ptlrpc_req_finished(req);
/* Really close the MDS open handle of the given @fmode class (write/exec/
 * read) for @inode, unless other users of the same handle remain.
 * NOTE(review): the handle-swap under the mutex and the final return are
 * elided from this truncated view. */
222 int ll_md_real_close(struct inode *inode, fmode_t fmode)
224 struct ll_inode_info *lli = ll_i2info(inode);
225 struct obd_client_handle **och_p;
226 struct obd_client_handle *och;
/* Select which cached open handle / use count matches this open mode. */
231 if (fmode & FMODE_WRITE) {
232 och_p = &lli->lli_mds_write_och;
233 och_usecount = &lli->lli_open_fd_write_count;
234 } else if (fmode & FMODE_EXEC) {
235 och_p = &lli->lli_mds_exec_och;
236 och_usecount = &lli->lli_open_fd_exec_count;
238 LASSERT(fmode & FMODE_READ);
239 och_p = &lli->lli_mds_read_och;
240 och_usecount = &lli->lli_open_fd_read_count;
243 mutex_lock(&lli->lli_och_mutex);
244 if (*och_usecount > 0) {
245 /* There are still users of this handle, so skip
247 mutex_unlock(&lli->lli_och_mutex);
253 mutex_unlock(&lli->lli_och_mutex);
256 /* There might be a race and this handle may already
258 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: drop group lock and lease if held, decrement
 * the open-mode use count, and close the MDS handle unless an OPEN DLM
 * lock lets us skip talking to the MDS. Frees the file's private data. */
265 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
268 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
269 struct ll_inode_info *lli = ll_i2info(inode);
273 /* clear group lock, if present */
274 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
275 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
277 if (fd->fd_lease_och != NULL) {
280 /* Usually the lease is not released when the
281 * application crashed, we need to release here. */
282 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
283 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
284 PFID(&lli->lli_fid), rc, lease_broken);
286 fd->fd_lease_och = NULL;
289 if (fd->fd_och != NULL) {
290 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
295 /* Let's see if we have good enough OPEN lock on the file and if
296 we can skip talking to MDS */
297 if (file->f_dentry->d_inode) { /* Can this ever be false? */
299 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
300 struct lustre_handle lockh;
301 struct inode *inode = file->f_dentry->d_inode;
302 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
304 mutex_lock(&lli->lli_och_mutex);
305 if (fd->fd_omode & FMODE_WRITE) {
307 LASSERT(lli->lli_open_fd_write_count);
308 lli->lli_open_fd_write_count--;
309 } else if (fd->fd_omode & FMODE_EXEC) {
311 LASSERT(lli->lli_open_fd_exec_count);
312 lli->lli_open_fd_exec_count--;
315 LASSERT(lli->lli_open_fd_read_count);
316 lli->lli_open_fd_read_count--;
318 mutex_unlock(&lli->lli_och_mutex);
/* No matching OPEN lock cached -> must do the real close with the MDS. */
320 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
321 LDLM_IBITS, &policy, lockmode,
323 rc = ll_md_real_close(file->f_dentry->d_inode,
327 CERROR("released file has negative dentry: file = %p, "
328 "dentry = %p, name = %s\n",
329 file, file->f_dentry, file->f_dentry->d_name.name);
333 LUSTRE_FPRIVATE(file) = NULL;
334 ll_file_data_put(fd);
335 ll_capa_close(inode);
340 /* While this returns an error code, fput() the caller does not, so we need
341 * to make every effort to clean up all of our state here. Also, applications
342 * rarely check close errors and even if an error is returned they will not
343 * re-try the close call.
345 int ll_file_release(struct inode *inode, struct file *file)
347 struct ll_file_data *fd;
348 struct ll_sb_info *sbi = ll_i2sbi(inode);
349 struct ll_inode_info *lli = ll_i2info(inode);
353 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
354 PFID(ll_inode2fid(inode)), inode);
/* Remote-client ACL bookkeeping only applies to the root inode. */
356 #ifdef CONFIG_FS_POSIX_ACL
357 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
358 inode == inode->i_sb->s_root->d_inode) {
359 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
362 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
363 fd->fd_flags &= ~LL_FILE_RMTACL;
364 rct_del(&sbi->ll_rct, current_pid());
365 et_search_free(&sbi->ll_et, current_pid());
370 if (inode->i_sb->s_root != file->f_dentry)
371 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
372 fd = LUSTRE_FPRIVATE(file);
375 /* The last ref on @file, maybe not the owner pid of statahead,
376 * because parent and child process can share the same file handle. */
377 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
378 ll_deauthorize_statahead(inode, fd);
/* Root inode has no MDS open handle to close; just drop private data. */
380 if (inode->i_sb->s_root == file->f_dentry) {
381 LUSTRE_FPRIVATE(file) = NULL;
382 ll_file_data_put(fd);
386 if (!S_ISDIR(inode->i_mode)) {
387 if (lli->lli_clob != NULL)
388 lov_read_and_clear_async_rc(lli->lli_clob);
389 lli->lli_async_rc = 0;
392 rc = ll_md_close(sbi->ll_md_exp, inode, file);
394 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
395 libcfs_debug_dumplog();
/* Issue an intent-OPEN enqueue to the MDS for @file, optionally packing a
 * striping request (@lmm/@lmmsize). On success the lookup intent @itp
 * carries the open disposition and lock data. */
400 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
401 struct lookup_intent *itp)
403 struct dentry *de = file->f_dentry;
404 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
405 struct dentry *parent = de->d_parent;
406 const char *name = NULL;
408 struct md_op_data *op_data;
409 struct ptlrpc_request *req = NULL;
413 LASSERT(parent != NULL);
414 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
416 /* if server supports open-by-fid, or file name is invalid, don't pack
417 * name in open request */
418 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
419 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
420 name = de->d_name.name;
421 len = de->d_name.len;
424 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
425 name, len, 0, LUSTRE_OPC_ANY, NULL);
427 RETURN(PTR_ERR(op_data));
428 op_data->op_data = lmm;
429 op_data->op_data_size = lmmsize;
431 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
432 &ll_md_blocking_ast, 0);
433 ll_finish_md_op_data(op_data);
435 /* reason for keeping own exit path - don't flood log
436 * with messages with -ESTALE errors.
438 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
439 it_open_error(DISP_OPEN_OPEN, itp))
441 ll_release_openhandle(de, itp);
445 if (it_disposition(itp, DISP_LOOKUP_NEG))
446 GOTO(out, rc = -ENOENT);
448 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
449 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
450 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update/refresh the inode from the reply and stash the lock data. */
454 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
455 if (!rc && itp->d.lustre.it_lock_mode)
456 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
459 ptlrpc_req_finished(req);
460 ll_intent_drop_lock(itp);
466 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
467 * not believe attributes if a few ioepoch holders exist. Attributes for
468 * previous ioepoch if new one is opened are also skipped by MDS.
470 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a new, non-zero epoch; re-opening the same epoch is a no-op. */
472 if (ioepoch && lli->lli_ioepoch != ioepoch) {
473 lli->lli_ioepoch = ioepoch;
474 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
475 ioepoch, PFID(&lli->lli_fid));
/* Fill the client open handle @och from the MDS open reply carried in the
 * intent, then register it for open replay. Returns the result of
 * md_set_open_replay_data(). */
479 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
480 struct obd_client_handle *och)
482 struct ptlrpc_request *req = it->d.lustre.it_data;
483 struct mdt_body *body;
485 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
486 och->och_fh = body->mbo_handle;
487 och->och_fid = body->mbo_fid1;
488 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
489 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
490 och->och_flags = it->it_flags;
492 return md_set_open_replay_data(md_exp, och, it);
/* Complete the client-side part of an open: optionally fill @och from the
 * intent reply, record the IO epoch, and attach @fd as the file's private
 * data (readahead state, open mode, cl_context bookkeeping). */
495 static int ll_local_open(struct file *file, struct lookup_intent *it,
496 struct ll_file_data *fd, struct obd_client_handle *och)
498 struct inode *inode = file->f_dentry->d_inode;
499 struct ll_inode_info *lli = ll_i2info(inode);
502 LASSERT(!LUSTRE_FPRIVATE(file));
507 struct ptlrpc_request *req = it->d.lustre.it_data;
508 struct mdt_body *body;
511 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
515 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
516 ll_ioepoch_open(lli, body->mbo_ioepoch);
519 LUSTRE_FPRIVATE(file) = fd;
520 ll_readahead_init(inode, &fd->fd_ras);
521 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
523 /* ll_cl_context initialize */
524 rwlock_init(&fd->fd_lock);
525 INIT_LIST_HEAD(&fd->fd_lccs);
530 /* Open a file, and (for the very first open) create objects on the OSTs at
531 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
532 * creation or open until ll_lov_setstripe() ioctl is called.
534 * If we already have the stripe MD locally then we don't request it in
535 * md_open(), by passing a lmm_size = 0.
537 * It is up to the application to ensure no other processes open this file
538 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
539 * used. We might be able to avoid races of that sort by getting lli_open_sem
540 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
541 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
543 int ll_file_open(struct inode *inode, struct file *file)
545 struct ll_inode_info *lli = ll_i2info(inode);
546 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
547 .it_flags = file->f_flags };
548 struct obd_client_handle **och_p = NULL;
549 __u64 *och_usecount = NULL;
550 struct ll_file_data *fd;
554 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
555 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by the lookup path (atomic_open). */
557 it = file->private_data; /* XXX: compat macro */
558 file->private_data = NULL; /* prevent ll_local_open assertion */
560 fd = ll_file_data_get();
562 GOTO(out_openerr, rc = -ENOMEM);
565 if (S_ISDIR(inode->i_mode))
566 ll_authorize_statahead(inode, fd);
/* Root inode: no MDS open needed, just attach the private data. */
568 if (inode->i_sb->s_root == file->f_dentry) {
569 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN intent. */
573 if (!it || !it->d.lustre.it_disposition) {
574 /* Convert f_flags into access mode. We cannot use file->f_mode,
575 * because everything but O_ACCMODE mask was stripped from
577 if ((oit.it_flags + 1) & O_ACCMODE)
579 if (file->f_flags & O_TRUNC)
580 oit.it_flags |= FMODE_WRITE;
582 /* kernel only call f_op->open in dentry_open. filp_open calls
583 * dentry_open after call to open_namei that checks permissions.
584 * Only nfsd_open call dentry_open directly without checking
585 * permissions and because of that this code below is safe. */
586 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
587 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
589 /* We do not want O_EXCL here, presumably we opened the file
590 * already? XXX - NFS implications? */
591 oit.it_flags &= ~O_EXCL;
593 /* bug20584, if "it_flags" contains O_CREAT, the file will be
594 * created if necessary, then "IT_CREAT" should be set to keep
595 * consistent with it */
596 if (oit.it_flags & O_CREAT)
597 oit.it_op |= IT_CREAT;
603 /* Let's see if we have file open on MDS already. */
604 if (it->it_flags & FMODE_WRITE) {
605 och_p = &lli->lli_mds_write_och;
606 och_usecount = &lli->lli_open_fd_write_count;
607 } else if (it->it_flags & FMODE_EXEC) {
608 och_p = &lli->lli_mds_exec_och;
609 och_usecount = &lli->lli_open_fd_exec_count;
611 och_p = &lli->lli_mds_read_och;
612 och_usecount = &lli->lli_open_fd_read_count;
615 mutex_lock(&lli->lli_och_mutex);
616 if (*och_p) { /* Open handle is present */
617 if (it_disposition(it, DISP_OPEN_OPEN)) {
618 /* Well, there's extra open request that we do not need,
619 let's close it somehow. This will decref request. */
620 rc = it_open_error(DISP_OPEN_OPEN, it);
622 mutex_unlock(&lli->lli_och_mutex);
623 GOTO(out_openerr, rc);
626 ll_release_openhandle(file->f_dentry, it);
/* Reuse the cached MDS open handle for this open mode. */
630 rc = ll_local_open(file, it, fd, NULL);
633 mutex_unlock(&lli->lli_och_mutex);
634 GOTO(out_openerr, rc);
637 LASSERT(*och_usecount == 0);
638 if (!it->d.lustre.it_disposition) {
639 /* We cannot just request lock handle now, new ELC code
640 means that one of other OPEN locks for this file
641 could be cancelled, and since blocking ast handler
642 would attempt to grab och_mutex as well, that would
643 result in a deadlock */
644 mutex_unlock(&lli->lli_och_mutex);
646 * Normally called under two situations:
648 * 2. A race/condition on MDS resulting in no open
649 * handle to be returned from LOOKUP|OPEN request,
650 * for example if the target entry was a symlink.
652 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
654 * Always specify MDS_OPEN_BY_FID because we don't want
655 * to get file with different fid.
657 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
658 rc = ll_intent_file_open(file, NULL, 0, it);
660 GOTO(out_openerr, rc);
664 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
666 GOTO(out_och_free, rc = -ENOMEM);
670 /* md_intent_lock() didn't get a request ref if there was an
671 * open error, so don't do cleanup on the request here
673 /* XXX (green): Should not we bail out on any error here, not
674 * just open error? */
675 rc = it_open_error(DISP_OPEN_OPEN, it);
677 GOTO(out_och_free, rc);
679 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
680 "inode %p: disposition %x, status %d\n", inode,
681 it_disposition(it, ~0), it->d.lustre.it_status);
683 rc = ll_local_open(file, it, fd, *och_p);
685 GOTO(out_och_free, rc);
687 mutex_unlock(&lli->lli_och_mutex);
690 /* Must do this outside lli_och_mutex lock to prevent deadlock where
691 different kind of OPEN lock for this same inode gets cancelled
692 by ldlm_cancel_lru */
693 if (!S_ISREG(inode->i_mode))
694 GOTO(out_och_free, rc);
/* Delay OST object creation for O_LOV_DELAY_CREATE / read-only opens. */
698 if (!lli->lli_has_smd &&
699 (cl_is_lov_delay_create(file->f_flags) ||
700 (file->f_mode & FMODE_WRITE) == 0)) {
701 CDEBUG(D_INODE, "object creation was delayed\n");
702 GOTO(out_och_free, rc);
704 cl_lov_delay_create_clear(&file->f_flags);
705 GOTO(out_och_free, rc);
/* Error path: free the handle we allocated (labels elided in this view). */
709 if (och_p && *och_p) {
710 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
711 *och_p = NULL; /* OBD_FREE writes some magic there */
714 mutex_unlock(&lli->lli_och_mutex);
717 if (lli->lli_opendir_key == fd)
718 ll_deauthorize_statahead(inode, fd);
720 ll_file_data_put(fd);
722 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
725 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
726 ptlrpc_req_finished(it->d.lustre.it_data);
727 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on a blocking callback, asynchronously
 * cancel the lease lock. The CANCELING branch body is elided from this
 * truncated view. */
733 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
734 struct ldlm_lock_desc *desc, void *data, int flag)
737 struct lustre_handle lockh;
741 case LDLM_CB_BLOCKING:
742 ldlm_lock2handle(lock, &lockh);
743 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
745 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
749 case LDLM_CB_CANCELING:
757 * Acquire a lease and open the file.
/* Acquire an MDS open lease (read or write) on @inode and return the
 * resulting open handle, or an ERR_PTR on failure. If @file is given, an
 * existing openhandle may be reused and its cookie passed to the MDT.
 * NOTE(review): several branches (och allocation, use-count swap) are
 * elided from this truncated view. */
759 static struct obd_client_handle *
760 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
763 struct lookup_intent it = { .it_op = IT_OPEN };
764 struct ll_sb_info *sbi = ll_i2sbi(inode);
765 struct md_op_data *op_data;
766 struct ptlrpc_request *req = NULL;
767 struct lustre_handle old_handle = { 0 };
768 struct obd_client_handle *och = NULL;
/* A lease must be exactly read or exactly write, never a combination. */
773 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
774 RETURN(ERR_PTR(-EINVAL));
777 struct ll_inode_info *lli = ll_i2info(inode);
778 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
779 struct obd_client_handle **och_p;
782 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
783 RETURN(ERR_PTR(-EPERM));
785 /* Get the openhandle of the file */
787 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor is allowed. */
788 if (fd->fd_lease_och != NULL) {
789 mutex_unlock(&lli->lli_och_mutex);
793 if (fd->fd_och == NULL) {
794 if (file->f_mode & FMODE_WRITE) {
795 LASSERT(lli->lli_mds_write_och != NULL);
796 och_p = &lli->lli_mds_write_och;
797 och_usecount = &lli->lli_open_fd_write_count;
799 LASSERT(lli->lli_mds_read_och != NULL);
800 och_p = &lli->lli_mds_read_och;
801 och_usecount = &lli->lli_open_fd_read_count;
803 if (*och_usecount == 1) {
810 mutex_unlock(&lli->lli_och_mutex);
811 if (rc < 0) /* more than 1 opener */
814 LASSERT(fd->fd_och != NULL);
815 old_handle = fd->fd_och->och_fh;
820 RETURN(ERR_PTR(-ENOMEM));
822 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
823 LUSTRE_OPC_ANY, NULL);
825 GOTO(out, rc = PTR_ERR(op_data));
827 /* To tell the MDT this openhandle is from the same owner */
828 op_data->op_handle = old_handle;
830 it.it_flags = fmode | open_flags;
831 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
832 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
833 &ll_md_blocking_lease_ast,
834 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
835 * it can be cancelled which may mislead applications that the lease is
837 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
838 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
839 * doesn't deal with openhandle, so normal openhandle will be leaked. */
840 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
841 ll_finish_md_op_data(op_data);
842 ptlrpc_req_finished(req);
844 GOTO(out_release_it, rc);
846 if (it_disposition(&it, DISP_LOOKUP_NEG))
847 GOTO(out_release_it, rc = -ENOENT);
849 rc = it_open_error(DISP_OPEN_OPEN, &it);
851 GOTO(out_release_it, rc);
853 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
854 ll_och_fill(sbi->ll_md_exp, &it, och);
856 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
857 GOTO(out_close, rc = -EOPNOTSUPP);
859 /* already get lease, handle lease lock */
860 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
861 if (it.d.lustre.it_lock_mode == 0 ||
862 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
863 /* open lock must return for lease */
864 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
865 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
866 it.d.lustre.it_lock_bits);
867 GOTO(out_close, rc = -EPROTO);
870 ll_intent_release(&it);
/* Error paths below: undo the open/lock we acquired before failing. */
874 /* Cancel open lock */
875 if (it.d.lustre.it_lock_mode != 0) {
876 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
877 it.d.lustre.it_lock_mode);
878 it.d.lustre.it_lock_mode = 0;
879 och->och_lease_handle.cookie = 0ULL;
881 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
883 CERROR("%s: error closing file "DFID": %d\n",
884 ll_get_fsname(inode->i_sb, NULL, 0),
885 PFID(&ll_i2info(inode)->lli_fid), rc2);
886 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
888 ll_intent_release(&it);
896 * Release lease and close the file.
897 * It will check if the lease has ever broken.
899 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
902 struct ldlm_lock *lock;
903 bool cancelled = true;
/* Check whether the lease lock was already cancelled (i.e. broken). */
907 lock = ldlm_handle2lock(&och->och_lease_handle);
909 lock_res_and_lock(lock);
910 cancelled = ldlm_is_cancel(lock);
911 unlock_res_and_lock(lock);
915 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
916 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* If still intact, cancel the lease lock ourselves before closing. */
919 ldlm_cli_cancel(&och->och_lease_handle, 0);
920 if (lease_broken != NULL)
921 *lease_broken = cancelled;
923 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
928 /* Fills the obdo with the attributes for the lsm */
/* Perform an async OST getattr over all stripes of @lsm and merge the
 * result into @obdo. @dv_flags (LL_DV_RD_FLUSH/LL_DV_WR_FLUSH) requests a
 * server-side lock so cached/dirty data is flushed before the getattr. */
929 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
930 struct obd_capa *capa, struct obdo *obdo,
931 __u64 ioepoch, int dv_flags)
933 struct ptlrpc_request_set *set;
934 struct obd_info oinfo = { { { 0 } } };
939 LASSERT(lsm != NULL);
943 oinfo.oi_oa->o_oi = lsm->lsm_oi;
944 oinfo.oi_oa->o_mode = S_IFREG;
945 oinfo.oi_oa->o_ioepoch = ioepoch;
946 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
947 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
948 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
949 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
950 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
951 OBD_MD_FLDATAVERSION;
952 oinfo.oi_capa = capa;
953 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
954 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
955 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
956 if (dv_flags & LL_DV_WR_FLUSH)
957 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
960 set = ptlrpc_prep_set();
962 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
965 rc = obd_getattr_async(exp, &oinfo, set);
967 rc = ptlrpc_set_wait(set);
968 ptlrpc_set_destroy(set);
/* Mask out everything the caller did not ask to have refreshed. */
971 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
972 OBD_MD_FLATIME | OBD_MD_FLMTIME |
973 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
974 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* Write-flush requested but the server did not confirm the flush. */
975 if (dv_flags & LL_DV_WR_FLUSH &&
976 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
977 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
984 * Performs the getattr on the inode and updates its fields.
985 * If @sync != 0, perform the getattr under the server-side lock.
987 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
988 __u64 ioepoch, int sync)
990 struct obd_capa *capa = ll_mdscapa_get(inode);
991 struct lov_stripe_md *lsm;
995 lsm = ccc_inode_lsm_get(inode);
996 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
997 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1000 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* Merge the freshly fetched OST attributes into the VFS inode. */
1002 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1003 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1004 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1005 (unsigned long long)inode->i_blocks,
1006 1UL << inode->i_blkbits);
1008 ccc_inode_lsm_put(inode, lsm);
/* Merge the MDS-provided timestamps cached in lli_lvb with the attributes
 * obtained from the OSTs (via the cl_object layer), taking the most recent
 * of each, and update the inode's size/blocks under the size lock. */
1012 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1014 struct ll_inode_info *lli = ll_i2info(inode);
1015 struct cl_object *obj = lli->lli_clob;
1016 struct cl_attr *attr = ccc_env_thread_attr(env);
1022 ll_inode_size_lock(inode);
1023 /* merge timestamps the most recently obtained from mds with
1024 timestamps obtained from osts */
1025 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1026 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1027 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1029 lvb.lvb_size = i_size_read(inode);
1030 lvb.lvb_blocks = inode->i_blocks;
1031 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1032 lvb.lvb_atime = LTIME_S(inode->i_atime);
1033 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1035 cl_object_attr_lock(obj);
1036 rc = cl_object_attr_get(env, obj, attr);
1037 cl_object_attr_unlock(obj);
/* Keep the newer of MDS-cached vs OST-reported time stamps. */
1040 if (lvb.lvb_atime < attr->cat_atime)
1041 lvb.lvb_atime = attr->cat_atime;
1042 if (lvb.lvb_ctime < attr->cat_ctime)
1043 lvb.lvb_ctime = attr->cat_ctime;
1044 if (lvb.lvb_mtime < attr->cat_mtime)
1045 lvb.lvb_mtime = attr->cat_mtime;
1047 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1048 PFID(&lli->lli_fid), attr->cat_size);
1049 cl_isize_write_nolock(inode, attr->cat_size);
1051 inode->i_blocks = attr->cat_blocks;
1053 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1054 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1055 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1057 ll_inode_size_unlock(inode);
/* Glimpse the OSTs for @lsm and copy the resulting size/blocks/times into
 * the user-visible stat structure @st. */
1062 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1065 struct obdo obdo = { 0 };
1068 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1070 st->st_size = obdo.o_size;
1071 st->st_blocks = obdo.o_blocks;
1072 st->st_mtime = obdo.o_mtime;
1073 st->st_atime = obdo.o_atime;
1074 st->st_ctime = obdo.o_ctime;
/* Return true when atime updates should be suppressed for @file, checking
 * the same flags the kernel's file_accessed()/touch_atime() consult:
 * O_NOATIME, per-inode/sb noatime flags and mount options. */
1079 static bool file_is_noatime(const struct file *file)
1081 const struct vfsmount *mnt = file->f_path.mnt;
1082 const struct inode *inode = file->f_path.dentry->d_inode;
1084 /* Adapted from file_accessed() and touch_atime().*/
1085 if (file->f_flags & O_NOATIME)
1088 if (inode->i_flags & S_NOATIME)
1091 if (IS_NOATIME(inode))
1094 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1097 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1100 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: nonblocking/append/sync flags, target cl_object, and the lock
 * requirement policy. */
1106 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1108 struct inode *inode = file->f_dentry->d_inode;
1110 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1112 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1113 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1114 file->f_flags & O_DIRECT ||
1117 io->ci_obj = ll_i2info(inode)->lli_clob;
1118 io->ci_lockreq = CILR_MAYBE;
/* Group/no-lock files bypass DLM locking; O_APPEND must lock [pos, EOF]. */
1119 if (ll_file_nolock(file)) {
1120 io->ci_lockreq = CILR_NEVER;
1121 io->ci_no_srvlock = 1;
1122 } else if (file->f_flags & O_APPEND) {
1123 io->ci_lockreq = CILR_MANDATORY;
1126 io->ci_noatime = file_is_noatime(file);
/* Common driver for file reads and writes through the cl_io machinery.
 * Takes the per-inode range lock for non-group-locked writes, runs the
 * cl_io loop, handles restart for lockless-IO races, and updates stats.
 * NOTE(review): the restart loop target, splice case label and return are
 * elided from this truncated view. */
1130 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1131 struct file *file, enum cl_io_type iot,
1132 loff_t *ppos, size_t count)
1134 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1135 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1138 struct range_lock range;
1141 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1142 file->f_dentry->d_name.name, iot, *ppos, count);
1145 io = ccc_env_thread_io(env);
1146 ll_io_init(io, file, iot == CIT_WRITE);
1148 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1149 struct vvp_io *vio = vvp_env_io(env);
1150 struct ccc_io *cio = ccc_env_io(env);
1151 bool range_locked = false;
/* O_APPEND writes can land anywhere, so lock the whole file range. */
1153 if (file->f_flags & O_APPEND)
1154 range_lock_init(&range, 0, LUSTRE_EOF);
1156 range_lock_init(&range, *ppos, *ppos + count - 1);
1157 cio->cui_fd = LUSTRE_FPRIVATE(file);
1158 vio->cui_io_subtype = args->via_io_subtype;
1160 switch (vio->cui_io_subtype) {
1162 cio->cui_iov = args->u.normal.via_iov;
1163 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1164 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1165 cio->cui_iocb = args->u.normal.via_iocb;
/* Serialize concurrent writes on this inode (group locks excepted). */
1166 if ((iot == CIT_WRITE) &&
1167 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1168 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1170 result = range_lock(&lli->lli_write_tree,
1175 range_locked = true;
1177 down_read(&lli->lli_trunc_sem);
1180 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1181 vio->u.splice.cui_flags = args->u.splice.via_flags;
1184 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1188 ll_cl_add(file, env, io);
1189 result = cl_io_loop(env, io);
1190 ll_cl_remove(file, env);
1192 if (args->via_io_subtype == IO_NORMAL)
1193 up_read(&lli->lli_trunc_sem);
1195 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1197 range_unlock(&lli->lli_write_tree, &range);
1200 /* cl_io_rw_init() handled IO */
1201 result = io->ci_result;
1204 if (io->ci_nob > 0) {
1205 result = io->ci_nob;
1206 *ppos = io->u.ci_wr.wr.crw_pos;
1210 cl_io_fini(env, io);
1211 /* If any bit been read/written (result != 0), we just return
1212 * short read/write instead of restart io. */
1213 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1214 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zu\n",
1215 iot == CIT_READ ? "read" : "write",
1216 file->f_dentry->d_name.name, *ppos, count);
1217 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1221 if (iot == CIT_READ) {
1223 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1224 LPROC_LL_READ_BYTES, result);
1225 } else if (iot == CIT_WRITE) {
1227 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1228 LPROC_LL_WRITE_BYTES, result);
1229 fd->fd_write_failed = false;
1230 } else if (result != -ERESTARTSYS) {
1231 fd->fd_write_failed = true;
1234 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
/*
 * Sanity-check a user iovec before IO: trim *nr_segs/*count at the first
 * segment whose memory is not accessible, and reject any segment with a
 * negative length or a cumulative length that wraps.
 * NOTE(review): several lines of this body are not visible in this chunk;
 * the error-return paths are assumed to follow the upstream kernel copy.
 */
1241  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1243 static int ll_file_get_iov_count(const struct iovec *iov,
1244                                 unsigned long *nr_segs, size_t *count)
1249        for (seg = 0; seg < *nr_segs; seg++) {
1250                const struct iovec *iv = &iov[seg];
1253                 * If any segment has a negative length, or the cumulative
1254                 * length ever wraps negative then return -EINVAL.
/* OR-ing the running total with the segment length detects either value
 * having its sign bit set in a single test. */
1257                if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1259                if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
/* Inaccessible segment: drop it (and everything after) from the count. */
1264                cnt -= iv->iov_len;     /* This segment is no good */
/*
 * AIO read entry point (.aio_read): validate the iovec, pack it into the
 * per-thread vvp_io_args and run the generic client IO path for CIT_READ.
 */
1271 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1272                                unsigned long nr_segs, loff_t pos)
1275        struct vvp_io_args *args;
1281        result = ll_file_get_iov_count(iov, &nr_segs, &count);
1285        env = cl_env_get(&refcheck);
1287                RETURN(PTR_ERR(env));
1289        args = vvp_env_args(env, IO_NORMAL);
/* Cast away const: the lower layers take a mutable iovec pointer. */
1290        args->u.normal.via_iov = (struct iovec *)iov;
1291        args->u.normal.via_nrsegs = nr_segs;
1292        args->u.normal.via_iocb = iocb;
/* IO position is taken from (and advanced through) iocb->ki_pos. */
1294        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1295                                    &iocb->ki_pos, count);
1296        cl_env_put(env, &refcheck);
/*
 * Synchronous read entry point (.read): wrap the user buffer in a
 * single-segment iovec plus a sync kiocb (both stored in per-thread vvp
 * env info to avoid stack usage) and delegate to ll_file_aio_read().
 */
1300 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1304        struct iovec *local_iov;
1305        struct kiocb *kiocb;
1310        env = cl_env_get(&refcheck);
1312                RETURN(PTR_ERR(env));
1314        local_iov = &vvp_env_info(env)->vti_local_iov;
1315        kiocb = &vvp_env_info(env)->vti_kiocb;
1316        local_iov->iov_base = (void __user *)buf;
1317        local_iov->iov_len = count;
1318        init_sync_kiocb(kiocb, file);
1319        kiocb->ki_pos = *ppos;
/* Kernel API difference: the remaining-bytes field was renamed from
 * ki_left to ki_nbytes; the configure check picks the right one. */
1320 #ifdef HAVE_KIOCB_KI_LEFT
1321        kiocb->ki_left = count;
1323        kiocb->ki_nbytes = count;
1326        result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
/* Propagate the advanced file position back to the caller. */
1327        *ppos = kiocb->ki_pos;
1329        cl_env_put(env, &refcheck);
1334  * Write to a file (through the page cache).
/*
 * AIO write entry point (.aio_write): mirror image of ll_file_aio_read(),
 * running the generic client IO path with CIT_WRITE.
 */
1337 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1338                                 unsigned long nr_segs, loff_t pos)
1341        struct vvp_io_args *args;
1347        result = ll_file_get_iov_count(iov, &nr_segs, &count);
1351        env = cl_env_get(&refcheck);
1353                RETURN(PTR_ERR(env));
1355        args = vvp_env_args(env, IO_NORMAL);
/* Cast away const: the lower layers take a mutable iovec pointer. */
1356        args->u.normal.via_iov = (struct iovec *)iov;
1357        args->u.normal.via_nrsegs = nr_segs;
1358        args->u.normal.via_iocb = iocb;
1360        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1361                                    &iocb->ki_pos, count);
1362        cl_env_put(env, &refcheck);
/*
 * Synchronous write entry point (.write): build a one-segment iovec and a
 * sync kiocb in the per-thread env, delegate to ll_file_aio_write(), then
 * write the advanced position back through *ppos.
 */
1366 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1367                             size_t count, loff_t *ppos)
1370        struct iovec *local_iov;
1371        struct kiocb *kiocb;
1376        env = cl_env_get(&refcheck);
1378                RETURN(PTR_ERR(env));
1380        local_iov = &vvp_env_info(env)->vti_local_iov;
1381        kiocb = &vvp_env_info(env)->vti_kiocb;
/* Dropping const via void* cast; the iovec is only read below. */
1382        local_iov->iov_base = (void __user *)buf;
1383        local_iov->iov_len = count;
1384        init_sync_kiocb(kiocb, file);
1385        kiocb->ki_pos = *ppos;
/* Kernel API difference: ki_left vs. ki_nbytes (see ll_file_read()). */
1386 #ifdef HAVE_KIOCB_KI_LEFT
1387        kiocb->ki_left = count;
1389        kiocb->ki_nbytes = count;
1392        result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1393        *ppos = kiocb->ki_pos;
1395        cl_env_put(env, &refcheck);
1400  * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: pack the target pipe and flags into IO_SPLICE
 * vvp_io_args and run the generic client IO path as a CIT_READ.
 */
1402 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1403                                   struct pipe_inode_info *pipe, size_t count,
1407        struct vvp_io_args *args;
1412        env = cl_env_get(&refcheck);
1414                RETURN(PTR_ERR(env));
1416        args = vvp_env_args(env, IO_SPLICE);
1417        args->u.splice.via_pipe = pipe;
1418        args->u.splice.via_flags = flags;
1420        result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1421        cl_env_put(env, &refcheck);
/*
 * Set the LOV striping EA on a file by re-opening it with an IT_OPEN
 * intent carrying the user-supplied lov_user_md.  Fails with -EEXIST if
 * the inode already has a layout (striping can only be set once).
 */
1425 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1426                             __u64 flags, struct lov_user_md *lum,
1429        struct lov_stripe_md *lsm = NULL;
1430        struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1434        lsm = ccc_inode_lsm_get(inode);
/* A layout already exists -- striping is immutable once created. */
1436                ccc_inode_lsm_put(inode, lsm);
1437                CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1438                       PFID(ll_inode2fid(inode)));
1439                GOTO(out, rc = -EEXIST);
/* Size lock serializes the layout-creating open against size changes. */
1442        ll_inode_size_lock(inode);
1443        oit.it_flags |= MDS_OPEN_BY_FID;
1444        rc = ll_intent_file_open(file, lum, lum_size, &oit);
1446                GOTO(out_unlock, rc);
1447        rc = oit.d.lustre.it_status;
1449                GOTO(out_req_free, rc);
/* The open was only needed to install the EA; close the handle now. */
1451        ll_release_openhandle(file->f_dentry, &oit);
1454        ll_inode_size_unlock(inode);
1455        ll_intent_release(&oit);
1456        ccc_inode_lsm_put(inode, lsm);
1458        cl_lov_delay_create_clear(&file->f_flags);
1461        ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA for @filename (child of @inode) from the MDS via
 * md_getattr_name(), byte-swap it to host endianness for userspace, and
 * return pointers into the (still-held) reply buffer through *lmmp.
 * The caller owns *request and must eventually release it.
 */
1465 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1466                             struct lov_mds_md **lmmp, int *lmm_size,
1467                             struct ptlrpc_request **request)
1469        struct ll_sb_info *sbi = ll_i2sbi(inode);
1470        struct mdt_body  *body;
1471        struct lov_mds_md *lmm = NULL;
1472        struct ptlrpc_request *req = NULL;
1473        struct md_op_data *op_data;
1476        rc = ll_get_default_mdsize(sbi, &lmmsize);
1480        op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1481                                     strlen(filename), lmmsize,
1482                                     LUSTRE_OPC_ANY, NULL);
1483        if (IS_ERR(op_data))
1484                RETURN(PTR_ERR(op_data));
1486        op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1487        rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1488        ll_finish_md_op_data(op_data);
1490                CDEBUG(D_INFO, "md_getattr_name failed "
1491                       "on %s: rc %d\n", filename, rc);
1495        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1496        LASSERT(body != NULL); /* checked by mdc_getattr_name */
1498        lmmsize = body->mbo_eadatasize;
/* No EA present (or empty) means the file simply has no striping. */
1500        if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1502                GOTO(out, rc = -ENODATA);
1505        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1506        LASSERT(lmm != NULL);
/* Only plain V1/V3 layouts are understood here. */
1508        if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1509            (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1510                GOTO(out, rc = -EPROTO);
1514         * This is coming from the MDS, so is probably in
1515         * little endian. We convert it to host endian before
1516         * passing it to userspace.
/* Swab only on big-endian hosts, where LE wire format != host format. */
1518        if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1521                stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1522                if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1525                /* if function called for directory - we should
1526                 * avoid swabbing non-existent lsm objects */
1527                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1528                        lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1529                        if (S_ISREG(body->mbo_mode))
1530                                lustre_swab_lov_user_md_objects(
1531                                 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1533                } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1534                        lustre_swab_lov_user_md_v3(
1535                                (struct lov_user_md_v3 *)lmm);
1536                        if (S_ISREG(body->mbo_mode))
1537                                lustre_swab_lov_user_md_objects(
1538                                 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1545        *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: admin-only path that copies a raw
 * lov_user_md (with one ost_data entry) from userspace and applies it
 * with MDS_OPEN_HAS_OBJS, i.e. the objects are pre-created by the caller.
 */
1550 static int ll_lov_setea(struct inode *inode, struct file *file,
1553        __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1554        struct lov_user_md      *lump;
1555        int                      lum_size = sizeof(struct lov_user_md) +
1556                                            sizeof(struct lov_user_ost_data);
/* Specifying explicit objects is privileged. */
1560        if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1563        OBD_ALLOC_LARGE(lump, lum_size);
1567        if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1568                OBD_FREE_LARGE(lump, lum_size);
1572        rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1574        OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it, then on
 * success refresh the layout generation and copy the instantiated
 * striping back to userspace via LL_IOC_LOV_GETSTRIPE.
 */
1578 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1581        struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1582        struct lov_user_md        *klum;
1584        __u64                      flags = FMODE_WRITE;
1587        rc = ll_copy_user_md(lum, &klum);
1592        rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1594                struct lov_stripe_md *lsm;
/* Zero the user's stripe_count first so a partial copy-back below
 * cannot leave a stale count behind. */
1597                put_user(0, &lum->lmm_stripe_count);
1599                ll_layout_refresh(inode, &gen);
1600                lsm = ccc_inode_lsm_get(inode);
1601                rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1603                ccc_inode_lsm_put(inode, lsm);
1606        OBD_FREE(klum, lum_size);
/*
 * LL_IOC_LOV_GETSTRIPE handler: hand the inode's layout to the LOV
 * driver, which formats it into the userspace buffer at @arg.
 */
1610 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1612        struct lov_stripe_md *lsm;
1616        lsm = ccc_inode_lsm_get(inode);
1618                rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1619                                   lsm, (void __user *)arg);
1620        ccc_inode_lsm_put(inode, lsm);
/*
 * Take a group lock (gid = @arg) on the whole file.  Only one group lock
 * per file descriptor is allowed; the lli_lock spinlock guards the
 * fd_flags/fd_grouplock pair against racing lockers.
 */
1625 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1627        struct ll_inode_info   *lli = ll_i2info(inode);
1628        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1629        struct ccc_grouplock    grouplock;
1634                CWARN("group id for group lock must not be 0\n");
1638        if (ll_file_nolock(file))
1639                RETURN(-EOPNOTSUPP);
/* First check under the spinlock: this fd may already hold one. */
1641        spin_lock(&lli->lli_lock);
1642        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1643                CWARN("group lock already existed with gid %lu\n",
1644                      fd->fd_grouplock.cg_gid);
1645                spin_unlock(&lli->lli_lock);
1648        LASSERT(fd->fd_grouplock.cg_lock == NULL);
1649        spin_unlock(&lli->lli_lock);
/* The enqueue may block (unless O_NONBLOCK), so it runs unlocked... */
1651        rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1652                              arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* ...and we must recheck for a concurrent winner afterwards. */
1656        spin_lock(&lli->lli_lock);
1657        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1658                spin_unlock(&lli->lli_lock);
1659                CERROR("another thread just won the race\n");
1660                cl_put_grouplock(&grouplock);
1664        fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1665        fd->fd_grouplock = grouplock;
1666        spin_unlock(&lli->lli_lock);
1668        CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock (gid = @arg) held by this file descriptor.
 * The fd state is cleared under lli_lock; the actual DLM lock is dropped
 * outside the spinlock since cl_put_grouplock() may block.
 */
1672 static int ll_put_grouplock(struct inode *inode, struct file *file,
1675        struct ll_inode_info   *lli = ll_i2info(inode);
1676        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1677        struct ccc_grouplock    grouplock;
1680        spin_lock(&lli->lli_lock);
1681        if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1682                spin_unlock(&lli->lli_lock);
1683                CWARN("no group lock held\n");
1686        LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* The gid must match the one the lock was taken with. */
1688        if (fd->fd_grouplock.cg_gid != arg) {
1689                CWARN("group lock %lu doesn't match current id %lu\n",
1690                      arg, fd->fd_grouplock.cg_gid);
1691                spin_unlock(&lli->lli_lock);
/* Detach the lock from the fd first, then release it unlocked. */
1695        grouplock = fd->fd_grouplock;
1696        memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1697        fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1698        spin_unlock(&lli->lli_lock);
1700        cl_put_grouplock(&grouplock);
1701        CDEBUG(D_INFO, "group lock %lu released\n", arg);
1706  * Close inode open handle
1708  * \param dentry [in]     dentry which contains the inode
1709  * \param it     [in,out] intent which contains open info and result
1712  * \retval <0    failure
1714 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1716        struct inode *inode = dentry->d_inode;
1717        struct obd_client_handle *och;
1723        /* Root ? Do nothing. */
1724        if (dentry->d_inode->i_sb->s_root == dentry)
1727        /* No open handle to close? Move away */
1728        if (!it_disposition(it, DISP_OPEN_OPEN))
1731        LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
/* Build a temporary client handle from the intent and close it on the MDS. */
1733        OBD_ALLOC(och, sizeof(*och));
1735                GOTO(out, rc = -ENOMEM);
1737        ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1739        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1742        /* this one is in place of ll_file_open */
1743        if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1744                ptlrpc_req_finished(it->d.lustre.it_data);
1745                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1751  * Get size for inode for which FIEMAP mapping is requested.
1752  * Make the FIEMAP get_info call and returns the result.
/*
 * Core FIEMAP implementation: validate flags, optionally flush dirty
 * pages (FIEMAP_FLAG_SYNC), then ask the data export (OSC/LOV) to fill
 * in the extent mapping via obd_get_info(KEY_FIEMAP).
 */
1754 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1757        struct obd_export *exp = ll_i2dtexp(inode);
1758        struct lov_stripe_md *lsm = NULL;
1759        struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1760        __u32 vallen = num_bytes;
1764        /* Checks for fiemap flags */
/* Unknown flags: report back the set we do support and fail. */
1765        if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1766                fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1770        /* Check for FIEMAP_FLAG_SYNC */
1771        if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1772                rc = filemap_fdatawrite(inode->i_mapping);
1777        lsm = ccc_inode_lsm_get(inode);
1781        /* If the stripe_count > 1 and the application does not understand
1782         * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1784        if (lsm->lsm_stripe_count > 1 &&
1785            !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1786                GOTO(out, rc = -EOPNOTSUPP);
1788        fm_key.oa.o_oi = lsm->lsm_oi;
1789        fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1791        obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1792        obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1793        /* If filesize is 0, then there would be no objects for mapping */
1794        if (fm_key.oa.o_size == 0) {
1795                fiemap->fm_mapped_extents = 0;
/* The request carries the user's fiemap header inline in the key. */
1799        memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1801        rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1804                CERROR("obd_get_info failed: rc = %d\n", rc);
1807        ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.  The
 * output buffer is sized from the user-supplied gf_pathlen, capped at
 * PATH_MAX.  Requires CAP_DAC_READ_SEARCH unless the fs allows user
 * fid2path (LL_SBI_USER_FID2PATH).
 */
1811 int ll_fid2path(struct inode *inode, void __user *arg)
1813        struct obd_export       *exp = ll_i2mdexp(inode);
1814        const struct getinfo_fid2path __user *gfin = arg;
1816        struct getinfo_fid2path *gfout;
1822        if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1823            !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1826        /* Only need to get the buflen */
1827        if (get_user(pathlen, &gfin->gf_pathlen))
/* Cap the user-controlled allocation size. */
1830        if (pathlen > PATH_MAX)
1833        outsize = sizeof(*gfout) + pathlen;
1834        OBD_ALLOC(gfout, outsize);
1838        if (copy_from_user(gfout, arg, sizeof(*gfout)))
1839                GOTO(gf_free, rc = -EFAULT);
1841        /* Call mdc_iocontrol */
1842        rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1846        if (copy_to_user(arg, gfout, outsize))
1850        OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size a kernel fiemap buffer from the user's
 * fm_extent_count (overflow-checked), copy the request in, run
 * ll_do_fiemap(), and copy back the header plus mapped extents.
 */
1854 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1856        struct ll_user_fiemap *fiemap_s;
1857        size_t num_bytes, ret_bytes;
1858        unsigned int extent_count;
1861        /* Get the extent count so we can calculate the size of
1862         * required fiemap buffer */
1863        if (get_user(extent_count,
1864                     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* Guard the multiplication below against size_t overflow. */
1868            (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1870        num_bytes = sizeof(*fiemap_s) + (extent_count *
1871                                         sizeof(struct ll_fiemap_extent));
1873        OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1874        if (fiemap_s == NULL)
1877        /* get the fiemap value */
1878        if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1880                GOTO(error, rc = -EFAULT);
1882        /* If fm_extent_count is non-zero, read the first extent since
1883         * it is used to calculate end_offset and device from previous
1886        if (copy_from_user(&fiemap_s->fm_extents[0],
1887                           (char __user *)arg + sizeof(*fiemap_s),
1888                           sizeof(struct ll_fiemap_extent)))
1889                GOTO(error, rc = -EFAULT);
1892        rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy back the header, plus only the extents actually mapped. */
1896        ret_bytes = sizeof(struct ll_user_fiemap);
1898        if (extent_count != 0)
1899                ret_bytes += (fiemap_s->fm_mapped_extents *
1900                                 sizeof(struct ll_fiemap_extent));
1902        if (copy_to_user((void __user *)arg, fiemap_s, ret_bytes))
1906        OBD_FREE_LARGE(fiemap_s, num_bytes);
1911  * Read the data_version for inode.
1913  * This value is computed using stripe object version on OST.
1914  * Version is computed using server side locking.
1916  * @param sync if do sync on the OST side;
1918  *               LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1919  *               LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1921 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1923        struct lov_stripe_md    *lsm = NULL;
1924        struct ll_sb_info       *sbi = ll_i2sbi(inode);
1925        struct obdo             *obdo = NULL;
1929        /* If no stripe, we consider version is 0. */
1930        lsm = ccc_inode_lsm_get(inode);
1931        if (!lsm_has_objects(lsm)) {
1933                CDEBUG(D_INODE, "No object for inode\n");
1937        OBD_ALLOC_PTR(obdo);
1939                GOTO(out, rc = -ENOMEM);
/* Query the OSTs; the version comes back in obdo->o_data_version. */
1941        rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
1943                if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1946                        *data_version = obdo->o_data_version;
1952        ccc_inode_lsm_put(inode, lsm);
1957  * Trigger a HSM release request for the provided inode.
/*
 * HSM release: take a write lease on the file, grab the latest
 * data_version (flushing/dropping cached pages with LL_DV_WR_FLUSH),
 * merge size/time attributes, then close the handle with the
 * MDS_HSM_RELEASE close bit so the MDS can free the OST objects.
 */
1959 int ll_hsm_release(struct inode *inode)
1961        struct cl_env_nest nest;
1963        struct obd_client_handle *och = NULL;
1964        __u64 data_version = 0;
1968        CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1969               ll_get_fsname(inode->i_sb, NULL, 0),
1970               PFID(&ll_i2info(inode)->lli_fid));
/* The lease guarantees no one else modifies the file mid-release. */
1972        och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1974                GOTO(out, rc = PTR_ERR(och));
1976        /* Grab latest data_version and [am]time values */
1977        rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1981        env = cl_env_nested_get(&nest);
1983                GOTO(out, rc = PTR_ERR(env));
1985        ll_merge_lvb(env, inode);
1986        cl_env_nested_put(&nest, env);
1988        /* Release the file.
1989         * NB: lease lock handle is released in mdc_hsm_release_pack() because
1990         * we still need it to pack l_remote_handle to MDT. */
1991        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1997        if (och != NULL && !IS_ERR(och)) /* close the file */
1998                ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): the two inodes being swapped,
 * their saved [am]time attributes, and the per-file data-version check
 * flags -- kept together so they can be swapped pairwise when the
 * inodes are reordered by FID. */
2003 struct ll_swap_stack {
2004        struct iattr             ia1, ia2;
2006        struct inode            *inode1, *inode2;
2007        bool                     check_dv1, check_dv2;
/*
 * Swap the layouts of two regular files on the same filesystem
 * (LL_IOC_LOV_SWAP_LAYOUTS).  Both files must be writable.  Optionally
 * flushes caches via group locks (gid != 0), verifies data versions if
 * requested, performs the swap on the MDT, then restores mtime/atime per
 * the SWAP_LAYOUTS_KEEP_* flags.
 */
2010 static int ll_swap_layouts(struct file *file1, struct file *file2,
2011                           struct lustre_swap_layouts *lsl)
2013        struct mdc_swap_layouts  msl;
2014        struct md_op_data       *op_data;
2017        struct ll_swap_stack    *llss = NULL;
2020        OBD_ALLOC_PTR(llss);
2024        llss->inode1 = file1->f_dentry->d_inode;
2025        llss->inode2 = file2->f_dentry->d_inode;
2027        if (!S_ISREG(llss->inode2->i_mode))
2028                GOTO(free, rc = -EINVAL);
2030        if (inode_permission(llss->inode1, MAY_WRITE) ||
2031            inode_permission(llss->inode2, MAY_WRITE))
2032                GOTO(free, rc = -EPERM);
2034        if (llss->inode2->i_sb != llss->inode1->i_sb)
2035                GOTO(free, rc = -EXDEV);
2037        /* we use 2 bool because it is easier to swap than 2 bits */
2038        if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2039                llss->check_dv1 = true;
2041        if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2042                llss->check_dv2 = true;
2044        /* we cannot use lsl->sl_dvX directly because we may swap them */
2045        llss->dv1 = lsl->sl_dv1;
2046        llss->dv2 = lsl->sl_dv2;
/* Order the pair by FID so concurrent swaps lock in a fixed order
 * (deadlock avoidance); swap the per-file state alongside. */
2048        rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2049        if (rc == 0) /* same file, done! */
2052        if (rc < 0) { /* sequentialize it */
2053                swap(llss->inode1, llss->inode2);
2055                swap(llss->dv1, llss->dv2);
2056                swap(llss->check_dv1, llss->check_dv2);
2060        if (gid != 0) { /* application asks to flush dirty cache */
2061                rc = ll_get_grouplock(llss->inode1, file1, gid);
2065                rc = ll_get_grouplock(llss->inode2, file2, gid);
2067                        ll_put_grouplock(llss->inode1, file1, gid);
2072        /* to be able to restore mtime and atime after swap
2073         * we need to first save them */
2075            (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2076                llss->ia1.ia_mtime = llss->inode1->i_mtime;
2077                llss->ia1.ia_atime = llss->inode1->i_atime;
2078                llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2079                llss->ia2.ia_mtime = llss->inode2->i_mtime;
2080                llss->ia2.ia_atime = llss->inode2->i_atime;
2081                llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2084        /* ultimate check, before swapping the layouts we check if
2085         * dataversion has changed (if requested) */
2086        if (llss->check_dv1) {
2087                rc = ll_data_version(llss->inode1, &dv, 0);
2090                if (dv != llss->dv1)
2091                        GOTO(putgl, rc = -EAGAIN);
2094        if (llss->check_dv2) {
2095                rc = ll_data_version(llss->inode2, &dv, 0);
2098                if (dv != llss->dv2)
2099                        GOTO(putgl, rc = -EAGAIN);
2102        /* struct md_op_data is used to send the swap args to the mdt
2103         * only flags is missing, so we use struct mdc_swap_layouts
2104         * through the md_op_data->op_data */
2105        /* flags from user space have to be converted before they are send to
2106         * server, no flag is sent today, they are only used on the client */
2109        op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2110                                     0, LUSTRE_OPC_ANY, &msl);
2111        if (IS_ERR(op_data))
2112                GOTO(free, rc = PTR_ERR(op_data));
2114        rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2115                           sizeof(*op_data), op_data, NULL);
2116        ll_finish_md_op_data(op_data);
2120                ll_put_grouplock(llss->inode2, file2, gid);
2121                ll_put_grouplock(llss->inode1, file1, gid);
2124        /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2128        /* clear useless flags */
2129        if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2130                llss->ia1.ia_valid &= ~ATTR_MTIME;
2131                llss->ia2.ia_valid &= ~ATTR_MTIME;
2134        if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2135                llss->ia1.ia_valid &= ~ATTR_ATIME;
2136                llss->ia2.ia_valid &= ~ATTR_ATIME;
2139        /* update time if requested */
/* Note ia1/ia2 are applied crosswise: each file gets the times saved
 * from the other, matching the swapped contents. */
2141        if (llss->ia2.ia_valid != 0) {
2142                mutex_lock(&llss->inode1->i_mutex);
2143                rc = ll_setattr(file1->f_dentry, &llss->ia2);
2144                mutex_unlock(&llss->inode1->i_mutex);
2147        if (llss->ia1.ia_valid != 0) {
2150                mutex_lock(&llss->inode2->i_mutex);
2151                rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2152                mutex_unlock(&llss->inode2->i_mutex);
/*
 * Apply an HSM state change (set/clear flag masks) on the MDT.
 * Unprivileged users may only touch flags within HSM_USER_MASK.
 */
2164 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2166        struct md_op_data       *op_data;
2169        /* Non-root users are forbidden to set or clear flags which are
2170         * NOT defined in HSM_USER_MASK. */
2171        if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2172            !cfs_capable(CFS_CAP_SYS_ADMIN))
/* The hss is carried to the MDT through md_op_data->op_data. */
2175        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2176                                     LUSTRE_OPC_ANY, hss);
2177        if (IS_ERR(op_data))
2178                RETURN(PTR_ERR(op_data));
2180        rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2181                           sizeof(*op_data), op_data, NULL);
2183        ll_finish_md_op_data(op_data);
/*
 * HSM import: register an already-archived file.  Marks it
 * ARCHIVED|EXISTS|RELEASED on the MDT, then forces the metadata
 * (mode/uid/gid/size/times) supplied by the copytool onto the inode via
 * ll_setattr_raw() so the client view matches the archive.
 */
2188 static int ll_hsm_import(struct inode *inode, struct file *file,
2189                         struct hsm_user_import *hui)
2191        struct hsm_state_set    *hss = NULL;
2192        struct iattr            *attr = NULL;
2196        if (!S_ISREG(inode->i_mode))
2202                GOTO(out, rc = -ENOMEM);
/* Step 1: flag the file as released-in-archive on the MDT. */
2204        hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2205        hss->hss_archive_id = hui->hui_archive_id;
2206        hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2207        rc = ll_hsm_state_set(inode, hss);
2211        OBD_ALLOC_PTR(attr);
2213                GOTO(out, rc = -ENOMEM);
/* Step 2: stamp the archived attributes onto the (regular) file. */
2215        attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2216        attr->ia_mode |= S_IFREG;
2217        attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2218        attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2219        attr->ia_size = hui->hui_size;
2220        attr->ia_mtime.tv_sec = hui->hui_mtime;
2221        attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2222        attr->ia_atime.tv_sec = hui->hui_atime;
2223        attr->ia_atime.tv_nsec = hui->hui_atime_ns;
/* ATTR_FORCE bypasses the usual permission checks for this setattr. */
2225        attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2226                         ATTR_UID | ATTR_GID |
2227                         ATTR_MTIME | ATTR_MTIME_SET |
2228                         ATTR_ATIME | ATTR_ATIME_SET;
2230        mutex_lock(&inode->i_mutex);
2232        rc = ll_setattr_raw(file->f_dentry, attr, true);
2236        mutex_unlock(&inode->i_mutex);
2248 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2250 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2251 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main file ioctl dispatcher for the Lustre client.  Handles local fd
 * flags, striping get/set, layout swap, fiemap, group locks, HSM
 * operations and leases; anything unrecognized falls through to the
 * dynamic ioctl handlers and finally to the data export.
 */
2255 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2257        struct inode            *inode = file->f_dentry->d_inode;
2258        struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
2262        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2263               PFID(ll_inode2fid(inode)), inode, cmd);
2264        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2266        /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2267        if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2271        case LL_IOC_GETFLAGS:
2272                /* Get the current value of the file flags */
2273                return put_user(fd->fd_flags, (int __user *)arg);
2274        case LL_IOC_SETFLAGS:
2275        case LL_IOC_CLRFLAGS:
2276                /* Set or clear specific file flags */
2277                /* XXX This probably needs checks to ensure the flags are
2278                 *     not abused, and to handle any flag side effects.
2280                if (get_user(flags, (int __user *) arg))
2283                if (cmd == LL_IOC_SETFLAGS) {
2284                        if ((flags & LL_FILE_IGNORE_LOCK) &&
2285                            !(file->f_flags & O_DIRECT)) {
2286                                CERROR("%s: unable to disable locking on "
2287                                       "non-O_DIRECT file\n", current->comm);
2291                        fd->fd_flags |= flags;
2293                        fd->fd_flags &= ~flags;
2296        case LL_IOC_LOV_SETSTRIPE:
2297                RETURN(ll_lov_setstripe(inode, file, arg));
2298        case LL_IOC_LOV_SETEA:
2299                RETURN(ll_lov_setea(inode, file, arg));
2300        case LL_IOC_LOV_SWAP_LAYOUTS: {
2302                struct lustre_swap_layouts lsl;
2304                if (copy_from_user(&lsl, (char __user *)arg,
2305                                   sizeof(struct lustre_swap_layouts)))
/* Layout swap needs write access on both fds. */
2308                if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2311                file2 = fget(lsl.sl_fd);
2316                if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2317                        rc = ll_swap_layouts(file, file2, &lsl);
2321        case LL_IOC_LOV_GETSTRIPE:
2322                RETURN(ll_lov_getstripe(inode, arg));
2323        case FSFILT_IOC_FIEMAP:
2324                RETURN(ll_ioctl_fiemap(inode, arg));
2325        case FSFILT_IOC_GETFLAGS:
2326        case FSFILT_IOC_SETFLAGS:
2327                RETURN(ll_iocontrol(inode, file, cmd, arg));
2328        case FSFILT_IOC_GETVERSION_OLD:
2329        case FSFILT_IOC_GETVERSION:
2330                RETURN(put_user(inode->i_generation, (int __user *)arg));
2331        case LL_IOC_GROUP_LOCK:
2332                RETURN(ll_get_grouplock(inode, file, arg));
2333        case LL_IOC_GROUP_UNLOCK:
2334                RETURN(ll_put_grouplock(inode, file, arg));
2335        case IOC_OBD_STATFS:
2336                RETURN(ll_obd_statfs(inode, (void __user *)arg));
2338        /* We need to special case any other ioctls we want to handle,
2339         * to send them to the MDS/OST as appropriate and to properly
2340         * network encode the arg field.
2341        case FSFILT_IOC_SETVERSION_OLD:
2342        case FSFILT_IOC_SETVERSION:
2344        case LL_IOC_FLUSHCTX:
2345                RETURN(ll_flush_ctx(inode));
2346        case LL_IOC_PATH2FID: {
2347                if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2348                                 sizeof(struct lu_fid)))
2353        case LL_IOC_GETPARENT:
2354                RETURN(ll_getparent(file, (void __user *)arg));
2356        case OBD_IOC_FID2PATH:
2357                RETURN(ll_fid2path(inode, (void __user *)arg));
2358        case LL_IOC_DATA_VERSION: {
2359                struct ioc_data_version idv;
2362                if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the two flush flags are meaningful here; drop the rest. */
2365                idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2366                rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2369                    copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2375        case LL_IOC_GET_MDTIDX: {
2378                mdtidx = ll_get_mdt_idx(inode);
2382                if (put_user((int)mdtidx, (int __user *)arg))
2387        case OBD_IOC_GETDTNAME:
2388        case OBD_IOC_GETMDNAME:
2389                RETURN(ll_get_obd_name(inode, cmd, arg));
2390        case LL_IOC_HSM_STATE_GET: {
2391                struct md_op_data       *op_data;
2392                struct hsm_user_state   *hus;
2399                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2400                                             LUSTRE_OPC_ANY, hus);
2401                if (IS_ERR(op_data)) {
2403                        RETURN(PTR_ERR(op_data));
2406                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2409                if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2412                ll_finish_md_op_data(op_data);
2416        case LL_IOC_HSM_STATE_SET: {
2417                struct hsm_state_set    *hss;
2424                if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2429                rc = ll_hsm_state_set(inode, hss);
2434        case LL_IOC_HSM_ACTION: {
2435                struct md_op_data               *op_data;
2436                struct hsm_current_action       *hca;
2443                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2444                                             LUSTRE_OPC_ANY, hca);
2445                if (IS_ERR(op_data)) {
2447                        RETURN(PTR_ERR(op_data));
2450                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2453                if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2456                ll_finish_md_op_data(op_data);
2460        case LL_IOC_SET_LEASE: {
2461                struct ll_inode_info *lli = ll_i2info(inode);
2462                struct obd_client_handle *och = NULL;
/* The requested lease mode must be compatible with the fd's open mode. */
2467                case LL_LEASE_WRLCK:
2468                        if (!(file->f_mode & FMODE_WRITE))
2470                        fmode = FMODE_WRITE;
2472                case LL_LEASE_RDLCK:
2473                        if (!(file->f_mode & FMODE_READ))
2477                case LL_LEASE_UNLCK:
2478                        mutex_lock(&lli->lli_och_mutex);
2479                        if (fd->fd_lease_och != NULL) {
2480                                och = fd->fd_lease_och;
2481                                fd->fd_lease_och = NULL;
2483                        mutex_unlock(&lli->lli_och_mutex);
2488                        fmode = och->och_flags;
2489                        rc = ll_lease_close(och, inode, &lease_broken);
2496                        RETURN(ll_lease_type_from_fmode(fmode));
2501                CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2503                /* apply for lease */
2504                och = ll_lease_open(inode, file, fmode, 0);
2506                        RETURN(PTR_ERR(och));
/* Install the new lease handle unless another one raced in. */
2509                mutex_lock(&lli->lli_och_mutex);
2510                if (fd->fd_lease_och == NULL) {
2511                        fd->fd_lease_och = och;
2514                mutex_unlock(&lli->lli_och_mutex);
2516                /* impossible now that only excl is supported for now */
2517                        ll_lease_close(och, inode, &lease_broken);
2522        case LL_IOC_GET_LEASE: {
2523                struct ll_inode_info *lli = ll_i2info(inode);
2524                struct ldlm_lock *lock = NULL;
2527                mutex_lock(&lli->lli_och_mutex);
2528                if (fd->fd_lease_och != NULL) {
2529                        struct obd_client_handle *och = fd->fd_lease_och;
2531                        lock = ldlm_handle2lock(&och->och_lease_handle);
2533                                lock_res_and_lock(lock);
/* Report the lease only if its DLM lock hasn't been cancelled. */
2534                                if (!ldlm_is_cancel(lock))
2535                                        fmode = och->och_flags;
2537                                unlock_res_and_lock(lock);
2538                                LDLM_LOCK_PUT(lock);
2541                mutex_unlock(&lli->lli_och_mutex);
2543                RETURN(ll_lease_type_from_fmode(fmode));
2545        case LL_IOC_HSM_IMPORT: {
2546                struct hsm_user_import *hui;
2552                if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2557                rc = ll_hsm_import(inode, file, hui);
/* Not handled above: try registered dynamic handlers, then the OSTs. */
2567                        ll_iocontrol_call(inode, file, cmd, arg, &err))
2570                RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2571                                     (void __user *)arg));
/* Compat copies of generic_file_llseek_size() and its helper for kernels
 * that do not provide it. */
2576 #ifndef HAVE_FILE_LLSEEK_SIZE
2577 static inline loff_t
2578 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
/* Reject negative offsets (unless the fd allows unsigned offsets) and
 * offsets beyond maxsize; otherwise commit the new position. */
2580        if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2582        if (offset > maxsize)
2585        if (offset != file->f_pos) {
2586                file->f_pos = offset;
/* Position changed: invalidate the cached version (used by readdir). */
2587                file->f_version = 0;
/*
 * Compat implementation of generic_file_llseek_size(): llseek against a
 * caller-provided maximum size and EOF (needed because Lustre's EOF is
 * only known after a glimpse, see ll_file_seek()).
 */
2593 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2594                loff_t maxsize, loff_t eof)
2596        struct inode *inode = file->f_dentry->d_inode;
2604                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2605                 * position-querying operation.  Avoid rewriting the "same"
2606                 * f_pos value back to the file because a concurrent read(),
2607                 * write() or  lseek() might have altered it
2612                 * f_lock protects against read/modify/write race with other
2613                 * SEEK_CURs. Note that parallel writes and reads behave
/* SEEK_CUR update is done under i_mutex to serialize with other seeks. */
2616                mutex_lock(&inode->i_mutex);
2617                offset = llseek_execute(file, file->f_pos + offset, maxsize);
2618                mutex_unlock(&inode->i_mutex);
2622                 * In the generic case the entire file is data, so as long as
2623                 * offset isn't at the end of the file then the offset is data.
2630                 * There is a virtual hole at the end of the file, so as long as
2631                 * offset isn't i_size or larger, return i_size.
2639        return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for origins that depend on file size (SEEK_END,
 * SEEK_HOLE, SEEK_DATA) glimpse the OSTs first so i_size is current,
 * then defer to the generic llseek-with-size helper.
 */
2643 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2645        struct inode *inode = file->f_dentry->d_inode;
2646        loff_t retval, eof = 0;
/* Compute the absolute target purely for the trace message below. */
2649        retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2650                           (origin == SEEK_CUR) ? file->f_pos : 0);
2651        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2652               PFID(ll_inode2fid(inode)), inode, retval, retval,
2654        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2656        if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2657                retval = ll_glimpse_size(inode);
2660                eof = i_size_read(inode);
2663        retval = ll_generic_file_llseek_size(file, offset, origin,
2664                                          ll_file_maxbytes(inode), eof);
/*
 * .flush handler (called on close(2)): surface any async writeback
 * errors recorded against this inode as -EIO, unless the application
 * was already told about the write failure on this fd.
 */
2668 static int ll_flush(struct file *file, fl_owner_t id)
2670        struct inode *inode = file->f_dentry->d_inode;
2671        struct ll_inode_info *lli = ll_i2info(inode);
2672        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2675        LASSERT(!S_ISDIR(inode->i_mode));
2677        /* catch async errors that were recorded back when async writeback
2678         * failed for pages in this mapping. */
/* Read-and-clear: the error is reported once. */
2679        rc = lli->lli_async_rc;
2680        lli->lli_async_rc = 0;
2681        if (lli->lli_clob != NULL) {
2682                err = lov_read_and_clear_async_rc(lli->lli_clob);
2687        /* The application has been told write failure already.
2688         * Do not report failure again. */
2689        if (fd->fd_write_failed)
2691        return rc ? -EIO : 0;
2695 * Called to make sure a portion of file has been written out.
2696 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2698 * Return how many pages have been written.
/*
 * Builds a CIT_FSYNC cl_io for [start, end] on the inode's cl_object and
 * runs it through cl_io_loop(); on success returns fio->fi_nr_written.
 * Rejects any mode outside the CL_FSYNC_* set.  An OSS write capability
 * is attached to the fsync io (fi_capa).
 * NOTE(review): environment/capa cleanup lines are missing from this
 * excerpt.
 */
2700 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2701 enum cl_fsync_mode mode, int ignore_layout)
2703 struct cl_env_nest nest;
2706 struct obd_capa *capa = NULL;
2707 struct cl_fsync_io *fio;
2711 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2712 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2715 env = cl_env_nested_get(&nest);
2717 RETURN(PTR_ERR(env));
2719 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2721 io = ccc_env_thread_io(env);
2722 io->ci_obj = cl_i2info(inode)->lli_clob;
2723 io->ci_ignore_layout = ignore_layout;
2725 /* initialize parameters for sync */
2726 fio = &io->u.ci_fsync;
2727 fio->fi_capa = capa;
2728 fio->fi_start = start;
2730 fio->fi_fid = ll_inode2fid(inode);
2731 fio->fi_mode = mode;
2732 fio->fi_nr_written = 0;
2734 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2735 result = cl_io_loop(env, io);
2737 result = io->ci_result;
2739 result = fio->fi_nr_written;
2740 cl_io_fini(env, io);
2741 cl_env_nested_put(&nest, env);
2749 * When dentry is provided (the 'else' case), *file->f_dentry may be
2750 * null and dentry must be used directly rather than pulled from
2751 * *file->f_dentry as is done otherwise.
/*
 * fsync handler; the signature depends on the kernel version
 * (HAVE_FILE_FSYNC_4ARGS / _2ARGS / dentry variant).  Waits for dirty
 * pages, collects recorded async-writeback errors, syncs metadata via
 * md_fsync(), and for regular files syncs data with
 * cl_sync_file_range(CL_FSYNC_ALL), updating fd_write_failed to match.
 */
2754 #ifdef HAVE_FILE_FSYNC_4ARGS
2755 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2757 struct dentry *dentry = file->f_dentry;
2758 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2759 int ll_fsync(struct file *file, int datasync)
2761 struct dentry *dentry = file->f_dentry;
2763 loff_t end = LLONG_MAX;
2765 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2768 loff_t end = LLONG_MAX;
2770 struct inode *inode = dentry->d_inode;
2771 struct ll_inode_info *lli = ll_i2info(inode);
2772 struct ptlrpc_request *req;
2773 struct obd_capa *oc;
2777 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2778 PFID(ll_inode2fid(inode)), inode);
2779 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2781 #ifdef HAVE_FILE_FSYNC_4ARGS
2782 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
/* 4-args kernels no longer hold i_mutex for us; take it here */
2783 mutex_lock(&inode->i_mutex);
2785 /* fsync's caller has already called _fdata{sync,write}, we want
2786 * that IO to finish before calling the osc and mdc sync methods */
2787 rc = filemap_fdatawait(inode->i_mapping);
2790 /* catch async errors that were recorded back when async writeback
2791 * failed for pages in this mapping. */
2792 if (!S_ISDIR(inode->i_mode)) {
2793 err = lli->lli_async_rc;
2794 lli->lli_async_rc = 0;
2797 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDT, using the inode's MDS capability */
2802 oc = ll_mdscapa_get(inode);
2803 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2809 ptlrpc_req_finished(req);
2811 if (S_ISREG(inode->i_mode)) {
2812 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2814 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2815 if (rc == 0 && err < 0)
2818 fd->fd_write_failed = true;
2820 fd->fd_write_failed = false;
2823 #ifdef HAVE_FILE_FSYNC_4ARGS
2824 mutex_unlock(&inode->i_mutex);
/*
 * flock/posix-lock handler shared by .flock and .lock in
 * ll_file_operations_flock.  Translates a struct file_lock into an
 * LDLM_FLOCK enqueue on the MDT (cluster-wide locking), then mirrors the
 * result into the local lock tables via flock_lock_file_wait() /
 * posix_lock_file_wait().  If the local step fails, the remote lock is
 * released again with an LCK_NL enqueue.
 */
2830 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2832 struct inode *inode = file->f_dentry->d_inode;
2833 struct ll_sb_info *sbi = ll_i2sbi(inode);
2834 struct ldlm_enqueue_info einfo = {
2835 .ei_type = LDLM_FLOCK,
2836 .ei_cb_cp = ldlm_flock_completion_ast,
2837 .ei_cbdata = file_lock,
2839 struct md_op_data *op_data;
2840 struct lustre_handle lockh = {0};
2841 ldlm_policy_data_t flock = {{0}};
2842 int fl_type = file_lock->fl_type;
2848 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2849 PFID(ll_inode2fid(inode)), file_lock);
2851 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2853 if (file_lock->fl_flags & FL_FLOCK) {
2854 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2855 /* flocks are whole-file locks */
2856 flock.l_flock.end = OFFSET_MAX;
2857 /* For flocks owner is determined by the local file descriptor*/
2858 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2859 } else if (file_lock->fl_flags & FL_POSIX) {
2860 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2861 flock.l_flock.start = file_lock->fl_start;
2862 flock.l_flock.end = file_lock->fl_end;
2866 flock.l_flock.pid = file_lock->fl_pid;
2868 /* Somewhat ugly workaround for svc lockd.
2869 * lockd installs custom fl_lmops->lm_compare_owner that checks
2870 * for the fl_owner to be the same (which it always is on local node
2871 * I guess between lockd processes) and then compares pid.
2872 * As such we assign pid to the owner field to make it all work,
2873 * conflict with normal locks is unlikely since pid space and
2874 * pointer space for current->files are not intersecting */
2875 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2876 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock type -> LDLM mode: rdlock=PR, wrlock=PW, unlock=NL */
2880 einfo.ei_mode = LCK_PR;
2883 /* An unlock request may or may not have any relation to
2884 * existing locks so we may not be able to pass a lock handle
2885 * via a normal ldlm_lock_cancel() request. The request may even
2886 * unlock a byte range in the middle of an existing lock. In
2887 * order to process an unlock request we need all of the same
2888 * information that is given with a normal read or write record
2889 * lock request. To avoid creating another ldlm unlock (cancel)
2890 * message we'll treat a LCK_NL flock request as an unlock. */
2891 einfo.ei_mode = LCK_NL;
2894 einfo.ei_mode = LCK_PW;
2897 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* non-blocking set requests must not wait on a conflicting lock */
2912 flags = LDLM_FL_BLOCK_NOWAIT;
2918 flags = LDLM_FL_TEST_LOCK;
2921 CERROR("unknown fcntl lock command: %d\n", cmd);
2925 /* Save the old mode so that if the mode in the lock changes we
2926 * can decrement the appropriate reader or writer refcount. */
2927 file_lock->fl_type = einfo.ei_mode;
2929 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2930 LUSTRE_OPC_ANY, NULL);
2931 if (IS_ERR(op_data))
2932 RETURN(PTR_ERR(op_data));
2934 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2935 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2936 flock.l_flock.pid, flags, einfo.ei_mode,
2937 flock.l_flock.start, flock.l_flock.end);
2939 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2942 /* Restore the file lock type if not TEST lock. */
2943 if (!(flags & LDLM_FL_TEST_LOCK))
2944 file_lock->fl_type = fl_type;
2946 if ((file_lock->fl_flags & FL_FLOCK) &&
2947 (rc == 0 || file_lock->fl_type == F_UNLCK))
2948 rc2 = flock_lock_file_wait(file, file_lock);
2949 if ((file_lock->fl_flags & FL_POSIX) &&
2950 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2951 !(flags & LDLM_FL_TEST_LOCK))
2952 rc2 = posix_lock_file_wait(file, file_lock);
2954 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* local bookkeeping failed: drop the server-side lock again */
2955 einfo.ei_mode = LCK_NL;
2956 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2961 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name (length @namelen) under directory @parent by
 * sending a getattr-by-name RPC to the MDT; on success *fid is filled
 * from the reply's mdt_body.  Returns 0 or a negative errno
 * (-EFAULT if the reply body is missing).
 */
2966 int ll_get_fid_by_name(struct inode *parent, const char *name,
2967 int namelen, struct lu_fid *fid)
2969 struct md_op_data *op_data = NULL;
2970 struct mdt_body *body;
2971 struct ptlrpc_request *req;
2975 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2976 LUSTRE_OPC_ANY, NULL);
2977 if (IS_ERR(op_data))
2978 RETURN(PTR_ERR(op_data));
/* only the FID is needed from the getattr reply */
2980 op_data->op_valid = OBD_MD_FLID;
2981 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2982 ll_finish_md_op_data(op_data);
2986 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2988 GOTO(out_req, rc = -EFAULT);
2990 *fid = body->mbo_fid1;
2992 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx (DNE).
 * Resolves the child FID from the dcache (invalidating cached aliases)
 * or via ll_get_fid_by_name(), skips the migration if the object already
 * lives on the target MDT, and otherwise issues a rename RPC with
 * CLI_MIGRATE set.  The cached child inode's nlink is cleared on the way
 * out so stale attributes are not reused.
 */
2996 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
2997 const char *name, int namelen)
2999 struct dentry *dchild = NULL;
3000 struct inode *child_inode = NULL;
3001 struct md_op_data *op_data;
3002 struct ptlrpc_request *request = NULL;
3007 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3008 name, PFID(ll_inode2fid(parent)), mdtidx);
3010 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3011 0, LUSTRE_OPC_ANY, NULL);
3012 if (IS_ERR(op_data))
3013 RETURN(PTR_ERR(op_data));
3015 /* Get child FID first */
3016 qstr.hash = full_name_hash(name, namelen);
3019 dchild = d_lookup(file->f_dentry, &qstr);
3020 if (dchild != NULL && dchild->d_inode != NULL) {
3021 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3022 if (dchild->d_inode != NULL) {
3023 child_inode = igrab(dchild->d_inode);
/* drop cached aliases - the object is about to move */
3024 ll_invalidate_aliases(child_inode);
/* not in dcache: ask the MDT for the FID */
3028 rc = ll_get_fid_by_name(parent, name, namelen,
3034 if (!fid_is_sane(&op_data->op_fid3)) {
3035 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3036 ll_get_fsname(parent->i_sb, NULL, 0), name,
3037 PFID(&op_data->op_fid3));
3038 GOTO(out_free, rc = -EINVAL);
3041 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3046 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3047 PFID(&op_data->op_fid3), mdtidx);
3048 GOTO(out_free, rc = 0);
/* migration is implemented as a same-name rename with CLI_MIGRATE */
3051 op_data->op_mds = mdtidx;
3052 op_data->op_cli_flags = CLI_MIGRATE;
3053 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3054 namelen, name, namelen, &request);
3056 ll_update_times(request, parent);
3058 ptlrpc_req_finished(request);
3063 if (child_inode != NULL) {
3064 clear_nlink(child_inode);
3068 ll_finish_md_op_data(op_data);
/* .flock/.lock handler for "-o noflock" mounts; body not visible in this
 * excerpt (per the comment above ll_file_operations_noflock it returns
 * ENOSYS for all flock calls). */
3073 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3081 * test if some locks matching bits and l_req_mode are acquired
3082 * - bits can be in different locks
3083 * - if found clear the common lock bits in *bits
3084 * - the bits not found, are kept in *bits
3086 * \param bits [IN] searched lock bits [IN]
3087 * \param l_req_mode [IN] searched lock mode
3088 * \retval boolean, true iff all bits are found
/* Implementation: matches each requested ibit individually against the
 * client's lock namespace with LDLM_FL_TEST_LOCK (no references taken);
 * on a hit, all bits granted by that lock are cleared from *bits. */
3090 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3092 struct lustre_handle lockh;
3093 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": search all of CR|CW|PR|PW */
3094 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3095 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3104 fid = &ll_i2info(inode)->lli_fid;
3105 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3106 ldlm_lockname[mode]);
3108 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3109 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3110 policy.l_inodebits.bits = *bits & (1 << i);
3111 if (policy.l_inodebits.bits == 0)
3114 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3115 &policy, mode, &lockh)) {
3116 struct ldlm_lock *lock;
3118 lock = ldlm_handle2lock(&lockh);
3121 ~(lock->l_policy_data.l_inodebits.bits);
3122 LDLM_LOCK_PUT(lock);
3124 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match an existing granted MD inodebits lock carrying @bits.
 * Unlike ll_have_md_lock() this takes a reference on the matched lock
 * (no LDLM_FL_TEST_LOCK), returning its mode and handle in *lockh;
 * returns 0 if nothing matched.
 */
3131 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3132 struct lustre_handle *lockh, __u64 flags,
3135 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3140 fid = &ll_i2info(inode)->lli_fid;
3141 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3143 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3144 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of a revalidation RPC: translate -ENOENT on an
 * already-unlinked special file into success, log any other failure.
 */
3149 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3151 /* Already unlinked. Just update nlink and return success */
3152 if (rc == -ENOENT) {
3154 /* This path cannot be hit for regular files unless in
3155 * case of obscure races, so no need to validate
3157 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3159 } else if (rc != 0) {
/* expected races (-EACCES/-EIDRM) are logged quietly, others loudly */
3160 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3161 "%s: revalidate FID "DFID" error: rc = %d\n",
3162 ll_get_fsname(inode->i_sb, NULL, 0),
3163 PFID(ll_inode2fid(inode)), rc);
/*
 * Refresh the inode's MD attributes from the MDT unless a covering
 * ibits lock (@ibits) is already cached locally.  Two paths:
 *  - OBD_CONNECT_ATTRFID servers: intent getattr/lookup by FID, which
 *    also revalidates the dentry (may d_lustre_invalidate it if the
 *    object was unlinked);
 *  - otherwise: a plain md_getattr() if no matching MD lock is held,
 *    followed by ll_prep_inode() to apply the reply.
 */
3169 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3171 struct inode *inode = dentry->d_inode;
3172 struct ptlrpc_request *req = NULL;
3173 struct obd_export *exp;
3177 LASSERT(inode != NULL);
3179 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3180 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3182 exp = ll_i2mdexp(inode);
3184 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3185 * But under CMD case, it caused some lock issues, should be fixed
3186 * with new CMD ibits lock. See bug 12718 */
3187 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3188 struct lookup_intent oit = { .it_op = IT_GETATTR };
3189 struct md_op_data *op_data;
3191 if (ibits == MDS_INODELOCK_LOOKUP)
3192 oit.it_op = IT_LOOKUP;
3194 /* Call getattr by fid, so do not provide name at all. */
3195 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3196 dentry->d_inode, NULL, 0, 0,
3197 LUSTRE_OPC_ANY, NULL);
3198 if (IS_ERR(op_data))
3199 RETURN(PTR_ERR(op_data));
3201 rc = md_intent_lock(exp, op_data, &oit, &req,
3202 &ll_md_blocking_ast, 0);
3203 ll_finish_md_op_data(op_data);
3205 rc = ll_inode_revalidate_fini(inode, rc);
3209 rc = ll_revalidate_it_finish(req, &oit, dentry);
3211 ll_intent_release(&oit);
3215 /* Unlinked? Unhash dentry, so it is not picked up later by
3216 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3217 here to preserve get_cwd functionality on 2.6.
3219 if (!dentry->d_inode->i_nlink)
3220 d_lustre_invalidate(dentry, 0);
3222 ll_lookup_finish_locks(&oit, dentry);
3223 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3224 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3225 obd_valid valid = OBD_MD_FLGETATTR;
3226 struct md_op_data *op_data;
/* for regular files also fetch the striping EA */
3229 if (S_ISREG(inode->i_mode)) {
3230 rc = ll_get_default_mdsize(sbi, &ealen);
3233 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3236 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3237 0, ealen, LUSTRE_OPC_ANY,
3239 if (IS_ERR(op_data))
3240 RETURN(PTR_ERR(op_data));
3242 op_data->op_valid = valid;
3243 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3244 * capa for this inode. Because we only keep capas of dirs
3246 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3247 ll_finish_md_op_data(op_data);
3249 rc = ll_inode_revalidate_fini(inode, rc);
3253 rc = ll_prep_inode(&inode, req, NULL, NULL);
3256 ptlrpc_req_finished(req);
/*
 * For a striped directory (DNE), merge the per-stripe attributes from
 * all MDTs via md_merge_attr() and cache the aggregated size/nlink and
 * a/m/ctime in the ll_inode_info.
 */
3260 static int ll_merge_md_attr(struct inode *inode)
3262 struct cl_attr attr = { 0 };
3265 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3266 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3271 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3272 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3274 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3275 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3276 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MD attributes via __ll_inode_revalidate(),
 * then for non-regular files copy cached lvb times into the inode
 * (merging stripe attrs for striped dirs), and for regular files glimpse
 * the size from the OSTs - except while an HSM restore is running, when
 * the MDT-provided size is authoritative and a glimpse would block.
 */
3282 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3284 struct inode *inode = dentry->d_inode;
3288 rc = __ll_inode_revalidate(dentry, ibits);
3292 /* if object isn't regular file, don't validate size */
3293 if (!S_ISREG(inode->i_mode)) {
3294 if (S_ISDIR(inode->i_mode) &&
3295 ll_i2info(inode)->lli_lsm_md != NULL) {
3296 rc = ll_merge_md_attr(inode);
3301 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3302 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3303 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3305 /* In case of restore, the MDT has the right size and has
3306 * already send it back without granting the layout lock,
3307 * inode is up-to-date so glimpse is useless.
3308 * Also to glimpse we need the layout, in case of a running
3309 * restore the MDT holds the layout lock so the glimpse will
3310 * block up to the end of restore (getattr will block)
3312 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3313 rc = ll_glimpse_size(inode);
/*
 * getattr inode operation: revalidate UPDATE|LOOKUP ibits, then fill
 * *stat from the (now fresh) inode.  Striped directories report the
 * merged cluster-wide nlink/size cached by ll_merge_md_attr(); 32-bit
 * API clients get a FID-derived inode number.
 */
3318 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3320 struct inode *inode = de->d_inode;
3321 struct ll_sb_info *sbi = ll_i2sbi(inode);
3322 struct ll_inode_info *lli = ll_i2info(inode);
3325 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3326 MDS_INODELOCK_LOOKUP);
3327 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3332 stat->dev = inode->i_sb->s_dev;
3333 if (ll_need_32bit_api(sbi))
3334 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3336 stat->ino = inode->i_ino;
3337 stat->mode = inode->i_mode;
3338 stat->uid = inode->i_uid;
3339 stat->gid = inode->i_gid;
3340 stat->rdev = inode->i_rdev;
3341 stat->atime = inode->i_atime;
3342 stat->mtime = inode->i_mtime;
3343 stat->ctime = inode->i_ctime;
3344 stat->blksize = 1 << inode->i_blkbits;
3345 stat->blocks = inode->i_blocks;
3347 if (S_ISDIR(inode->i_mode) &&
3348 ll_i2info(inode)->lli_lsm_md != NULL) {
3349 stat->nlink = lli->lli_stripe_dir_nlink;
3350 stat->size = lli->lli_stripe_dir_size;
3352 stat->nlink = inode->i_nlink;
3353 stat->size = i_size_read(inode);
/*
 * fiemap inode operation: marshal the kernel's fiemap_extent_info into a
 * temporary ll_user_fiemap, run ll_do_fiemap() against the OSTs, and
 * copy the mapped extents back.  The temporary buffer is sized for
 * fi_extents_max extents and freed before return.
 */
3359 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3360 __u64 start, __u64 len)
3364 struct ll_user_fiemap *fiemap;
3365 unsigned int extent_count = fieinfo->fi_extents_max;
3367 num_bytes = sizeof(*fiemap) + (extent_count *
3368 sizeof(struct ll_fiemap_extent));
3369 OBD_ALLOC_LARGE(fiemap, num_bytes);
3374 fiemap->fm_flags = fieinfo->fi_flags;
3375 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3376 fiemap->fm_start = start;
3377 fiemap->fm_length = len;
/* seed the first extent from the caller (continuation support) */
3378 if (extent_count > 0)
3379 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3380 sizeof(struct ll_fiemap_extent));
3382 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3384 fieinfo->fi_flags = fiemap->fm_flags;
3385 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3386 if (extent_count > 0)
3387 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3388 fiemap->fm_mapped_extents *
3389 sizeof(struct ll_fiemap_extent));
3391 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL of @type for @inode.
 * lli_lock guards lli_posix_acl; the caller (VFS permission code)
 * releases the reference.
 */
3395 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3397 struct ll_inode_info *lli = ll_i2info(inode);
3398 struct posix_acl *acl = NULL;
3401 spin_lock(&lli->lli_lock);
3402 /* VFS' acl_permission_check->check_acl will release the refcount */
3403 acl = posix_acl_dup(lli->lli_posix_acl);
3404 spin_unlock(&lli->lli_lock);
/*
 * ACL check callback for older kernels lacking the 2-arg
 * generic_permission(); checks @mask against the cached ACL via
 * posix_acl_permission().  Compiled away (or stubbed) when
 * CONFIG_FS_POSIX_ACL is off; bails out in RCU-walk mode on
 * 4-arg kernels.
 */
3409 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3411 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3412 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3414 ll_check_acl(struct inode *inode, int mask)
3417 # ifdef CONFIG_FS_POSIX_ACL
3418 struct posix_acl *acl;
3422 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3423 if (flags & IPERM_FLAG_RCU)
3426 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3431 rc = posix_acl_permission(inode, acl, mask);
3432 posix_acl_release(acl);
3435 # else /* !CONFIG_FS_POSIX_ACL */
3437 # endif /* CONFIG_FS_POSIX_ACL */
3439 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission inode operation (signature varies by kernel version).
 * Revalidates the root inode on first access, applies root-squash by
 * temporarily overriding the task credentials (fsuid/fsgid and FS
 * capabilities), then delegates to lustre_check_remote_perm() for
 * remote-client mounts or to generic permission checking otherwise.
 */
3441 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3442 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3444 # ifdef HAVE_INODE_PERMISION_2ARGS
3445 int ll_inode_permission(struct inode *inode, int mask)
3447 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3452 struct ll_sb_info *sbi;
3453 struct root_squash_info *squash;
3454 struct cred *cred = NULL;
3455 const struct cred *old_cred = NULL;
3457 bool squash_id = false;
/* cannot block in RCU-walk mode; tell the VFS to retry in ref-walk */
3460 #ifdef MAY_NOT_BLOCK
3461 if (mask & MAY_NOT_BLOCK)
3463 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3464 if (flags & IPERM_FLAG_RCU)
3468 /* as root inode are NOT getting validated in lookup operation,
3469 * need to do it before permission check. */
3471 if (inode == inode->i_sb->s_root->d_inode) {
3472 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3473 MDS_INODELOCK_LOOKUP);
3478 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3479 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3481 /* squash fsuid/fsgid if needed */
3482 sbi = ll_i2sbi(inode);
3483 squash = &sbi->ll_squash;
3484 if (unlikely(squash->rsi_uid != 0 &&
3485 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3486 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3490 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3491 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3492 squash->rsi_uid, squash->rsi_gid);
3494 /* update current process's credentials
3495 * and FS capability */
3496 cred = prepare_creds();
3500 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3501 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every FS-related capability from the squashed creds */
3502 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3503 if ((1 << cap) & CFS_CAP_FS_MASK)
3504 cap_lower(cred->cap_effective, cap);
3506 old_cred = override_creds(cred);
3509 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3511 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3512 rc = lustre_check_remote_perm(inode, mask);
3514 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3516 /* restore current process's credentials and FS capability */
3518 revert_creds(old_cred);
/* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock entries, so the VFS
 * falls back to node-local POSIX/flock semantics. */
3526 struct file_operations ll_file_operations = {
3527 .read = ll_file_read,
3528 .aio_read = ll_file_aio_read,
3529 .write = ll_file_write,
3530 .aio_write = ll_file_aio_write,
3531 .unlocked_ioctl = ll_file_ioctl,
3532 .open = ll_file_open,
3533 .release = ll_file_release,
3534 .mmap = ll_file_mmap,
3535 .llseek = ll_file_seek,
3536 .splice_read = ll_file_splice_read,
/* "-o flock" table: identical to ll_file_operations but routes flock(2)
 * and fcntl(2) record locks through ll_file_flock() for cluster-wide
 * coherent locking via the MDT. */
3541 struct file_operations ll_file_operations_flock = {
3542 .read = ll_file_read,
3543 .aio_read = ll_file_aio_read,
3544 .write = ll_file_write,
3545 .aio_write = ll_file_aio_write,
3546 .unlocked_ioctl = ll_file_ioctl,
3547 .open = ll_file_open,
3548 .release = ll_file_release,
3549 .mmap = ll_file_mmap,
3550 .llseek = ll_file_seek,
3551 .splice_read = ll_file_splice_read,
3554 .flock = ll_file_flock,
3555 .lock = ll_file_flock
3559 /* These are for -o noflock - to return ENOSYS on flock calls */
3560 struct file_operations ll_file_operations_noflock = {
3561 .read = ll_file_read,
3562 .aio_read = ll_file_aio_read,
3563 .write = ll_file_write,
3564 .aio_write = ll_file_aio_write,
3565 .unlocked_ioctl = ll_file_ioctl,
3566 .open = ll_file_open,
3567 .release = ll_file_release,
3568 .mmap = ll_file_mmap,
3569 .llseek = ll_file_seek,
3570 .splice_read = ll_file_splice_read,
3572 .flock = ll_file_noflock,
3573 .lock = ll_file_noflock
/* inode_operations for regular files: attribute, xattr, ACL and fiemap
 * entry points implemented in this file and in llite xattr code. */
3576 struct inode_operations ll_file_inode_operations = {
3577 .setattr = ll_setattr,
3578 .getattr = ll_getattr,
3579 .permission = ll_inode_permission,
3580 .setxattr = ll_setxattr,
3581 .getxattr = ll_getxattr,
3582 .listxattr = ll_listxattr,
3583 .removexattr = ll_removexattr,
3584 .fiemap = ll_fiemap,
3585 #ifdef HAVE_IOP_GET_ACL
3586 .get_acl = ll_get_acl,
3590 /* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by an rw_semaphore (readers = dispatch,
 * writers = register/unregister). */
3591 static struct llioc_ctl_data {
3592 struct rw_semaphore ioc_sem;
3593 struct list_head ioc_head;
3595 __RWSEM_INITIALIZER(llioc.ioc_sem),
3596 LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback plus the list of ioctl numbers it serves.
 * iocd_cmd is a trailing variable-length array ([0] idiom). */
3601 struct list_head iocd_list;
3602 unsigned int iocd_size;
3603 llioc_callback_t iocd_cb;
3604 unsigned int iocd_count;
3605 unsigned int iocd_cmd[0];
/*
 * Register callback @cb for the @count ioctl numbers in @cmd.
 * Allocates an llioc_data holding a copy of the command list, appends it
 * to the global registry under the write lock, and returns the entry as
 * an opaque magic cookie for ll_iocontrol_unregister() (NULL on bad
 * arguments or allocation failure).
 */
3608 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3611 struct llioc_data *in_data = NULL;
3614 if (cb == NULL || cmd == NULL ||
3615 count > LLIOC_MAX_CMD || count < 0)
3618 size = sizeof(*in_data) + count * sizeof(unsigned int);
3619 OBD_ALLOC(in_data, size);
3620 if (in_data == NULL)
3623 memset(in_data, 0, sizeof(*in_data));
3624 in_data->iocd_size = size;
3625 in_data->iocd_cb = cb;
3626 in_data->iocd_count = count;
3627 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3629 down_write(&llioc.ioc_sem);
3630 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3631 up_write(&llioc.ioc_sem);
/*
 * Remove and free the registration identified by @magic (the cookie
 * returned by ll_iocontrol_register()).  Warns if the cookie is not
 * found in the registry.
 */
3636 void ll_iocontrol_unregister(void *magic)
3638 struct llioc_data *tmp;
3643 down_write(&llioc.ioc_sem);
3644 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3646 unsigned int size = tmp->iocd_size;
3648 list_del(&tmp->iocd_list);
3649 up_write(&llioc.ioc_sem);
3651 OBD_FREE(tmp, size);
3655 up_write(&llioc.ioc_sem);
3657 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3660 EXPORT_SYMBOL(ll_iocontrol_register);
3661 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl @cmd to the registered dynamic
 * handlers.  Walks the registry under the read lock; the first callback
 * whose command list contains @cmd is invoked, and iteration stops if it
 * returns LLIOC_STOP.  The callback's result code is passed back via
 * *rcp (defaults to -EINVAL if nobody handled it).
 */
3663 static enum llioc_iter
3664 ll_iocontrol_call(struct inode *inode, struct file *file,
3665 unsigned int cmd, unsigned long arg, int *rcp)
3667 enum llioc_iter ret = LLIOC_CONT;
3668 struct llioc_data *data;
3669 int rc = -EINVAL, i;
3671 down_read(&llioc.ioc_sem);
3672 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3673 for (i = 0; i < data->iocd_count; i++) {
3674 if (cmd != data->iocd_cmd[i])
3677 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3681 if (ret == LLIOC_STOP)
3684 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration @conf into the inode's cl_object via
 * cl_conf_set().  For OBJECT_CONF_SET the layout lock must be held; once
 * the layout has been applied the lock is allowed to match
 * (ldlm_lock_allow_match) and the cached layout generation is updated
 * from the new lsm (LL_LAYOUT_GEN_EMPTY when there is no lsm).
 */
3691 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3693 struct ll_inode_info *lli = ll_i2info(inode);
3694 struct cl_env_nest nest;
3699 if (lli->lli_clob == NULL)
3702 env = cl_env_nested_get(&nest);
3704 RETURN(PTR_ERR(env));
3706 result = cl_conf_set(env, lli->lli_clob, conf);
3707 cl_env_nested_put(&nest, env);
3709 if (conf->coc_opc == OBJECT_CONF_SET) {
3710 struct ldlm_lock *lock = conf->coc_lock;
3712 LASSERT(lock != NULL);
3713 LASSERT(ldlm_has_layout(lock));
3715 struct lustre_md *md = conf->u.coc_md;
3716 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3718 /* it can only be allowed to match after layout is
3719 * applied to inode otherwise false layout would be
3720 * seen. Applying layout should happen before dropping
3721 * the intent lock. */
3722 ldlm_lock_allow_match(lock);
3724 lli->lli_has_smd = lsm_has_objects(md->lsm);
3725 if (md->lsm != NULL)
3726 gen = md->lsm->lsm_layout_gen;
3729 DFID ": layout version change: %u -> %u\n",
3730 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3732 ll_layout_version_set(lli, gen);
3738 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If @lock's LVB does not yet carry the layout (the lock was granted via
 * completion AST rather than immediately), fetch the LOV EA from the MDT
 * with an OBD_MD_FLXATTR getxattr and install a copy as the lock's
 * l_lvb_data/l_lvb_len under the resource lock.
 */
3739 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3742 struct ll_sb_info *sbi = ll_i2sbi(inode);
3743 struct obd_capa *oc;
3744 struct ptlrpc_request *req;
3745 struct mdt_body *body;
3752 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3753 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3754 lock->l_lvb_data, lock->l_lvb_len);
/* nothing to do - layout already present in the LVB */
3756 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3759 /* if layout lock was granted right away, the layout is returned
3760 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3761 * blocked and then granted via completion ast, we have to fetch
3762 * layout here. Please note that we can't use the LVB buffer in
3763 * completion AST because it doesn't have a large enough buffer */
3764 oc = ll_mdscapa_get(inode);
3765 rc = ll_get_default_mdsize(sbi, &lmmsize);
3767 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3768 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3774 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3776 GOTO(out, rc = -EPROTO);
3778 lmmsize = body->mbo_eadatasize;
3779 if (lmmsize == 0) /* empty layout */
3782 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3784 GOTO(out, rc = -EFAULT);
3786 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3787 if (lvbdata == NULL)
3788 GOTO(out, rc = -ENOMEM);
3790 memcpy(lvbdata, lmm, lmmsize);
3791 lock_res_and_lock(lock);
/* replace any stale LVB data with the freshly fetched layout */
3792 if (lock->l_lvb_data != NULL)
3793 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3795 lock->l_lvb_data = lvbdata;
3796 lock->l_lvb_len = lmmsize;
3797 unlock_res_and_lock(lock);
3802 ptlrpc_req_finished(req);
3807 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Takes the layout carried in the lock's LVB (fetching it with
 * ll_layout_fetch() if needed), unpacks it into an lsm, configures the
 * inode's cl_object with OBJECT_CONF_SET, records the resulting layout
 * generation in *gen, and drops the lock reference.  If reconfiguration
 * returns -EBUSY (IO in flight against the old layout), waits via
 * OBJECT_CONF_WAIT before reporting the result.
 */
3810 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3811 struct inode *inode, __u32 *gen, bool reconf)
3813 struct ll_inode_info *lli = ll_i2info(inode);
3814 struct ll_sb_info *sbi = ll_i2sbi(inode);
3815 struct ldlm_lock *lock;
3816 struct lustre_md md = { NULL };
3817 struct cl_object_conf conf;
3820 bool wait_layout = false;
3823 LASSERT(lustre_handle_is_used(lockh));
3825 lock = ldlm_handle2lock(lockh);
3826 LASSERT(lock != NULL);
3827 LASSERT(ldlm_has_layout(lock));
3829 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3830 PFID(&lli->lli_fid), inode, reconf);
3832 /* in case this is a caching lock and reinstate with new inode */
3833 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3835 lock_res_and_lock(lock);
3836 lvb_ready = ldlm_is_lvb_ready(lock);
3837 unlock_res_and_lock(lock);
3838 /* checking lvb_ready is racy but this is okay. The worst case is
3839 * that multi processes may configure the file on the same time. */
3841 if (lvb_ready || !reconf) {
3844 /* layout_gen must be valid if layout lock is not
3845 * cancelled and stripe has already set */
3846 *gen = ll_layout_version_get(lli);
3852 rc = ll_layout_fetch(inode, lock);
3856 /* for layout lock, lmm is returned in lock's lvb.
3857 * lvb_data is immutable if the lock is held so it's safe to access it
3858 * without res lock. See the description in ldlm_lock_decref_internal()
3859 * for the condition to free lvb_data of layout lock */
3860 if (lock->l_lvb_data != NULL) {
3861 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3862 lock->l_lvb_data, lock->l_lvb_len);
3864 *gen = LL_LAYOUT_GEN_EMPTY;
3866 *gen = md.lsm->lsm_layout_gen;
3869 CERROR("%s: file "DFID" unpackmd error: %d\n",
3870 ll_get_fsname(inode->i_sb, NULL, 0),
3871 PFID(&lli->lli_fid), rc);
3877 /* set layout to file. Unlikely this will fail as old layout was
3878 * surely eliminated */
3879 memset(&conf, 0, sizeof conf);
3880 conf.coc_opc = OBJECT_CONF_SET;
3881 conf.coc_inode = inode;
3882 conf.coc_lock = lock;
3883 conf.u.coc_md = &md;
3884 rc = ll_layout_conf(inode, &conf);
3887 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3889 /* refresh layout failed, need to wait */
3890 wait_layout = rc == -EBUSY;
3894 LDLM_LOCK_PUT(lock);
3895 ldlm_lock_decref(lockh, mode);
3897 /* wait for IO to complete if it's still being used. */
3899 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3900 ll_get_fsname(inode->i_sb, NULL, 0),
3901 PFID(&lli->lli_fid), inode);
3903 memset(&conf, 0, sizeof conf);
3904 conf.coc_opc = OBJECT_CONF_WAIT;
3905 conf.coc_inode = inode;
3906 rc = ll_layout_conf(inode, &conf);
3910 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3911 ll_get_fsname(inode->i_sb, NULL, 0),
3912 PFID(&lli->lli_fid), rc);
3918 * This function checks if there exists a LAYOUT lock on the client side,
3919 * or enqueues it if it doesn't have one in cache.
3921 * This function will not hold layout lock so it may be revoked any time after
3922 * this function returns. Any operations depend on layout should be redone
3925 * This function should be called before lov_io_init() to get an uptodate
3926 * layout version, the caller should save the version number and after IO
3927 * is finished, this function should be called again to verify that layout
3928 * is not changed during IO time.
/*
 * Implementation: if the cached generation is already valid (or layout
 * locks are disabled) return it directly; otherwise, serialized by
 * lli_layout_mutex, first try to match a cached MDS_INODELOCK_LAYOUT
 * lock and apply it via ll_layout_lock_set(); failing that, enqueue an
 * IT_LAYOUT intent lock on the MDT and apply the layout from its LVB.
 */
3930 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3932 struct ll_inode_info *lli = ll_i2info(inode);
3933 struct ll_sb_info *sbi = ll_i2sbi(inode);
3934 struct md_op_data *op_data;
3935 struct lookup_intent it;
3936 struct lustre_handle lockh;
3938 struct ldlm_enqueue_info einfo = {
3939 .ei_type = LDLM_IBITS,
3941 .ei_cb_bl = &ll_md_blocking_ast,
3942 .ei_cb_cp = &ldlm_completion_ast,
3947 *gen = ll_layout_version_get(lli);
3948 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3952 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3953 LASSERT(S_ISREG(inode->i_mode));
3955 /* take layout lock mutex to enqueue layout lock exclusively. */
3956 mutex_lock(&lli->lli_layout_mutex);
3959 /* mostly layout lock is caching on the local side, so try to match
3960 * it before grabbing layout lock mutex. */
3961 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3962 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3963 if (mode != 0) { /* hit cached lock */
3964 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3968 mutex_unlock(&lli->lli_layout_mutex);
3972 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3973 0, 0, LUSTRE_OPC_ANY, NULL);
3974 if (IS_ERR(op_data)) {
3975 mutex_unlock(&lli->lli_layout_mutex);
3976 RETURN(PTR_ERR(op_data));
3979 /* have to enqueue one */
3980 memset(&it, 0, sizeof(it));
3981 it.it_op = IT_LAYOUT;
3982 lockh.cookie = 0ULL;
3984 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3985 ll_get_fsname(inode->i_sb, NULL, 0),
3986 PFID(&lli->lli_fid), inode);
3988 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
3989 if (it.d.lustre.it_data != NULL)
3990 ptlrpc_req_finished(it.d.lustre.it_data);
3991 it.d.lustre.it_data = NULL;
3993 ll_finish_md_op_data(op_data);
/* transfer lock ownership out of the intent before dropping it */
3995 mode = it.d.lustre.it_lock_mode;
3996 it.d.lustre.it_lock_mode = 0;
3997 ll_intent_drop_lock(&it);
4000 /* set lock data in case this is a new lock */
4001 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4002 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4006 mutex_unlock(&lli->lli_layout_mutex);
4012 * This function send a restore request to the MDT
4014 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4016 struct hsm_user_request *hur;
4020 len = sizeof(struct hsm_user_request) +
4021 sizeof(struct hsm_user_item);
4022 OBD_ALLOC(hur, len);
4026 hur->hur_request.hr_action = HUA_RESTORE;
4027 hur->hur_request.hr_archive_id = 0;
4028 hur->hur_request.hr_flags = 0;
4029 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4030 sizeof(hur->hur_user_item[0].hui_fid));
4031 hur->hur_user_item[0].hui_extent.offset = offset;
4032 hur->hur_user_item[0].hui_extent.length = length;
4033 hur->hur_request.hr_itemcount = 1;
4034 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,