4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
52 #include "cl_object.h"
55 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
57 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
60 static enum llioc_iter
61 ll_iocontrol_call(struct inode *inode, struct file *file,
62 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate a per-open ll_file_data from its slab cache.
 * GFP_NOFS is used to avoid filesystem recursion during memory reclaim.
 * NOTE(review): the allocation-failure check and return statement are not
 * visible in this excerpt -- confirm against the full source. */
64 static struct ll_file_data *ll_file_data_get(void)
66 struct ll_file_data *fd;
68 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* start this open with a clean write-error state */
72 fd->fd_write_failed = false;
/* Return a ll_file_data previously obtained from ll_file_data_get()
 * back to the slab cache. */
77 static void ll_file_data_put(struct ll_file_data *fd)
80 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current attributes (mode, times, size, blocks, flags,
 * ioepoch) plus the open handle @fh and an MDS capability into @op_data,
 * for use in a close/setattr request to the MDS. */
83 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
84 struct lustre_handle *fh)
86 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
87 op_data->op_attr.ia_mode = inode->i_mode;
88 op_data->op_attr.ia_atime = inode->i_atime;
89 op_data->op_attr.ia_mtime = inode->i_mtime;
90 op_data->op_attr.ia_ctime = inode->i_ctime;
91 op_data->op_attr.ia_size = i_size_read(inode);
92 op_data->op_attr_blocks = inode->i_blocks;
93 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
94 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
96 op_data->op_handle = *fh;
97 op_data->op_capa1 = ll_mdscapa_get(inode);
/* tell MDS the data was modified so it can update SOM/changelog state */
99 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
100 op_data->op_bias |= MDS_DATA_MODIFIED;
104 * Closes the IO epoch and packs all the attributes into @op_data for
/* the close RPC.  Size/blocks are only sent when SOM is disabled or the
 * file is not regular; otherwise the MDS obtains them from the OSTs. */
107 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
108 struct obd_client_handle *och)
112 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
113 ATTR_MTIME | ATTR_MTIME_SET |
114 ATTR_CTIME | ATTR_CTIME_SET;
116 if (!(och->och_flags & FMODE_WRITE))
119 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
120 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* close the IO epoch for this open handle before packing attributes */
122 ll_ioepoch_close(inode, op_data, &och, 0);
125 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
126 ll_prep_md_op_data(op_data, inode, NULL, NULL,
127 0, 0, LUSTRE_OPC_ANY, NULL);
/* Send the MDS close RPC for open handle @och of @inode.  If
 * @data_version is non-NULL this close is an HSM release and the data
 * version is packed into the request.  Handles the follow-up
 * Size-on-MDS update and the deferred DONE_WRITING case, then frees the
 * request and poisons the handle cookie.
 * NOTE(review): several error-path lines are omitted from this excerpt. */
131 static int ll_close_inode_openhandle(struct obd_export *md_exp,
133 struct obd_client_handle *och,
134 const __u64 *data_version)
136 struct obd_export *exp = ll_i2mdexp(inode);
137 struct md_op_data *op_data;
138 struct ptlrpc_request *req = NULL;
139 struct obd_device *obd = class_exp2obd(exp);
146 * XXX: in case of LMV, is this correct to access
149 CERROR("Invalid MDC connection handle "LPX64"\n",
150 ll_i2mdexp(inode)->exp_handle.h_cookie);
154 OBD_ALLOC_PTR(op_data);
156 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
158 ll_prepare_close(inode, op_data, och);
159 if (data_version != NULL) {
160 /* Pass in data_version implies release. */
161 op_data->op_bias |= MDS_HSM_RELEASE;
162 op_data->op_data_version = *data_version;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
166 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
167 rc = md_close(md_exp, op_data, och->och_mod, &req);
169 /* This close must have the epoch closed. */
170 LASSERT(epoch_close);
171 /* MDS has instructed us to obtain Size-on-MDS attribute from
172 * OSTs and send setattr back to MDS. */
173 rc = ll_som_update(inode, op_data);
175 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
176 " failed: rc = %d\n",
177 ll_i2mdexp(inode)->exp_obd->obd_name,
178 PFID(ll_inode2fid(inode)), rc);
182 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
183 ll_i2mdexp(inode)->exp_obd->obd_name,
184 PFID(ll_inode2fid(inode)), rc);
187 /* DATA_MODIFIED flag was successfully sent on close, cancel data
188 * modification flag. */
189 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
190 struct ll_inode_info *lli = ll_i2info(inode);
192 spin_lock(&lli->lli_lock);
193 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
194 spin_unlock(&lli->lli_lock);
/* for HSM release, verify the server actually released the file */
197 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
198 struct mdt_body *body;
199 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
200 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
204 ll_finish_md_op_data(op_data);
/* SOM enabled but the epoch was not closed on a write handle:
 * queue a deferred DONE_WRITING to finish the epoch later */
208 if (exp_connect_som(exp) && !epoch_close &&
209 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
210 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
212 md_clear_open_replay_data(md_exp, och);
213 /* Free @och if it is not waiting for DONE_WRITING. */
214 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
217 if (req) /* This is close request */
218 ptlrpc_req_finished(req);
/* Close the per-mode (read/write/exec) MDS open handle cached on the
 * inode, but only when no other local opens of that mode remain.
 * Selects the handle/usecount pair by @fmode, then drops it under
 * lli_och_mutex and issues the MDS close outside the lock. */
222 int ll_md_real_close(struct inode *inode, fmode_t fmode)
224 struct ll_inode_info *lli = ll_i2info(inode);
225 struct obd_client_handle **och_p;
226 struct obd_client_handle *och;
231 if (fmode & FMODE_WRITE) {
232 och_p = &lli->lli_mds_write_och;
233 och_usecount = &lli->lli_open_fd_write_count;
234 } else if (fmode & FMODE_EXEC) {
235 och_p = &lli->lli_mds_exec_och;
236 och_usecount = &lli->lli_open_fd_exec_count;
238 LASSERT(fmode & FMODE_READ);
239 och_p = &lli->lli_mds_read_och;
240 och_usecount = &lli->lli_open_fd_read_count;
243 mutex_lock(&lli->lli_och_mutex);
244 if (*och_usecount > 0) {
245 /* There are still users of this handle, so skip
247 mutex_unlock(&lli->lli_och_mutex);
253 mutex_unlock(&lli->lli_och_mutex);
256 /* There might be a race and this handle may already
258 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-struct-file close: release the group lock, any leftover lease and
 * private open handle, decrement the per-mode open counter, and only
 * talk to the MDS (ll_md_real_close) when no matching OPEN lock lets us
 * skip it.  Finally frees the ll_file_data and closes the capability. */
265 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
268 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
269 struct ll_inode_info *lli = ll_i2info(inode);
273 /* clear group lock, if present */
274 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
275 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
277 if (fd->fd_lease_och != NULL) {
280 /* Usually the lease is not released when the
281 * application crashed, we need to release here. */
282 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
283 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
284 PFID(&lli->lli_fid), rc, lease_broken);
286 fd->fd_lease_och = NULL;
289 if (fd->fd_och != NULL) {
290 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
295 /* Let's see if we have good enough OPEN lock on the file and if
296 we can skip talking to MDS */
297 if (file->f_dentry->d_inode) { /* Can this ever be false? */
299 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
300 struct lustre_handle lockh;
301 struct inode *inode = file->f_dentry->d_inode;
302 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* drop this file's reference on the per-mode open count */
304 mutex_lock(&lli->lli_och_mutex);
305 if (fd->fd_omode & FMODE_WRITE) {
307 LASSERT(lli->lli_open_fd_write_count);
308 lli->lli_open_fd_write_count--;
309 } else if (fd->fd_omode & FMODE_EXEC) {
311 LASSERT(lli->lli_open_fd_exec_count);
312 lli->lli_open_fd_exec_count--;
315 LASSERT(lli->lli_open_fd_read_count);
316 lli->lli_open_fd_read_count--;
318 mutex_unlock(&lli->lli_och_mutex);
/* no cached OPEN lock matches: must do a real close with the MDS */
320 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
321 LDLM_IBITS, &policy, lockmode,
323 rc = ll_md_real_close(file->f_dentry->d_inode,
327 CERROR("released file has negative dentry: file = %p, "
328 "dentry = %p, name = %s\n",
329 file, file->f_dentry, file->f_dentry->d_name.name);
333 LUSTRE_FPRIVATE(file) = NULL;
334 ll_file_data_put(fd);
335 ll_capa_close(inode);
340 /* While this returns an error code, fput() the caller does not, so we need
341 * to make every effort to clean up all of our state here. Also, applications
342 * rarely check close errors and even if an error is returned they will not
343 * re-try the close call.
/* VFS ->release() entry point: tears down remote-ACL state for the root
 * inode, deauthorizes statahead, clears async write errors, and
 * delegates the real work to ll_md_close().  The root dentry is special-
 * cased: its ll_file_data is freed directly with no MDS close. */
345 int ll_file_release(struct inode *inode, struct file *file)
347 struct ll_file_data *fd;
348 struct ll_sb_info *sbi = ll_i2sbi(inode);
349 struct ll_inode_info *lli = ll_i2info(inode);
353 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
354 PFID(ll_inode2fid(inode)), inode);
356 #ifdef CONFIG_FS_POSIX_ACL
/* remote-client ACL cleanup, only relevant for the filesystem root */
357 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
358 inode == inode->i_sb->s_root->d_inode) {
359 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
362 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
363 fd->fd_flags &= ~LL_FILE_RMTACL;
364 rct_del(&sbi->ll_rct, current_pid());
365 et_search_free(&sbi->ll_et, current_pid());
370 if (inode->i_sb->s_root != file->f_dentry)
371 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
372 fd = LUSTRE_FPRIVATE(file);
375 /* The last ref on @file, maybe not the owner pid of statahead,
376 * because parent and child process can share the same file handle. */
377 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
378 ll_deauthorize_statahead(inode, fd);
380 if (inode->i_sb->s_root == file->f_dentry) {
381 LUSTRE_FPRIVATE(file) = NULL;
382 ll_file_data_put(fd);
/* fold any async OST write errors into this close's return code */
386 if (!S_ISDIR(inode->i_mode)) {
387 if (lli->lli_clob != NULL)
388 lov_read_and_clear_async_rc(lli->lli_clob);
389 lli->lli_async_rc = 0;
392 rc = ll_md_close(sbi->ll_md_exp, inode, file);
394 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
395 libcfs_debug_dumplog();
/* Enqueue an open intent with the MDS for @file.  Packs the name only
 * when the server lacks OBD_CONNECT_OPEN_BY_FID, ships the optional
 * striping layout @lmm/@lmmsize, then fills the inode and lock data from
 * the reply.  On -ESTALE-style failures the extra open handle returned
 * by the server is released before returning. */
400 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
401 struct lookup_intent *itp)
403 struct dentry *de = file->f_dentry;
404 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
405 struct dentry *parent = de->d_parent;
406 const char *name = NULL;
408 struct md_op_data *op_data;
409 struct ptlrpc_request *req = NULL;
413 LASSERT(parent != NULL);
414 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
416 /* if server supports open-by-fid, or file name is invalid, don't pack
417 * name in open request */
418 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
419 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
420 name = de->d_name.name;
421 len = de->d_name.len;
424 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
425 name, len, 0, LUSTRE_OPC_ANY, NULL);
427 RETURN(PTR_ERR(op_data));
428 op_data->op_data = lmm;
429 op_data->op_data_size = lmmsize;
431 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
432 &ll_md_blocking_ast, 0);
433 ll_finish_md_op_data(op_data);
435 /* reason to keep our own exit path - don't flood the log
436 * with -ESTALE error messages.
438 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
439 it_open_error(DISP_OPEN_OPEN, itp))
441 ll_release_openhandle(de, itp);
445 if (it_disposition(itp, DISP_LOOKUP_NEG))
446 GOTO(out, rc = -ENOENT);
448 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
449 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
450 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* update the inode from the reply and attach the granted lock */
454 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
455 if (!rc && itp->d.lustre.it_lock_mode)
456 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
459 ptlrpc_req_finished(req);
460 ll_intent_drop_lock(itp);
466 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
467 * not believe attributes if a few ioepoch holders exist. Attributes for
468 * previous ioepoch if new one is opened are also skipped by MDS.
470 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* only record a non-zero epoch that differs from the cached one */
472 if (ioepoch && lli->lli_ioepoch != ioepoch) {
473 lli->lli_ioepoch = ioepoch;
474 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
475 ioepoch, PFID(&lli->lli_fid));
/* Populate @och (handle, fid, lease-lock cookie, magic, flags) from the
 * MDT reply body of the intent @it, then register the handle for open
 * replay.  Returns the md_set_open_replay_data() result. */
479 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
480 struct obd_client_handle *och)
482 struct ptlrpc_request *req = it->d.lustre.it_data;
483 struct mdt_body *body;
485 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
486 och->och_fh = body->mbo_handle;
487 och->och_fid = body->mbo_fid1;
488 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
489 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
490 och->och_flags = it->it_flags;
492 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-side part of an open: optionally fill @och from the
 * intent reply (and record the new ioepoch), then install @fd as the
 * file's private data and initialize readahead and cl_io context state. */
495 static int ll_local_open(struct file *file, struct lookup_intent *it,
496 struct ll_file_data *fd, struct obd_client_handle *och)
498 struct inode *inode = file->f_dentry->d_inode;
499 struct ll_inode_info *lli = ll_i2info(inode);
502 LASSERT(!LUSTRE_FPRIVATE(file));
507 struct ptlrpc_request *req = it->d.lustre.it_data;
508 struct mdt_body *body;
511 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
515 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
516 ll_ioepoch_open(lli, body->mbo_ioepoch);
519 LUSTRE_FPRIVATE(file) = fd;
520 ll_readahead_init(inode, &fd->fd_ras);
/* remember the access mode for the open-count bookkeeping at close */
521 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
523 /* ll_cl_context initialize */
524 rwlock_init(&fd->fd_lock);
525 INIT_LIST_HEAD(&fd->fd_lccs);
530 /* Open a file, and (for the very first open) create objects on the OSTs at
531 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
532 * creation or open until ll_lov_setstripe() ioctl is called.
534 * If we already have the stripe MD locally then we don't request it in
535 * md_open(), by passing a lmm_size = 0.
537 * It is up to the application to ensure no other processes open this file
538 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
539 * used. We might be able to avoid races of that sort by getting lli_open_sem
540 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
541 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point.  Either reuses an intent prepared by lookup
 * (file->private_data) or builds one from f_flags, reuses a cached
 * per-mode MDS open handle when present, and otherwise performs the
 * intent open and records the new handle under lli_och_mutex. */
543 int ll_file_open(struct inode *inode, struct file *file)
545 struct ll_inode_info *lli = ll_i2info(inode);
546 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
547 .it_flags = file->f_flags };
548 struct obd_client_handle **och_p = NULL;
549 __u64 *och_usecount = NULL;
550 struct ll_file_data *fd;
554 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
555 PFID(ll_inode2fid(inode)), inode, file->f_flags);
557 it = file->private_data; /* XXX: compat macro */
558 file->private_data = NULL; /* prevent ll_local_open assertion */
560 fd = ll_file_data_get();
562 GOTO(out_openerr, rc = -ENOMEM);
565 if (S_ISDIR(inode->i_mode))
566 ll_authorize_statahead(inode, fd);
/* root dentry: no MDS open required, just install the file data */
568 if (inode->i_sb->s_root == file->f_dentry) {
569 LUSTRE_FPRIVATE(file) = fd;
573 if (!it || !it->d.lustre.it_disposition) {
574 /* Convert f_flags into access mode. We cannot use file->f_mode,
575 * because everything but O_ACCMODE mask was stripped from
577 if ((oit.it_flags + 1) & O_ACCMODE)
579 if (file->f_flags & O_TRUNC)
580 oit.it_flags |= FMODE_WRITE;
582 /* kernel only calls f_op->open in dentry_open. filp_open calls
583 * dentry_open after call to open_namei that checks permissions.
584 * Only nfsd_open calls dentry_open directly without checking
585 * permissions and because of that this code below is safe. */
586 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
587 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
589 /* We do not want O_EXCL here, presumably we opened the file
590 * already? XXX - NFS implications? */
591 oit.it_flags &= ~O_EXCL;
593 /* bug20584, if "it_flags" contains O_CREAT, the file will be
594 * created if necessary, then "IT_CREAT" should be set to keep
595 * consistent with it */
596 if (oit.it_flags & O_CREAT)
597 oit.it_op |= IT_CREAT;
603 /* Let's see if we have file open on MDS already. */
604 if (it->it_flags & FMODE_WRITE) {
605 och_p = &lli->lli_mds_write_och;
606 och_usecount = &lli->lli_open_fd_write_count;
607 } else if (it->it_flags & FMODE_EXEC) {
608 och_p = &lli->lli_mds_exec_och;
609 och_usecount = &lli->lli_open_fd_exec_count;
611 och_p = &lli->lli_mds_read_och;
612 och_usecount = &lli->lli_open_fd_read_count;
615 mutex_lock(&lli->lli_och_mutex);
616 if (*och_p) { /* Open handle is present */
617 if (it_disposition(it, DISP_OPEN_OPEN)) {
618 /* Well, there's extra open request that we do not need,
619 let's close it somehow. This will decref request. */
620 rc = it_open_error(DISP_OPEN_OPEN, it);
622 mutex_unlock(&lli->lli_och_mutex);
623 GOTO(out_openerr, rc);
626 ll_release_openhandle(file->f_dentry, it);
630 rc = ll_local_open(file, it, fd, NULL);
633 mutex_unlock(&lli->lli_och_mutex);
634 GOTO(out_openerr, rc);
637 LASSERT(*och_usecount == 0);
638 if (!it->d.lustre.it_disposition) {
639 /* We cannot just request lock handle now, new ELC code
640 means that one of other OPEN locks for this file
641 could be cancelled, and since blocking ast handler
642 would attempt to grab och_mutex as well, that would
643 result in a deadlock */
644 mutex_unlock(&lli->lli_och_mutex);
646 * Normally called under two situations:
648 * 2. A race/condition on MDS resulting in no open
649 * handle to be returned from LOOKUP|OPEN request,
650 * for example if the target entry was a symlink.
652 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
654 * Always specify MDS_OPEN_BY_FID because we don't want
655 * to get file with different fid.
657 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
658 rc = ll_intent_file_open(file, NULL, 0, it);
660 GOTO(out_openerr, rc);
664 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
666 GOTO(out_och_free, rc = -ENOMEM);
670 /* md_intent_lock() didn't get a request ref if there was an
671 * open error, so don't do cleanup on the request here
673 /* XXX (green): Should not we bail out on any error here, not
674 * just open error? */
675 rc = it_open_error(DISP_OPEN_OPEN, it);
677 GOTO(out_och_free, rc);
679 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
680 "inode %p: disposition %x, status %d\n", inode,
681 it_disposition(it, ~0), it->d.lustre.it_status);
683 rc = ll_local_open(file, it, fd, *och_p);
685 GOTO(out_och_free, rc);
687 mutex_unlock(&lli->lli_och_mutex);
690 /* Must do this outside lli_och_mutex lock to prevent deadlock where
691 different kind of OPEN lock for this same inode gets cancelled
692 by ldlm_cancel_lru */
693 if (!S_ISREG(inode->i_mode))
694 GOTO(out_och_free, rc);
/* O_LOV_DELAY_CREATE or read-only open with no striping yet:
 * skip object creation until ll_lov_setstripe() or first write */
698 if (!lli->lli_has_smd &&
699 (cl_is_lov_delay_create(file->f_flags) ||
700 (file->f_mode & FMODE_WRITE) == 0)) {
701 CDEBUG(D_INODE, "object creation was delayed\n");
702 GOTO(out_och_free, rc);
704 cl_lov_delay_create_clear(&file->f_flags);
705 GOTO(out_och_free, rc);
709 if (och_p && *och_p) {
710 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
711 *och_p = NULL; /* OBD_FREE writes some magic there */
714 mutex_unlock(&lli->lli_och_mutex);
717 if (lli->lli_opendir_key == fd)
718 ll_deauthorize_statahead(inode, fd);
720 ll_file_data_put(fd);
722 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* drop the intent's request reference on both success and error */
725 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
726 ptlrpc_req_finished(it->d.lustre.it_data);
727 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on LDLM_CB_BLOCKING, asynchronously
 * cancel the lease lock; the CANCELING case is intentionally left to the
 * lease holder (no openhandle processing here -- see ll_lease_open). */
733 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
734 struct ldlm_lock_desc *desc, void *data, int flag)
737 struct lustre_handle lockh;
741 case LDLM_CB_BLOCKING:
742 ldlm_lock2handle(lock, &lockh);
743 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
745 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
749 case LDLM_CB_CANCELING:
757 * Acquire a lease and open the file.
/* @fmode must be exactly FMODE_READ or FMODE_WRITE.  When @file is
 * given, the existing openhandle is reused (sole opener required) so the
 * MDT sees the lease request as coming from the same owner.  Returns the
 * new obd_client_handle or an ERR_PTR; on failure after the open
 * succeeded, the open lock is cancelled and the handle closed. */
759 static struct obd_client_handle *
760 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
763 struct lookup_intent it = { .it_op = IT_OPEN };
764 struct ll_sb_info *sbi = ll_i2sbi(inode);
765 struct md_op_data *op_data;
766 struct ptlrpc_request *req = NULL;
767 struct lustre_handle old_handle = { 0 };
768 struct obd_client_handle *och = NULL;
773 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
774 RETURN(ERR_PTR(-EINVAL));
777 struct ll_inode_info *lli = ll_i2info(inode);
778 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
779 struct obd_client_handle **och_p;
/* the requested mode must be a subset of the file's open mode */
782 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
783 RETURN(ERR_PTR(-EPERM));
785 /* Get the openhandle of the file */
787 mutex_lock(&lli->lli_och_mutex);
788 if (fd->fd_lease_och != NULL) {
789 mutex_unlock(&lli->lli_och_mutex);
793 if (fd->fd_och == NULL) {
794 if (file->f_mode & FMODE_WRITE) {
795 LASSERT(lli->lli_mds_write_och != NULL);
796 och_p = &lli->lli_mds_write_och;
797 och_usecount = &lli->lli_open_fd_write_count;
799 LASSERT(lli->lli_mds_read_och != NULL);
800 och_p = &lli->lli_mds_read_och;
801 och_usecount = &lli->lli_open_fd_read_count;
803 if (*och_usecount == 1) {
810 mutex_unlock(&lli->lli_och_mutex);
811 if (rc < 0) /* more than 1 opener */
814 LASSERT(fd->fd_och != NULL);
815 old_handle = fd->fd_och->och_fh;
820 RETURN(ERR_PTR(-ENOMEM));
822 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
823 LUSTRE_OPC_ANY, NULL);
825 GOTO(out, rc = PTR_ERR(op_data));
827 /* To tell the MDT this openhandle is from the same owner */
828 op_data->op_handle = old_handle;
830 it.it_flags = fmode | open_flags;
831 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
832 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
833 &ll_md_blocking_lease_ast,
834 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
835 * it can be cancelled which may mislead applications that the lease is
837 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
838 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
839 * doesn't deal with openhandle, so normal openhandle will be leaked. */
840 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
841 ll_finish_md_op_data(op_data);
842 ptlrpc_req_finished(req);
844 GOTO(out_release_it, rc);
846 if (it_disposition(&it, DISP_LOOKUP_NEG))
847 GOTO(out_release_it, rc = -ENOENT);
849 rc = it_open_error(DISP_OPEN_OPEN, &it);
851 GOTO(out_release_it, rc);
853 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
854 ll_och_fill(sbi->ll_md_exp, &it, och);
856 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
857 GOTO(out_close, rc = -EOPNOTSUPP);
859 /* already got the lease, attach the lease lock to the inode */
860 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
861 if (it.d.lustre.it_lock_mode == 0 ||
862 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
863 /* open lock must return for lease */
864 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
865 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
866 it.d.lustre.it_lock_bits);
867 GOTO(out_close, rc = -EPROTO);
870 ll_intent_release(&it);
874 /* Cancel open lock */
875 if (it.d.lustre.it_lock_mode != 0) {
876 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
877 it.d.lustre.it_lock_mode);
878 it.d.lustre.it_lock_mode = 0;
879 och->och_lease_handle.cookie = 0ULL;
881 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
883 CERROR("%s: error closing file "DFID": %d\n",
884 ll_get_fsname(inode->i_sb, NULL, 0),
885 PFID(&ll_i2info(inode)->lli_fid), rc2);
886 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
888 ll_intent_release(&it);
896 * Release lease and close the file.
897 * It will check if the lease has ever broken.
/* If @lease_broken is non-NULL it is set to whether the lease lock was
 * already cancelled (broken) before this call.  The lease lock is then
 * cancelled and the open handle closed on the MDS. */
899 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
902 struct ldlm_lock *lock;
903 bool cancelled = true;
907 lock = ldlm_handle2lock(&och->och_lease_handle);
909 lock_res_and_lock(lock);
910 cancelled = ldlm_is_cancel(lock);
911 unlock_res_and_lock(lock);
915 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
916 PFID(&ll_i2info(inode)->lli_fid), cancelled);
919 ldlm_cli_cancel(&och->och_lease_handle, 0);
920 if (lease_broken != NULL)
921 *lease_broken = cancelled;
923 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
928 /* Fills the obdo with the attributes for the lsm */
/* Issues an async getattr to the OSTs covering @lsm and waits for the
 * replies.  @dv_flags (LL_DV_RD_FLUSH/LL_DV_WR_FLUSH) request server-side
 * lock/flush semantics for data-version reads; after the reply o_valid is
 * masked down to the attributes this caller actually trusts. */
929 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
930 struct obd_capa *capa, struct obdo *obdo,
931 __u64 ioepoch, int dv_flags)
933 struct ptlrpc_request_set *set;
934 struct obd_info oinfo = { { { 0 } } };
939 LASSERT(lsm != NULL);
943 oinfo.oi_oa->o_oi = lsm->lsm_oi;
944 oinfo.oi_oa->o_mode = S_IFREG;
945 oinfo.oi_oa->o_ioepoch = ioepoch;
946 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
947 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
948 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
949 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
950 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
951 OBD_MD_FLDATAVERSION;
952 oinfo.oi_capa = capa;
/* request server-side locking (and flush for writes) on the OSTs */
953 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
954 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
955 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
956 if (dv_flags & LL_DV_WR_FLUSH)
957 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
960 set = ptlrpc_prep_set();
962 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
965 rc = obd_getattr_async(exp, &oinfo, set);
967 rc = ptlrpc_set_wait(set);
968 ptlrpc_set_destroy(set);
971 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
972 OBD_MD_FLATIME | OBD_MD_FLMTIME |
973 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
974 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* a requested write-flush that the server did not confirm is an error */
975 if (dv_flags & LL_DV_WR_FLUSH &&
976 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
977 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
984 * Performs the getattr on the inode and updates its fields.
985 * If @sync != 0, perform the getattr under the server-side lock.
987 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
988 __u64 ioepoch, int sync)
990 struct obd_capa *capa = ll_mdscapa_get(inode);
991 struct lov_stripe_md *lsm;
995 lsm = ccc_inode_lsm_get(inode);
996 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
997 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1000 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* merge the OST-reported attributes into the in-core inode */
1002 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1003 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1004 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1005 (unsigned long long)inode->i_blocks,
1006 1UL << inode->i_blkbits);
1008 ccc_inode_lsm_put(inode, lsm);
/* Merge MDS-provided timestamps (cached in lli_lvb) with attributes
 * obtained from the OSTs via the cl_object layer, keeping the most
 * recent of each, and update i_size/i_blocks under the inode size lock. */
1012 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1014 struct ll_inode_info *lli = ll_i2info(inode);
1015 struct cl_object *obj = lli->lli_clob;
1016 struct cl_attr *attr = ccc_env_thread_attr(env);
1022 ll_inode_size_lock(inode);
1023 /* merge timestamps the most recently obtained from mds with
1024 timestamps obtained from osts */
1025 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1026 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1027 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1029 lvb.lvb_size = i_size_read(inode);
1030 lvb.lvb_blocks = inode->i_blocks;
1031 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1032 lvb.lvb_atime = LTIME_S(inode->i_atime);
1033 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1035 cl_object_attr_lock(obj);
1036 rc = cl_object_attr_get(env, obj, attr);
1037 cl_object_attr_unlock(obj);
/* keep the newer of the MDS and OST timestamps */
1040 if (lvb.lvb_atime < attr->cat_atime)
1041 lvb.lvb_atime = attr->cat_atime;
1042 if (lvb.lvb_ctime < attr->cat_ctime)
1043 lvb.lvb_ctime = attr->cat_ctime;
1044 if (lvb.lvb_mtime < attr->cat_mtime)
1045 lvb.lvb_mtime = attr->cat_mtime;
1047 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1048 PFID(&lli->lli_fid), attr->cat_size);
1049 cl_isize_write_nolock(inode, attr->cat_size);
1051 inode->i_blocks = attr->cat_blocks;
1053 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1054 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1055 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1057 ll_inode_size_unlock(inode);
/* Glimpse helper for ioctls: fetch current OST attributes for @lsm and
 * copy size/blocks/timestamps into the caller-supplied stat structure. */
1062 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1065 struct obdo obdo = { 0 };
1068 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1070 st->st_size = obdo.o_size;
1071 st->st_blocks = obdo.o_blocks;
1072 st->st_mtime = obdo.o_mtime;
1073 st->st_atime = obdo.o_atime;
1074 st->st_ctime = obdo.o_ctime;
/* Decide whether atime updates should be suppressed for @file, checking
 * the open flags, inode flags, mount flags and superblock flags in the
 * same order as the kernel's own atime logic. */
1079 static bool file_is_noatime(const struct file *file)
1081 const struct vfsmount *mnt = file->f_path.mnt;
1082 const struct inode *inode = file->f_path.dentry->d_inode;
1084 /* Adapted from file_accessed() and touch_atime().*/
1085 if (file->f_flags & O_NOATIME)
1088 if (inode->i_flags & S_NOATIME)
1091 if (IS_NOATIME(inode))
1094 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1097 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1100 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: nonblocking/append/sync flags, the target cl_object, the DLM
 * locking policy (never for nolock files, mandatory for O_APPEND), and
 * the noatime decision. */
1106 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1108 struct inode *inode = file->f_dentry->d_inode;
1110 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1112 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1113 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1114 file->f_flags & O_DIRECT ||
1117 io->ci_obj = ll_i2info(inode)->lli_clob;
1118 io->ci_lockreq = CILR_MAYBE;
1119 if (ll_file_nolock(file)) {
1120 io->ci_lockreq = CILR_NEVER;
1121 io->ci_no_srvlock = 1;
1122 } else if (file->f_flags & O_APPEND) {
1123 io->ci_lockreq = CILR_MANDATORY;
1126 io->ci_noatime = file_is_noatime(file);
/* Common driver for file reads and writes (normal and splice I/O).
 * Sets up a cl_io, takes the per-file range lock for non-grouplock
 * writes (whole-file for O_APPEND), runs the cl_io loop, restarts the
 * I/O when the layout changed and nothing was transferred, and tallies
 * read/write byte statistics.  Returns bytes transferred or -errno. */
1130 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1131 struct file *file, enum cl_io_type iot,
1132 loff_t *ppos, size_t count)
1134 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1135 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1138 struct range_lock range;
1141 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1142 file->f_dentry->d_name.name, iot, *ppos, count);
1145 io = ccc_env_thread_io(env);
1146 ll_io_init(io, file, iot == CIT_WRITE);
1148 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1149 struct vvp_io *vio = vvp_env_io(env);
1150 struct ccc_io *cio = ccc_env_io(env);
1151 bool range_locked = false;
/* O_APPEND writes must serialize against the whole file */
1153 if (file->f_flags & O_APPEND)
1154 range_lock_init(&range, 0, LUSTRE_EOF);
1156 range_lock_init(&range, *ppos, *ppos + count - 1);
1157 cio->cui_fd = LUSTRE_FPRIVATE(file);
1158 vio->cui_io_subtype = args->via_io_subtype;
1160 switch (vio->cui_io_subtype) {
1162 cio->cui_iov = args->u.normal.via_iov;
1163 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1164 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1165 cio->cui_iocb = args->u.normal.via_iocb;
/* group-locked writes already exclude each other; skip range lock */
1166 if ((iot == CIT_WRITE) &&
1167 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1168 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1170 result = range_lock(&lli->lli_write_tree,
1175 range_locked = true;
1177 down_read(&lli->lli_trunc_sem);
1180 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1181 vio->u.splice.cui_flags = args->u.splice.via_flags;
1184 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1188 ll_cl_add(file, env, io);
1189 result = cl_io_loop(env, io);
1190 ll_cl_remove(file, env);
1192 if (args->via_io_subtype == IO_NORMAL)
1193 up_read(&lli->lli_trunc_sem);
1195 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1197 range_unlock(&lli->lli_write_tree, &range);
1200 /* cl_io_rw_init() handled IO */
1201 result = io->ci_result;
1204 if (io->ci_nob > 0) {
1205 result = io->ci_nob;
1206 *ppos = io->u.ci_wr.wr.crw_pos;
1210 cl_io_fini(env, io);
1211 /* If any bit been read/written (result != 0), we just return
1212 * short read/write instead of restart io. */
1213 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1214 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zu\n",
1215 iot == CIT_READ ? "read" : "write",
1216 file->f_dentry->d_name.name, *ppos, count);
1217 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1221 if (iot == CIT_READ) {
1223 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1224 LPROC_LL_READ_BYTES, result);
1225 } else if (iot == CIT_WRITE) {
1227 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1228 LPROC_LL_WRITE_BYTES, result);
1229 fd->fd_write_failed = false;
/* remember write failures (except signal interruptions) for fsync */
1230 } else if (result != -ERESTARTSYS) {
1231 fd->fd_write_failed = true;
1234 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
/*
 * Validate a user-supplied iovec array and accumulate the total byte
 * count across all segments.
 *
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 */
static int ll_file_get_iov_count(const struct iovec *iov,
				 unsigned long *nr_segs, size_t *count)
	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
		/* good segments pass the access_ok() check and continue */
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
		cnt -= iv->iov_len;	/* This segment is no good */
/*
 * AIO read entry point: validate the iovec, fill vvp_io_args in the
 * cl environment, and run the generic client I/O path with CIT_READ.
 */
static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
	struct vvp_io_args *args;

	/* compute total byte count; rejects bad segments */
	result = ll_file_get_iov_count(iov, &nr_segs, &count);

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	args = vvp_env_args(env, IO_NORMAL);
	args->u.normal.via_iov = (struct iovec *)iov;	/* const cast: args stores it writably */
	args->u.normal.via_nrsegs = nr_segs;
	args->u.normal.via_iocb = iocb;

	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
				    &iocb->ki_pos, count);
	cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) path: wrap the user buffer in a single-segment
 * iovec and a sync kiocb held in env-local storage, then reuse
 * ll_file_aio_read().
 */
static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
	struct iovec *local_iov;
	struct kiocb *kiocb;

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	/* per-env scratch iovec/kiocb avoid stack allocation */
	local_iov = &vvp_env_info(env)->vti_local_iov;
	kiocb = &vvp_env_info(env)->vti_kiocb;
	local_iov->iov_base = (void __user *)buf;
	local_iov->iov_len = count;
	init_sync_kiocb(kiocb, file);
	kiocb->ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
	kiocb->ki_left = count;
	/* kernels without ki_left track the request size in ki_nbytes */
	kiocb->ki_nbytes = count;

	result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
	*ppos = kiocb->ki_pos;	/* propagate updated file position to caller */

	cl_env_put(env, &refcheck);
/*
 * Write to a file (through the page cache).
 *
 * AIO write entry point: mirror of ll_file_aio_read() but runs the
 * generic client I/O path with CIT_WRITE.
 */
static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
				 unsigned long nr_segs, loff_t pos)
	struct vvp_io_args *args;

	/* compute total byte count; rejects bad segments */
	result = ll_file_get_iov_count(iov, &nr_segs, &count);

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	args = vvp_env_args(env, IO_NORMAL);
	args->u.normal.via_iov = (struct iovec *)iov;	/* const cast: args stores it writably */
	args->u.normal.via_nrsegs = nr_segs;
	args->u.normal.via_iocb = iocb;

	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
				    &iocb->ki_pos, count);
	cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) path: wrap the user buffer in a single-segment
 * iovec and a sync kiocb held in env-local storage, then reuse
 * ll_file_aio_write().
 */
static ssize_t ll_file_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
	struct iovec *local_iov;
	struct kiocb *kiocb;

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	/* per-env scratch iovec/kiocb avoid stack allocation */
	local_iov = &vvp_env_info(env)->vti_local_iov;
	kiocb = &vvp_env_info(env)->vti_kiocb;
	local_iov->iov_base = (void __user *)buf;
	local_iov->iov_len = count;
	init_sync_kiocb(kiocb, file);
	kiocb->ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
	kiocb->ki_left = count;
	/* kernels without ki_left track the request size in ki_nbytes */
	kiocb->ki_nbytes = count;

	result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
	*ppos = kiocb->ki_pos;	/* propagate updated file position to caller */

	cl_env_put(env, &refcheck);
/*
 * Send file content (through pagecache) somewhere with helper.
 *
 * splice-read path: run the generic client I/O with the IO_SPLICE
 * subtype so pages are delivered to @pipe instead of a user buffer.
 */
static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
				   struct pipe_inode_info *pipe, size_t count,
	struct vvp_io_args *args;

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	args = vvp_env_args(env, IO_SPLICE);
	args->u.splice.via_pipe = pipe;
	args->u.splice.via_flags = flags;

	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
	cl_env_put(env, &refcheck);
/*
 * Apply a user-supplied striping EA (@lum) to @inode by re-opening the
 * file with an IT_OPEN intent carrying the layout. Fails with -EEXIST
 * when the inode already has a layout.
 */
int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
			     __u64 flags, struct lov_user_md *lum,
	struct lov_stripe_md *lsm = NULL;
	struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};

	lsm = ccc_inode_lsm_get(inode);
		/* striping already set: refuse to overwrite it */
		ccc_inode_lsm_put(inode, lsm);
		CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
		       PFID(ll_inode2fid(inode)));
		GOTO(out, rc = -EEXIST);

	/* hold the size lock so the layout cannot change under us */
	ll_inode_size_lock(inode);
	oit.it_flags |= MDS_OPEN_BY_FID;
	rc = ll_intent_file_open(file, lum, lum_size, &oit);
		GOTO(out_unlock, rc);
	rc = oit.d.lustre.it_status;
		GOTO(out_req_free, rc);

	/* the intent open succeeded; release the MDS open handle again */
	ll_release_openhandle(file->f_dentry, &oit);

	ll_inode_size_unlock(inode);
	ll_intent_release(&oit);
	ccc_inode_lsm_put(inode, lsm);
	/* re-enable delayed layout creation on this file */
	cl_lov_delay_create_clear(&file->f_flags);

	ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA for @filename (child of @inode) from the MDS.
 * On success *lmmp points into the reply buffer held by *request;
 * the caller owns the request and must finish it to free the EA.
 * The EA is byte-swapped to host endianness before being returned.
 */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;

	/* size the getattr reply buffer for the default MDS EA size */
	rc = ll_get_default_mdsize(sbi, &lmmsize);

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
		CDEBUG(D_INFO, "md_getattr_name failed "
		       "on %s: rc %d\n", filename, rc);

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->mbo_eadatasize;

	/* no striping EA present on the server side */
	if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
		GOTO(out, rc = -ENODATA);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	/* only plain V1/V3 layouts are understood here */
	if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
	    (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
		GOTO(out, rc = -EPROTO);

	/*
	 * This is coming from the MDS, so is probably in
	 * little endian. We convert it to host endian before
	 * passing it to userspace.
	 */
	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
		stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
		if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)

		/* if function called for directory - we should
		 * avoid swab not existent lsm objects */
		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
			if (S_ISREG(body->mbo_mode))
				lustre_swab_lov_user_md_objects(
				    ((struct lov_user_md_v1 *)lmm)->lmm_objects,
		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			lustre_swab_lov_user_md_v3(
				(struct lov_user_md_v3 *)lmm);
			if (S_ISREG(body->mbo_mode))
				lustre_swab_lov_user_md_objects(
				    ((struct lov_user_md_v3 *)lmm)->lmm_objects,

	*lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md with one trailing OST
 * object entry from user space and apply it through
 * ll_lov_setstripe_ea_info(). Requires CAP_SYS_ADMIN.
 */
static int ll_lov_setea(struct inode *inode, struct file *file,
	__u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
	struct lov_user_md *lump;
	/* room for the header plus exactly one OST object descriptor */
	int lum_size = sizeof(struct lov_user_md) +
		       sizeof(struct lov_user_ost_data);

	if (!cfs_capable(CFS_CAP_SYS_ADMIN))

	OBD_ALLOC_LARGE(lump, lum_size);

	if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
		OBD_FREE_LARGE(lump, lum_size);

	rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);

	OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it, then
 * refresh the layout generation and return the resulting striping back
 * to user space via LL_IOC_LOV_GETSTRIPE.
 */
static int ll_lov_setstripe(struct inode *inode, struct file *file,
	struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
	struct lov_user_md *klum;
	__u64 flags = FMODE_WRITE;

	/* copy (and size-check) the user md into a kernel buffer */
	rc = ll_copy_user_md(lum, &klum);

	rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
		struct lov_stripe_md *lsm;

		put_user(0, &lum->lmm_stripe_count);

		/* pick up the new layout before reporting it back */
		ll_layout_refresh(inode, &gen);
		lsm = ccc_inode_lsm_get(inode);
		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
		ccc_inode_lsm_put(inode, lsm);

	OBD_FREE(klum, lum_size);
/*
 * LL_IOC_LOV_GETSTRIPE handler: copy the inode's current striping to
 * the user buffer at @arg via obd_iocontrol() on the data export.
 */
static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
	struct lov_stripe_md *lsm;

	lsm = ccc_inode_lsm_get(inode);
	rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
			   lsm, (void __user *)arg);
	ccc_inode_lsm_put(inode, lsm);
/*
 * Take a group lock (gid = @arg) on behalf of this open file.
 * Only one group lock per file descriptor; re-checks under lli_lock
 * after the (potentially blocking) cl_get_grouplock() call because a
 * racing thread may have installed one meanwhile.
 */
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock grouplock;

	/* gid 0 is reserved / invalid for group locks */
	CWARN("group id for group lock must not be 0\n");

	if (ll_file_nolock(file))
		RETURN(-EOPNOTSUPP);

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already existed with gid %lu\n",
		      fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
	LASSERT(fd->fd_grouplock.cg_lock == NULL);
	spin_unlock(&lli->lli_lock);

	/* may block if O_NONBLOCK is not set */
	rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		spin_unlock(&lli->lli_lock);
		CERROR("another thread just won the race\n");
		cl_put_grouplock(&grouplock);

	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock (gid = @arg) held by this open file.
 * The stored grouplock is detached under lli_lock and released only
 * after dropping the spinlock, since cl_put_grouplock() may block.
 */
int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock grouplock;

	spin_lock(&lli->lli_lock);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		spin_unlock(&lli->lli_lock);
		CWARN("no group lock held\n");
	LASSERT(fd->fd_grouplock.cg_lock != NULL);

	/* the caller must release the same gid it acquired */
	if (fd->fd_grouplock.cg_gid != arg) {
		CWARN("group lock %lu doesn't match current id %lu\n",
		      arg, fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);

	/* detach under the lock, release outside it */
	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
	spin_unlock(&lli->lli_lock);

	cl_put_grouplock(&grouplock);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);
/**
 * Close inode open handle
 *
 * \param dentry [in]     dentry which contains the inode
 * \param it     [in,out] intent which contains open info and result
 *
 * \retval 0   success (also when there is no handle to close)
 * \retval <0  failure
 */
int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
	struct inode *inode = dentry->d_inode;
	struct obd_client_handle *och;

	/* Root ? Do nothing. */
	if (dentry->d_inode->i_sb->s_root == dentry)

	/* No open handle to close? Move away */
	if (!it_disposition(it, DISP_OPEN_OPEN))

	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);

	OBD_ALLOC(och, sizeof(*och));
		GOTO(out, rc = -ENOMEM);

	/* transfer the open handle from the intent into och, then close */
	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,

	/* this one is in place of ll_file_open */
	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->d.lustre.it_data);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Get size for inode for which FIEMAP mapping is requested.
 * Make the FIEMAP get_info call and returns the result.
 */
static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
	struct obd_export *exp = ll_i2dtexp(inode);
	struct lov_stripe_md *lsm = NULL;
	struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
	__u32 vallen = num_bytes;

	/* Checks for fiemap flags */
	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
		/* report back which flags we do support */
		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;

	/* Check for FIEMAP_FLAG_SYNC */
	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
		rc = filemap_fdatawrite(inode->i_mapping);

	lsm = ccc_inode_lsm_get(inode);

	/* If the stripe_count > 1 and the application does not understand
	 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
	 */
	if (lsm->lsm_stripe_count > 1 &&
	    !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
		GOTO(out, rc = -EOPNOTSUPP);

	fm_key.oa.o_oi = lsm->lsm_oi;
	fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;

	obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
	obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
	/* If filesize is 0, then there would be no objects for mapping */
	if (fm_key.oa.o_size == 0) {
		fiemap->fm_mapped_extents = 0;

	memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));

	rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
		CERROR("obd_get_info failed: rc = %d\n", rc);

	ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * The user supplies the path buffer length in gf_pathlen; the whole
 * getinfo_fid2path struct (plus path buffer) is round-tripped through
 * obd_iocontrol().
 */
int ll_fid2path(struct inode *inode, void __user *arg)
	struct obd_export *exp = ll_i2mdexp(inode);
	const struct getinfo_fid2path __user *gfin = arg;
	struct getinfo_fid2path *gfout;

	/* restricted unless admin or the fs allows user fid2path */
	if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))

	/* Only need to get the buflen */
	if (get_user(pathlen, &gfin->gf_pathlen))

	/* bound the allocation by the system path length limit */
	if (pathlen > PATH_MAX)

	outsize = sizeof(*gfout) + pathlen;
	OBD_ALLOC(gfout, outsize);

	if (copy_from_user(gfout, arg, sizeof(*gfout)))
		GOTO(gf_free, rc = -EFAULT);

	/* Call mdc_iocontrol */
	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);

	if (copy_to_user(arg, gfout, outsize))

	OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size and copy the user fiemap request into
 * a kernel buffer, run ll_do_fiemap(), and copy the mapped extents back.
 */
static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
	struct ll_user_fiemap *fiemap_s;
	size_t num_bytes, ret_bytes;
	unsigned int extent_count;

	/* Get the extent count so we can calculate the size of
	 * required fiemap buffer */
	if (get_user(extent_count,
		     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
	/* guard against num_bytes overflow below */
	    (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
	num_bytes = sizeof(*fiemap_s) + (extent_count *
					 sizeof(struct ll_fiemap_extent));

	OBD_ALLOC_LARGE(fiemap_s, num_bytes);
	if (fiemap_s == NULL)

	/* get the fiemap value */
	if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
		GOTO(error, rc = -EFAULT);

	/* If fm_extent_count is non-zero, read the first extent since
	 * it is used to calculate end_offset and device from previous
	 */
	if (copy_from_user(&fiemap_s->fm_extents[0],
			   (char __user *)arg + sizeof(*fiemap_s),
			   sizeof(struct ll_fiemap_extent)))
		GOTO(error, rc = -EFAULT);

	rc = ll_do_fiemap(inode, fiemap_s, num_bytes);

	/* copy back only the header plus the extents actually mapped */
	ret_bytes = sizeof(struct ll_user_fiemap);

	if (extent_count != 0)
		ret_bytes += (fiemap_s->fm_mapped_extents *
			      sizeof(struct ll_fiemap_extent));

	if (copy_to_user((void __user *)arg, fiemap_s, ret_bytes))

	OBD_FREE_LARGE(fiemap_s, num_bytes);
/*
 * Read the data_version for inode.
 *
 * This value is computed using stripe object version on OST.
 * Version is computed using server side locking.
 *
 * @param flags whether to sync on the OST side;
 *        LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
 *        LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
 */
int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
	struct lov_stripe_md *lsm = NULL;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct obdo *obdo = NULL;

	/* If no stripe, we consider version is 0. */
	lsm = ccc_inode_lsm_get(inode);
	if (!lsm_has_objects(lsm)) {
		CDEBUG(D_INODE, "No object for inode\n");

	OBD_ALLOC_PTR(obdo);
		GOTO(out, rc = -ENOMEM);

	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
		/* the OSTs did not return a data version */
		if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
			*data_version = obdo->o_data_version;

	ccc_inode_lsm_put(inode, lsm);
/*
 * Trigger a HSM release request for the provided inode.
 *
 * Takes an exclusive (write) lease, flushes and records the latest
 * data_version, merges size/time attributes, then closes the open
 * handle with the release information so the MDT can free the OST
 * objects.
 */
int ll_hsm_release(struct inode *inode)
	struct cl_env_nest nest;
	struct obd_client_handle *och = NULL;
	__u64 data_version = 0;

	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
	       ll_get_fsname(inode->i_sb, NULL, 0),
	       PFID(&ll_i2info(inode)->lli_fid));

	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
		GOTO(out, rc = PTR_ERR(och));

	/* Grab latest data_version and [am]time values */
	rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);

	env = cl_env_nested_get(&nest);
		GOTO(out, rc = PTR_ERR(env));

	ll_merge_lvb(env, inode);
	cl_env_nested_put(&nest, env);

	/* Release the file.
	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
	 * we still need it to pack l_remote_handle to MDT. */
	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,

	if (och != NULL && !IS_ERR(och)) /* close the file */
		ll_lease_close(och, inode, NULL);
/* Per-call state for ll_swap_layouts(): the two inodes being swapped,
 * their saved [am]time attributes, and whether each side's data_version
 * must be verified before the swap. */
struct ll_swap_stack {
	struct iattr ia1, ia2;		/* saved mtime/atime for restore */
	struct inode *inode1, *inode2;
	bool check_dv1, check_dv2;	/* verify data_version before swapping */
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * layouts of two regular files on the same filesystem via the MDT.
 * Optionally flushes dirty caches with a group lock, verifies
 * data_versions, and restores mtime/atime afterwards per lsl->sl_flags.
 */
static int ll_swap_layouts(struct file *file1, struct file *file2,
			   struct lustre_swap_layouts *lsl)
	struct mdc_swap_layouts msl;
	struct md_op_data *op_data;
	struct ll_swap_stack *llss = NULL;

	OBD_ALLOC_PTR(llss);

	llss->inode1 = file1->f_dentry->d_inode;
	llss->inode2 = file2->f_dentry->d_inode;

	/* both files must be regular and writable, on the same fs */
	if (!S_ISREG(llss->inode2->i_mode))
		GOTO(free, rc = -EINVAL);

	if (inode_permission(llss->inode1, MAY_WRITE) ||
	    inode_permission(llss->inode2, MAY_WRITE))
		GOTO(free, rc = -EPERM);

	if (llss->inode2->i_sb != llss->inode1->i_sb)
		GOTO(free, rc = -EXDEV);

	/* we use 2 bool because it is easier to swap than 2 bits */
	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
		llss->check_dv1 = true;

	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
		llss->check_dv2 = true;

	/* we cannot use lsl->sl_dvX directly because we may swap them */
	llss->dv1 = lsl->sl_dv1;
	llss->dv2 = lsl->sl_dv2;

	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
	if (rc == 0) /* same file, done! */

	if (rc < 0) { /* sequentialize it */
		/* order by FID to avoid lock inversion between callers */
		swap(llss->inode1, llss->inode2);
		swap(llss->dv1, llss->dv2);
		swap(llss->check_dv1, llss->check_dv2);

	if (gid != 0) { /* application asks to flush dirty cache */
		rc = ll_get_grouplock(llss->inode1, file1, gid);

		rc = ll_get_grouplock(llss->inode2, file2, gid);
			ll_put_grouplock(llss->inode1, file1, gid);

	/* to be able to restore mtime and atime after swap
	 * we need to first save them */
	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
		llss->ia1.ia_mtime = llss->inode1->i_mtime;
		llss->ia1.ia_atime = llss->inode1->i_atime;
		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
		llss->ia2.ia_mtime = llss->inode2->i_mtime;
		llss->ia2.ia_atime = llss->inode2->i_atime;
		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;

	/* ultimate check, before swaping the layouts we check if
	 * dataversion has changed (if requested) */
	if (llss->check_dv1) {
		rc = ll_data_version(llss->inode1, &dv, 0);
		if (dv != llss->dv1)
			GOTO(putgl, rc = -EAGAIN);

	if (llss->check_dv2) {
		rc = ll_data_version(llss->inode2, &dv, 0);
		if (dv != llss->dv2)
			GOTO(putgl, rc = -EAGAIN);

	/* struct md_op_data is used to send the swap args to the mdt
	 * only flags is missing, so we use struct mdc_swap_layouts
	 * through the md_op_data->op_data */
	/* flags from user space have to be converted before they are send to
	 * server, no flag is sent today, they are only used on the client */

	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
				     0, LUSTRE_OPC_ANY, &msl);
	if (IS_ERR(op_data))
		GOTO(free, rc = PTR_ERR(op_data));

	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
			   sizeof(*op_data), op_data, NULL);
	ll_finish_md_op_data(op_data);

		ll_put_grouplock(llss->inode2, file2, gid);
		ll_put_grouplock(llss->inode1, file1, gid);

	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */

	/* clear useless flags */
	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
		llss->ia1.ia_valid &= ~ATTR_MTIME;
		llss->ia2.ia_valid &= ~ATTR_MTIME;

	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
		llss->ia1.ia_valid &= ~ATTR_ATIME;
		llss->ia2.ia_valid &= ~ATTR_ATIME;

	/* update time if requested */
	/* note: ia2 is applied to inode1 (and vice versa) because the
	 * layouts were swapped */
	if (llss->ia2.ia_valid != 0) {
		mutex_lock(&llss->inode1->i_mutex);
		rc = ll_setattr(file1->f_dentry, &llss->ia2);
		mutex_unlock(&llss->inode1->i_mutex);

	if (llss->ia1.ia_valid != 0) {
		mutex_lock(&llss->inode2->i_mutex);
		rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
		mutex_unlock(&llss->inode2->i_mutex);
2165 struct md_op_data *op_data;
2168 /* Non-root users are forbidden to set or clear flags which are
2169 * NOT defined in HSM_USER_MASK. */
2170 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2171 !cfs_capable(CFS_CAP_SYS_ADMIN))
2174 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2175 LUSTRE_OPC_ANY, hss);
2176 if (IS_ERR(op_data))
2177 RETURN(PTR_ERR(op_data));
2179 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2180 sizeof(*op_data), op_data, NULL);
2182 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT handler: mark a regular file as archived/released
 * on the MDT, then force the attributes (mode, owner, size, times)
 * recorded in @hui onto the inode so it matches the archived copy.
 */
static int ll_hsm_import(struct inode *inode, struct file *file,
			 struct hsm_user_import *hui)
	struct hsm_state_set *hss = NULL;
	struct iattr *attr = NULL;

	if (!S_ISREG(inode->i_mode))

		GOTO(out, rc = -ENOMEM);

	/* set HSM flags: file exists in archive and data is released */
	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
	hss->hss_archive_id = hui->hui_archive_id;
	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
	rc = ll_hsm_state_set(inode, hss);

	OBD_ALLOC_PTR(attr);
		GOTO(out, rc = -ENOMEM);

	/* rebuild the attributes from the user-supplied import record */
	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
	attr->ia_mode |= S_IFREG;
	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
	attr->ia_size = hui->hui_size;
	attr->ia_mtime.tv_sec = hui->hui_mtime;
	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
	attr->ia_atime.tv_sec = hui->hui_atime;
	attr->ia_atime.tv_nsec = hui->hui_atime_ns;

	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
			 ATTR_UID | ATTR_GID |
			 ATTR_MTIME | ATTR_MTIME_SET |
			 ATTR_ATIME | ATTR_ATIME_SET;

	mutex_lock(&inode->i_mutex);

	rc = ll_setattr_raw(file->f_dentry, attr, true);

	mutex_unlock(&inode->i_mutex);
/* Map a kernel fmode_t to the LL_LEASE_{RD,WR}LCK bitmask that the
 * lease ioctls report to user space. */
static inline long ll_lease_type_from_fmode(fmode_t fmode)
	return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
	       ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main ioctl dispatcher for regular files: file flags, striping,
 * layout swap, fiemap, group locks, HSM operations, leases, data
 * version queries and OBD passthrough. Unknown commands fall through
 * to registered llioc handlers and finally obd_iocontrol() on the
 * data export.
 */
ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
	       PFID(ll_inode2fid(inode)), inode, cmd);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);

	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */

	case LL_IOC_GETFLAGS:
		/* Get the current value of the file flags */
		return put_user(fd->fd_flags, (int __user *)arg);
	case LL_IOC_SETFLAGS:
	case LL_IOC_CLRFLAGS:
		/* Set or clear specific file flags */
		/* XXX This probably needs checks to ensure the flags are
		 *     not abused, and to handle any flag side effects.
		 */
		if (get_user(flags, (int __user *) arg))

		if (cmd == LL_IOC_SETFLAGS) {
			/* lockless I/O is only safe with O_DIRECT */
			if ((flags & LL_FILE_IGNORE_LOCK) &&
			    !(file->f_flags & O_DIRECT)) {
				CERROR("%s: unable to disable locking on "
				       "non-O_DIRECT file\n", current->comm);

			fd->fd_flags |= flags;
			fd->fd_flags &= ~flags;

	case LL_IOC_LOV_SETSTRIPE:
		RETURN(ll_lov_setstripe(inode, file, arg));
	case LL_IOC_LOV_SETEA:
		RETURN(ll_lov_setea(inode, file, arg));
	case LL_IOC_LOV_SWAP_LAYOUTS: {
		struct lustre_swap_layouts lsl;

		if (copy_from_user(&lsl, (char __user *)arg,
				   sizeof(struct lustre_swap_layouts)))

		/* this file must be open for writing */
		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */

		file2 = fget(lsl.sl_fd);

		/* the other file must also be open for writing */
		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
			rc = ll_swap_layouts(file, file2, &lsl);

	case LL_IOC_LOV_GETSTRIPE:
		RETURN(ll_lov_getstripe(inode, arg));
	case FSFILT_IOC_FIEMAP:
		RETURN(ll_ioctl_fiemap(inode, arg));
	case FSFILT_IOC_GETFLAGS:
	case FSFILT_IOC_SETFLAGS:
		RETURN(ll_iocontrol(inode, file, cmd, arg));
	case FSFILT_IOC_GETVERSION_OLD:
	case FSFILT_IOC_GETVERSION:
		RETURN(put_user(inode->i_generation, (int __user *)arg));
	case LL_IOC_GROUP_LOCK:
		RETURN(ll_get_grouplock(inode, file, arg));
	case LL_IOC_GROUP_UNLOCK:
		RETURN(ll_put_grouplock(inode, file, arg));
	case IOC_OBD_STATFS:
		RETURN(ll_obd_statfs(inode, (void __user *)arg));

	/* We need to special case any other ioctls we want to handle,
	 * to send them to the MDS/OST as appropriate and to properly
	 * network encode the arg field.
	case FSFILT_IOC_SETVERSION_OLD:
	case FSFILT_IOC_SETVERSION:
	*/
	case LL_IOC_FLUSHCTX:
		RETURN(ll_flush_ctx(inode));
	case LL_IOC_PATH2FID: {
		if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
				 sizeof(struct lu_fid)))

	case LL_IOC_GETPARENT:
		RETURN(ll_getparent(file, (void __user *)arg));

	case OBD_IOC_FID2PATH:
		RETURN(ll_fid2path(inode, (void __user *)arg));
	case LL_IOC_DATA_VERSION: {
		struct ioc_data_version idv;

		if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))

		/* reject any flag bits other than the two flush modes */
		idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
		rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);

		    copy_to_user((char __user *)arg, &idv, sizeof(idv)))

	case LL_IOC_GET_MDTIDX: {
		mdtidx = ll_get_mdt_idx(inode);

		if (put_user((int)mdtidx, (int __user *)arg))

	case OBD_IOC_GETDTNAME:
	case OBD_IOC_GETMDNAME:
		RETURN(ll_get_obd_name(inode, cmd, arg));
	case LL_IOC_HSM_STATE_GET: {
		struct md_op_data *op_data;
		struct hsm_user_state *hus;

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hus);
		if (IS_ERR(op_data)) {
			RETURN(PTR_ERR(op_data));

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),

		if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))

		ll_finish_md_op_data(op_data);

	case LL_IOC_HSM_STATE_SET: {
		struct hsm_state_set *hss;

		if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {

		rc = ll_hsm_state_set(inode, hss);

	case LL_IOC_HSM_ACTION: {
		struct md_op_data *op_data;
		struct hsm_current_action *hca;

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hca);
		if (IS_ERR(op_data)) {
			RETURN(PTR_ERR(op_data));

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),

		if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))

		ll_finish_md_op_data(op_data);

	case LL_IOC_SET_LEASE: {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct obd_client_handle *och = NULL;

		/* requested lease mode must be compatible with open mode */
		case LL_LEASE_WRLCK:
			if (!(file->f_mode & FMODE_WRITE))
			fmode = FMODE_WRITE;
		case LL_LEASE_RDLCK:
			if (!(file->f_mode & FMODE_READ))
		case LL_LEASE_UNLCK:
			/* detach any held lease from the fd and close it */
			mutex_lock(&lli->lli_och_mutex);
			if (fd->fd_lease_och != NULL) {
				och = fd->fd_lease_och;
				fd->fd_lease_och = NULL;
			mutex_unlock(&lli->lli_och_mutex);

			fmode = och->och_flags;
			rc = ll_lease_close(och, inode, &lease_broken);

			RETURN(ll_lease_type_from_fmode(fmode));

		CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);

		/* apply for lease */
		och = ll_lease_open(inode, file, fmode, 0);
			RETURN(PTR_ERR(och));

		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och == NULL) {
			fd->fd_lease_och = och;
		mutex_unlock(&lli->lli_och_mutex);

		/* impossible now that only excl is supported for now */
		ll_lease_close(och, inode, &lease_broken);

	case LL_IOC_GET_LEASE: {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct ldlm_lock *lock = NULL;

		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och != NULL) {
			struct obd_client_handle *och = fd->fd_lease_och;

			lock = ldlm_handle2lock(&och->och_lease_handle);
				lock_res_and_lock(lock);
				/* a cancelled lease no longer counts */
				if (!ldlm_is_cancel(lock))
					fmode = och->och_flags;

				unlock_res_and_lock(lock);
				LDLM_LOCK_PUT(lock);

		mutex_unlock(&lli->lli_och_mutex);

		RETURN(ll_lease_type_from_fmode(fmode));

	case LL_IOC_HSM_IMPORT: {
		struct hsm_user_import *hui;

		if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {

		rc = ll_hsm_import(inode, file, hui);

		/* let the registered handlers try it before passing to OBD */
		    ll_iocontrol_call(inode, file, cmd, arg, &err))

		RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
				     (void __user *)arg));
#ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Compat helper for kernels lacking generic_file_llseek_size():
 * validate @offset against the file mode and @maxsize, then commit it
 * to f_pos.
 */
static inline loff_t
llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
	if (offset > maxsize)

	if (offset != file->f_pos) {
		file->f_pos = offset;
		/* position changed: reset the cached readdir/llseek version */
		file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size(): compute the
 * new position for the given seek @origin, bounded by @maxsize, using
 * @eof as the end-of-file reference.
 */
generic_file_llseek_size(struct file *file, loff_t offset, int origin,
			 loff_t maxsize, loff_t eof)
	struct inode *inode = file->f_dentry->d_inode;

	/*
	 * Here we special-case the lseek(fd, 0, SEEK_CUR)
	 * position-querying operation. Avoid rewriting the "same"
	 * f_pos value back to the file because a concurrent read(),
	 * write() or lseek() might have altered it
	 */
	/*
	 * f_lock protects against read/modify/write race with other
	 * SEEK_CURs. Note that parallel writes and reads behave
	 */
	mutex_lock(&inode->i_mutex);
	offset = llseek_execute(file, file->f_pos + offset, maxsize);
	mutex_unlock(&inode->i_mutex);

	/*
	 * In the generic case the entire file is data, so as long as
	 * offset isn't at the end of the file then the offset is data.
	 */

	/*
	 * There is a virtual hole at the end of the file, so as long as
	 * offset isn't i_size or larger, return i_size.
	 */

	return llseek_execute(file, offset, maxsize);
/*
 * llseek implementation: refresh the file size from the OSTs (glimpse)
 * when the target position depends on EOF, then delegate to the
 * generic llseek-with-size helper bounded by the fs maximum file size.
 */
static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
	struct inode *inode = file->f_dentry->d_inode;
	loff_t retval, eof = 0;

	retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
			   (origin == SEEK_CUR) ? file->f_pos : 0);
	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
	       PFID(ll_inode2fid(inode)), inode, retval, retval,
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);

	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
		/* i_size may be stale; ask the OSTs for the real size */
		retval = ll_glimpse_size(inode);

		eof = i_size_read(inode);

	retval = ll_generic_file_llseek_size(file, offset, origin,
					     ll_file_maxbytes(inode), eof);
/*
 * flush() file operation (called on close(2)): report any recorded
 * async writeback errors for this inode to the caller, unless the
 * application was already told about a write failure on this fd.
 */
static int ll_flush(struct file *file, fl_owner_t id)
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

	LASSERT(!S_ISDIR(inode->i_mode));

	/* catch async errors that were recorded back when async writeback
	 * failed for pages in this mapping. */
	rc = lli->lli_async_rc;
	lli->lli_async_rc = 0;
	if (lli->lli_clob != NULL) {
		err = lov_read_and_clear_async_rc(lli->lli_clob);

	/* The application has been told write failure already.
	 * Do not report failure again. */
	if (fd->fd_write_failed)
	return rc ? -EIO : 0;
2694 * Called to make sure a portion of file has been written out.
2695 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2697 * Return how many pages have been written.
2699 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2700 enum cl_fsync_mode mode, int ignore_layout)
2702 struct cl_env_nest nest;
2705 struct obd_capa *capa = NULL;
2706 struct cl_fsync_io *fio;
/* reject any mode outside the known cl_fsync_mode set */
2710 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2711 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2714 env = cl_env_nested_get(&nest);
2716 RETURN(PTR_ERR(env));
/* OSS write capability for the sync RPCs; released via fio->fi_capa */
2718 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2720 io = ccc_env_thread_io(env);
2721 io->ci_obj = cl_i2info(inode)->lli_clob;
2722 io->ci_ignore_layout = ignore_layout;
2724 /* initialize parameters for sync */
2725 fio = &io->u.ci_fsync;
2726 fio->fi_capa = capa;
2727 fio->fi_start = start;
2729 fio->fi_fid = ll_inode2fid(inode);
2730 fio->fi_mode = mode;
2731 fio->fi_nr_written = 0;
/* run the CIT_FSYNC io; on init failure fall back to io->ci_result */
2733 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2734 result = cl_io_loop(env, io);
2736 result = io->ci_result;
/* on success the return value is the page count written (see above) */
2738 result = fio->fi_nr_written;
2739 cl_io_fini(env, io);
2740 cl_env_nested_put(&nest, env);
2748 * When dentry is provided (the 'else' case), *file->f_dentry may be
2749 * null and dentry must be used directly rather than pulled from
2750 * *file->f_dentry as is done otherwise.
/*
 * fsync()/fdatasync() handler.  Three kernel-API variants are selected
 * by configure macros (4-arg with [start,end] range, 2-arg, and the old
 * dentry-taking form); the non-4-arg variants sync the whole file
 * (end = LLONG_MAX).  Flushes dirty pages, reports saved async write
 * errors, issues an MDS fsync, and for regular files a CL_FSYNC_ALL
 * range sync to the OSTs, updating fd_write_failed accordingly.
 */
2753 #ifdef HAVE_FILE_FSYNC_4ARGS
2754 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2756 struct dentry *dentry = file->f_dentry;
2757 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2758 int ll_fsync(struct file *file, int datasync)
2760 struct dentry *dentry = file->f_dentry;
2762 loff_t end = LLONG_MAX;
2764 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2767 loff_t end = LLONG_MAX;
2769 struct inode *inode = dentry->d_inode;
2770 struct ll_inode_info *lli = ll_i2info(inode);
2771 struct ptlrpc_request *req;
2772 struct obd_capa *oc;
2776 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2777 PFID(ll_inode2fid(inode)), inode);
2778 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2780 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg kernels: flush+wait the range ourselves, then take i_mutex */
2781 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2782 mutex_lock(&inode->i_mutex);
2784 /* fsync's caller has already called _fdata{sync,write}, we want
2785 * that IO to finish before calling the osc and mdc sync methods */
2786 rc = filemap_fdatawait(inode->i_mapping);
2789 /* catch async errors that were recorded back when async writeback
2790 * failed for pages in this mapping. */
2791 if (!S_ISDIR(inode->i_mode)) {
2792 err = lli->lli_async_rc;
2793 lli->lli_async_rc = 0;
2796 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDS (capa released after the RPC) */
2801 oc = ll_mdscapa_get(inode);
2802 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2808 ptlrpc_req_finished(req);
2810 if (S_ISREG(inode->i_mode)) {
2811 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* data sync to the OSTs; track per-fd write failure state */
2813 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2814 if (rc == 0 && err < 0)
2817 fd->fd_write_failed = true;
2819 fd->fd_write_failed = false;
2822 #ifdef HAVE_FILE_FSYNC_4ARGS
2823 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock handler (.flock and .lock in
 * ll_file_operations_flock).  Translates the kernel file_lock into an
 * LDLM_FLOCK enqueue on the MDS, then mirrors the result into the
 * local lock tables via flock_lock_file_wait()/posix_lock_file_wait().
 * On local bookkeeping failure the MDS lock is dropped again with an
 * LCK_NL (unlock) enqueue.  NOTE(review): interior lines are elided in
 * this excerpt.
 */
2829 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2831 struct inode *inode = file->f_dentry->d_inode;
2832 struct ll_sb_info *sbi = ll_i2sbi(inode);
2833 struct ldlm_enqueue_info einfo = {
2834 .ei_type = LDLM_FLOCK,
2835 .ei_cb_cp = ldlm_flock_completion_ast,
2836 .ei_cbdata = file_lock,
2838 struct md_op_data *op_data;
2839 struct lustre_handle lockh = {0};
2840 ldlm_policy_data_t flock = {{0}};
/* remember the caller's lock type; it is overwritten for the enqueue */
2841 int fl_type = file_lock->fl_type;
2847 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2848 PFID(ll_inode2fid(inode)), file_lock);
2850 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2852 if (file_lock->fl_flags & FL_FLOCK) {
2853 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2854 /* flocks are whole-file locks */
2855 flock.l_flock.end = OFFSET_MAX;
2856 /* For flocks owner is determined by the local file desctiptor*/
2857 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2858 } else if (file_lock->fl_flags & FL_POSIX) {
2859 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2860 flock.l_flock.start = file_lock->fl_start;
2861 flock.l_flock.end = file_lock->fl_end;
2865 flock.l_flock.pid = file_lock->fl_pid;
2867 /* Somewhat ugly workaround for svc lockd.
2868 * lockd installs custom fl_lmops->lm_compare_owner that checks
2869 * for the fl_owner to be the same (which it always is on local node
2870 * I guess between lockd processes) and then compares pid.
2871 * As such we assign pid to the owner field to make it all work,
2872 * conflict with normal locks is unlikely since pid space and
2873 * pointer space for current->files are not intersecting */
2874 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2875 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map the fcntl lock type onto an LDLM mode (PR=read, PW=write) */
2879 einfo.ei_mode = LCK_PR;
2882 /* An unlock request may or may not have any relation to
2883 * existing locks so we may not be able to pass a lock handle
2884 * via a normal ldlm_lock_cancel() request. The request may even
2885 * unlock a byte range in the middle of an existing lock. In
2886 * order to process an unlock request we need all of the same
2887 * information that is given with a normal read or write record
2888 * lock request. To avoid creating another ldlm unlock (cancel)
2889 * message we'll treat a LCK_NL flock request as an unlock. */
2890 einfo.ei_mode = LCK_NL;
2893 einfo.ei_mode = LCK_PW;
2896 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* non-blocking set request */
2911 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style query: test only, take no lock */
2917 flags = LDLM_FL_TEST_LOCK;
2920 CERROR("unknown fcntl lock command: %d\n", cmd);
2924 /* Save the old mode so that if the mode in the lock changes we
2925 * can decrement the appropriate reader or writer refcount. */
2926 file_lock->fl_type = einfo.ei_mode;
2928 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2929 LUSTRE_OPC_ANY, NULL);
2930 if (IS_ERR(op_data))
2931 RETURN(PTR_ERR(op_data));
2933 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2934 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2935 flock.l_flock.pid, flags, einfo.ei_mode,
2936 flock.l_flock.start, flock.l_flock.end);
2938 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2941 /* Restore the file lock type if not TEST lock. */
2942 if (!(flags & LDLM_FL_TEST_LOCK))
2943 file_lock->fl_type = fl_type;
/* mirror the server result into the local VFS lock tables */
2945 if ((file_lock->fl_flags & FL_FLOCK) &&
2946 (rc == 0 || file_lock->fl_type == F_UNLCK))
2947 rc2 = flock_lock_file_wait(file, file_lock);
2948 if ((file_lock->fl_flags & FL_POSIX) &&
2949 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2950 !(flags & LDLM_FL_TEST_LOCK))
2951 rc2 = posix_lock_file_wait(file, file_lock);
2953 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* local bookkeeping failed: release the lock we took on the MDS */
2954 einfo.ei_mode = LCK_NL;
2955 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2960 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name (length @namelen) under directory @parent by
 * sending a getattr-by-name RPC to the MDS.  On success *fid is filled
 * from the reply body.  Returns 0 or a negative errno.
 */
2965 int ll_get_fid_by_name(struct inode *parent, const char *name,
2966 int namelen, struct lu_fid *fid)
2968 struct md_op_data *op_data = NULL;
2969 struct mdt_body *body;
2970 struct ptlrpc_request *req;
2974 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2975 LUSTRE_OPC_ANY, NULL);
2976 if (IS_ERR(op_data))
2977 RETURN(PTR_ERR(op_data));
/* only the FID is needed from the reply */
2979 op_data->op_valid = OBD_MD_FLID;
2980 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2981 ll_finish_md_op_data(op_data);
2985 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2987 GOTO(out_req, rc = -EFAULT);
2989 *fid = body->mbo_fid1;
2991 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx.
 * Resolves the child FID either from the dcache (invalidating its
 * aliases) or via ll_get_fid_by_name(), skips the migration if the
 * object is already on the target MDT, and otherwise issues a
 * CLI_MIGRATE rename RPC.  NOTE(review): interior lines are elided in
 * this excerpt.
 */
2995 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
2996 const char *name, int namelen)
2998 struct dentry *dchild = NULL;
2999 struct inode *child_inode = NULL;
3000 struct md_op_data *op_data;
3001 struct ptlrpc_request *request = NULL;
3006 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3007 name, PFID(ll_inode2fid(parent)), mdtidx);
3009 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3010 0, LUSTRE_OPC_ANY, NULL);
3011 if (IS_ERR(op_data))
3012 RETURN(PTR_ERR(op_data));
3014 /* Get child FID first */
3015 qstr.hash = full_name_hash(name, namelen);
3018 dchild = d_lookup(file->f_dentry, &qstr);
3019 if (dchild != NULL && dchild->d_inode != NULL) {
3020 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
/* NOTE(review): this inner d_inode check is redundant — the outer
 * condition on line 3019 already guarantees it is non-NULL */
3021 if (dchild->d_inode != NULL) {
3022 child_inode = igrab(dchild->d_inode);
3023 ll_invalidate_aliases(child_inode);
/* not in dcache: ask the MDS for the FID */
3027 rc = ll_get_fid_by_name(parent, name, namelen,
3033 if (!fid_is_sane(&op_data->op_fid3)) {
3034 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3035 ll_get_fsname(parent->i_sb, NULL, 0), name,
3036 PFID(&op_data->op_fid3));
3037 GOTO(out_free, rc = -EINVAL);
3040 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* already on the requested MDT: nothing to do */
3045 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3046 PFID(&op_data->op_fid3), mdtidx);
3047 GOTO(out_free, rc = 0);
3050 op_data->op_mds = mdtidx;
3051 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is implemented as a rename onto the same name */
3052 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3053 namelen, name, namelen, &request);
3055 ll_update_times(request, parent);
3057 ptlrpc_req_finished(request);
3062 if (child_inode != NULL) {
/* drop the stale local inode after a successful migration */
3063 clear_nlink(child_inode);
3067 ll_finish_md_op_data(op_data);
/*
 * .flock/.lock handler for the -o noflock mount option (see
 * ll_file_operations_noflock below).  Body elided in this excerpt;
 * per the table's comment it returns ENOSYS for flock calls.
 */
3072 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3080 * test if some locks matching bits and l_req_mode are acquired
3081 * - bits can be in different locks
3082 * - if found clear the common lock bits in *bits
3083 * - the bits not found, are kept in *bits
3085 * \param bits [IN] searched lock bits [IN]
3086 * \param l_req_mode [IN] searched lock mode
3087 * \retval boolean, true iff all bits are found
3089 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3091 struct lustre_handle lockh;
3092 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four modes */
3093 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3094 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3103 fid = &ll_i2info(inode)->lli_fid;
3104 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3105 ldlm_lockname[mode]);
/* TEST_LOCK: match only, do not take references on the found lock */
3107 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested inodebit individually */
3108 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3109 policy.l_inodebits.bits = *bits & (1 << i);
3110 if (policy.l_inodebits.bits == 0)
3113 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3114 &policy, mode, &lockh)) {
3115 struct ldlm_lock *lock;
3117 lock = ldlm_handle2lock(&lockh);
3120 ~(lock->l_policy_data.l_inodebits.bits);
3121 LDLM_LOCK_PUT(lock);
/* clear the bit we just found; unfound bits remain in *bits */
3123 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match a granted MDS inodebits lock covering @bits on @inode.
 * Unlike ll_have_md_lock() this takes a reference (no TEST flag unless
 * supplied via @flags by the caller): on success the matched mode is
 * returned and *lockh holds the lock handle for the caller to release.
 */
3130 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3131 struct lustre_handle *lockh, __u64 flags,
3134 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3139 fid = &ll_i2info(inode)->lli_fid;
3140 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3142 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3143 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: translate
 * -ENOENT (object unlinked on the server) into success for special
 * files, and log any other failure.
 */
3148 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3150 /* Already unlinked. Just update nlink and return success */
3151 if (rc == -ENOENT) {
3153 /* This path cannot be hit for regular files unless in
3154 * case of obscure races, so no need to to validate
3156 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3158 } else if (rc != 0) {
/* expected permission/identity errors log at D_INFO, rest at D_ERROR */
3159 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3160 "%s: revalidate FID "DFID" error: rc = %d\n",
3161 ll_get_fsname(inode->i_sb, NULL, 0),
3162 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate the metadata attributes of @dentry's inode, requesting the
 * inodebits in @ibits.  Two paths: with OBD_CONNECT_ATTRFID an intent
 * getattr/lookup RPC by FID is used; otherwise, if no suitable MD lock
 * is already cached, a plain md_getattr() fetches the attributes.
 * NOTE(review): interior lines are elided in this excerpt.
 */
3168 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3170 struct inode *inode = dentry->d_inode;
3171 struct ptlrpc_request *req = NULL;
3172 struct obd_export *exp;
3176 LASSERT(inode != NULL);
3178 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3179 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3181 exp = ll_i2mdexp(inode);
3183 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3184 * But under CMD case, it caused some lock issues, should be fixed
3185 * with new CMD ibits lock. See bug 12718 */
3186 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3187 struct lookup_intent oit = { .it_op = IT_GETATTR };
3188 struct md_op_data *op_data;
/* lookup-only revalidation downgrades the intent */
3190 if (ibits == MDS_INODELOCK_LOOKUP)
3191 oit.it_op = IT_LOOKUP;
3193 /* Call getattr by fid, so do not provide name at all. */
3194 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3195 dentry->d_inode, NULL, 0, 0,
3196 LUSTRE_OPC_ANY, NULL);
3197 if (IS_ERR(op_data))
3198 RETURN(PTR_ERR(op_data));
3200 rc = md_intent_lock(exp, op_data, &oit, &req,
3201 &ll_md_blocking_ast, 0);
3202 ll_finish_md_op_data(op_data);
3204 rc = ll_inode_revalidate_fini(inode, rc);
3208 rc = ll_revalidate_it_finish(req, &oit, dentry);
3210 ll_intent_release(&oit);
3214 /* Unlinked? Unhash dentry, so it is not picked up later by
3215 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3216 here to preserve get_cwd functionality on 2.6.
3218 if (!dentry->d_inode->i_nlink)
3219 d_lustre_invalidate(dentry, 0);
3221 ll_lookup_finish_locks(&oit, dentry);
/* no ATTRFID support: only issue an RPC if no cached MD lock covers
 * the requested bits */
3222 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3223 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3224 obd_valid valid = OBD_MD_FLGETATTR;
3225 struct md_op_data *op_data;
/* for regular files also fetch striping EA (size learned from MDS) */
3228 if (S_ISREG(inode->i_mode)) {
3229 rc = ll_get_default_mdsize(sbi, &ealen);
3232 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3235 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3236 0, ealen, LUSTRE_OPC_ANY,
3238 if (IS_ERR(op_data))
3239 RETURN(PTR_ERR(op_data));
3241 op_data->op_valid = valid;
3242 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3243 * capa for this inode. Because we only keep capas of dirs
3245 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3246 ll_finish_md_op_data(op_data);
3248 rc = ll_inode_revalidate_fini(inode, rc);
3252 rc = ll_prep_inode(&inode, req, NULL, NULL);
3255 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes (via
 * md_merge_attr) and cache the aggregate size/nlink/times in the
 * ll_inode_info so ll_getattr()/revalidate can report them.
 */
3259 static int ll_merge_md_attr(struct inode *inode)
3261 struct cl_attr attr = { 0 };
/* only meaningful for striped dirs: a stripe descriptor must exist */
3264 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3265 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3270 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3271 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3273 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3274 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3275 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full inode revalidation: refresh metadata via __ll_inode_revalidate(),
 * then for non-regular objects copy cached LVB times into the inode
 * (merging striped-dir attributes first), and for regular files glimpse
 * the size from the OSTs unless an HSM restore is in progress.
 */
3281 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3283 struct inode *inode = dentry->d_inode;
3287 rc = __ll_inode_revalidate(dentry, ibits);
3291 /* if object isn't regular file, don't validate size */
3292 if (!S_ISREG(inode->i_mode)) {
3293 if (S_ISDIR(inode->i_mode) &&
3294 ll_i2info(inode)->lli_lsm_md != NULL) {
/* striped directory: aggregate per-stripe attributes first */
3295 rc = ll_merge_md_attr(inode);
3300 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3301 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3302 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3304 /* In case of restore, the MDT has the right size and has
3305 * already send it back without granting the layout lock,
3306 * inode is up-to-date so glimpse is useless.
3307 * Also to glimpse we need the layout, in case of a running
3308 * restore the MDT holds the layout lock so the glimpse will
3309 * block up to the end of restore (getattr will block)
3311 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3312 rc = ll_glimpse_size(inode);
/*
 * .getattr handler: revalidate UPDATE|LOOKUP bits, then fill *stat from
 * the inode.  32-bit-API clients get a hashed inode number built from
 * the FID; striped directories report their merged size/nlink cached by
 * ll_merge_md_attr().
 */
3317 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3319 struct inode *inode = de->d_inode;
3320 struct ll_sb_info *sbi = ll_i2sbi(inode);
3321 struct ll_inode_info *lli = ll_i2info(inode);
3324 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3325 MDS_INODELOCK_LOOKUP);
3326 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3331 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace: squeeze the FID into a 32-bit ino */
3332 if (ll_need_32bit_api(sbi))
3333 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3335 stat->ino = inode->i_ino;
3336 stat->mode = inode->i_mode;
3337 stat->uid = inode->i_uid;
3338 stat->gid = inode->i_gid;
3339 stat->rdev = inode->i_rdev;
3340 stat->atime = inode->i_atime;
3341 stat->mtime = inode->i_mtime;
3342 stat->ctime = inode->i_ctime;
3343 stat->blksize = 1 << inode->i_blkbits;
3344 stat->blocks = inode->i_blocks;
/* striped dirs report the aggregated values cached on the inode info */
3346 if (S_ISDIR(inode->i_mode) &&
3347 ll_i2info(inode)->lli_lsm_md != NULL) {
3348 stat->nlink = lli->lli_stripe_dir_nlink;
3349 stat->size = lli->lli_stripe_dir_size;
3351 stat->nlink = inode->i_nlink;
3352 stat->size = i_size_read(inode);
/*
 * .fiemap handler: marshal the kernel fiemap_extent_info into a Lustre
 * ll_user_fiemap buffer, run ll_do_fiemap(), and copy the mapped
 * extents back to the caller's extent array.
 */
3358 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3359 __u64 start, __u64 len)
3363 struct ll_user_fiemap *fiemap;
3364 unsigned int extent_count = fieinfo->fi_extents_max;
/* header plus room for the caller's maximum number of extents */
3366 num_bytes = sizeof(*fiemap) + (extent_count *
3367 sizeof(struct ll_fiemap_extent));
3368 OBD_ALLOC_LARGE(fiemap, num_bytes);
3373 fiemap->fm_flags = fieinfo->fi_flags;
3374 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3375 fiemap->fm_start = start;
3376 fiemap->fm_length = len;
/* seed only the first extent from the caller (one extent's worth) */
3377 if (extent_count > 0)
3378 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3379 sizeof(struct ll_fiemap_extent));
3381 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3383 fieinfo->fi_flags = fiemap->fm_flags;
3384 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
/* copy all mapped extents back to the user-visible structure */
3385 if (extent_count > 0)
3386 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3387 fiemap->fm_mapped_extents *
3388 sizeof(struct ll_fiemap_extent));
3390 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode.
 * lli_lock guards the cached lli_posix_acl pointer; the caller (VFS)
 * releases the returned reference.
 */
3394 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3396 struct ll_inode_info *lli = ll_i2info(inode);
3397 struct posix_acl *acl = NULL;
3400 spin_lock(&lli->lli_lock);
3401 /* VFS' acl_permission_check->check_acl will release the refcount */
3402 acl = posix_acl_dup(lli->lli_posix_acl);
3403 spin_unlock(&lli->lli_lock);
/*
 * ACL check callback passed to generic_permission() on kernels whose
 * generic_permission() takes a check_acl hook (i.e. not the 2-arg
 * form).  Signature varies with the kernel's 3- vs 4-arg API; without
 * CONFIG_FS_POSIX_ACL the hook is compiled out (body elided here).
 */
3408 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3410 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3411 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3413 ll_check_acl(struct inode *inode, int mask)
3416 # ifdef CONFIG_FS_POSIX_ACL
3417 struct posix_acl *acl;
3421 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* cannot take locks / block under RCU-walk; tell VFS to retry */
3422 if (flags & IPERM_FLAG_RCU)
3425 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3430 rc = posix_acl_permission(inode, acl, mask);
/* drop the reference taken by ll_get_acl() */
3431 posix_acl_release(acl);
3434 # else /* !CONFIG_FS_POSIX_ACL */
3436 # endif /* CONFIG_FS_POSIX_ACL */
3438 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * .permission handler (signature varies with the kernel permission
 * API).  Revalidates the root inode on first use, optionally squashes
 * root credentials (root_squash), and delegates the actual check to
 * remote-perm verification or generic permission with the ACL hook.
 * NOTE(review): interior lines are elided in this excerpt.
 */
3440 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3441 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3443 # ifdef HAVE_INODE_PERMISION_2ARGS
3444 int ll_inode_permission(struct inode *inode, int mask)
3446 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3451 struct ll_sb_info *sbi;
3452 struct root_squash_info *squash;
3453 struct cred *cred = NULL;
3454 const struct cred *old_cred = NULL;
3456 bool squash_id = false;
/* may not block during RCU path walk: bail out and let VFS retry */
3459 #ifdef MAY_NOT_BLOCK
3460 if (mask & MAY_NOT_BLOCK)
3462 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3463 if (flags & IPERM_FLAG_RCU)
3467 /* as root inode are NOT getting validated in lookup operation,
3468 * need to do it before permission check. */
3470 if (inode == inode->i_sb->s_root->d_inode) {
3471 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3472 MDS_INODELOCK_LOOKUP);
3477 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3478 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3480 /* squash fsuid/fsgid if needed */
3481 sbi = ll_i2sbi(inode);
3482 squash = &sbi->ll_squash;
/* squash only when configured, caller is root, and not disabled */
3483 if (unlikely(squash->rsi_uid != 0 &&
3484 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3485 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3489 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3490 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3491 squash->rsi_uid, squash->rsi_gid);
3493 /* update current process's credentials
3494 * and FS capability */
3495 cred = prepare_creds();
3499 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3500 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability from the squashed creds */
3501 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3502 if ((1 << cap) & CFS_CAP_FS_MASK)
3503 cap_lower(cred->cap_effective, cap);
3505 old_cred = override_creds(cred);
3508 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
/* remote clients check permission against the MDS-supplied perms */
3510 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3511 rc = lustre_check_remote_perm(inode, mask);
3513 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3515 /* restore current process's credentials and FS capability */
3517 revert_creds(old_cred);
3524 /* -o localflock - only provides locally consistent flock locks */
/* default file_operations table: no .flock/.lock entries, so flock is
 * handled locally by the VFS (locally consistent only) */
3525 struct file_operations ll_file_operations = {
3526 .read = ll_file_read,
3527 .aio_read = ll_file_aio_read,
3528 .write = ll_file_write,
3529 .aio_write = ll_file_aio_write,
3530 .unlocked_ioctl = ll_file_ioctl,
3531 .open = ll_file_open,
3532 .release = ll_file_release,
3533 .mmap = ll_file_mmap,
3534 .llseek = ll_file_seek,
3535 .splice_read = ll_file_splice_read,
/* file_operations for -o flock mounts: identical to the default table
 * but routes flock()/fcntl() locks through ll_file_flock() for
 * cluster-wide consistency */
3540 struct file_operations ll_file_operations_flock = {
3541 .read = ll_file_read,
3542 .aio_read = ll_file_aio_read,
3543 .write = ll_file_write,
3544 .aio_write = ll_file_aio_write,
3545 .unlocked_ioctl = ll_file_ioctl,
3546 .open = ll_file_open,
3547 .release = ll_file_release,
3548 .mmap = ll_file_mmap,
3549 .llseek = ll_file_seek,
3550 .splice_read = ll_file_splice_read,
3553 .flock = ll_file_flock,
3554 .lock = ll_file_flock
3557 /* These are for -o noflock - to return ENOSYS on flock calls */
3558 struct file_operations ll_file_operations_noflock = {
3559 .read = ll_file_read,
3560 .aio_read = ll_file_aio_read,
3561 .write = ll_file_write,
3562 .aio_write = ll_file_aio_write,
3563 .unlocked_ioctl = ll_file_ioctl,
3564 .open = ll_file_open,
3565 .release = ll_file_release,
3566 .mmap = ll_file_mmap,
3567 .llseek = ll_file_seek,
3568 .splice_read = ll_file_splice_read,
/* both lock entry points reject the request (see comment above) */
3571 .flock = ll_file_noflock,
3572 .lock = ll_file_noflock
/* inode_operations shared by all regular-file inodes */
3575 struct inode_operations ll_file_inode_operations = {
3576 .setattr = ll_setattr,
3577 .getattr = ll_getattr,
3578 .permission = ll_inode_permission,
3579 .setxattr = ll_setxattr,
3580 .getxattr = ll_getxattr,
3581 .listxattr = ll_listxattr,
3582 .removexattr = ll_removexattr,
3583 .fiemap = ll_fiemap,
/* newer kernels fetch ACLs via .get_acl instead of the check hook */
3584 #ifdef HAVE_IOP_GET_ACL
3585 .get_acl = ll_get_acl,
3589 /* dynamic ioctl number support routins */
/* registry of dynamically registered ioctl handlers, protected by a
 * reader/writer semaphore (readers: dispatch, writers: (un)register) */
3590 static struct llioc_ctl_data {
3591 struct rw_semaphore ioc_sem;
3592 struct list_head ioc_head;
3594 __RWSEM_INITIALIZER(llioc.ioc_sem),
3595 LIST_HEAD_INIT(llioc.ioc_head)
/* one registered handler: callback plus the ioctl commands it accepts
 * (iocd_cmd is a trailing variable-length array, iocd_count entries) */
3600 struct list_head iocd_list;
3601 unsigned int iocd_size;
3602 llioc_callback_t iocd_cb;
3603 unsigned int iocd_count;
3604 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler @cb for the @count commands in @cmd.
 * Returns an opaque cookie (used by ll_iocontrol_unregister) or NULL on
 * invalid arguments / allocation failure.
 */
3607 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3610 struct llioc_data *in_data = NULL;
/* validate: callback and command list required, bounded count */
3613 if (cb == NULL || cmd == NULL ||
3614 count > LLIOC_MAX_CMD || count < 0)
3617 size = sizeof(*in_data) + count * sizeof(unsigned int);
3618 OBD_ALLOC(in_data, size);
3619 if (in_data == NULL)
3622 memset(in_data, 0, sizeof(*in_data));
3623 in_data->iocd_size = size;
3624 in_data->iocd_cb = cb;
3625 in_data->iocd_count = count;
3626 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the writer lock */
3628 down_write(&llioc.ioc_sem);
3629 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3630 up_write(&llioc.ioc_sem);
/*
 * Unregister a handler previously returned by ll_iocontrol_register().
 * @magic is the cookie; a warning is logged if it is not found.
 */
3635 void ll_iocontrol_unregister(void *magic)
3637 struct llioc_data *tmp;
3642 down_write(&llioc.ioc_sem);
3643 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* size saved before the entry is freed below */
3645 unsigned int size = tmp->iocd_size;
3647 list_del(&tmp->iocd_list);
3648 up_write(&llioc.ioc_sem);
3650 OBD_FREE(tmp, size);
3654 up_write(&llioc.ioc_sem);
3656 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3659 EXPORT_SYMBOL(ll_iocontrol_register);
3660 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch @cmd to the registered dynamic ioctl handlers in order.
 * The handler's result code is stored via *rcp; iteration stops when a
 * handler returns LLIOC_STOP.
 */
3662 static enum llioc_iter
3663 ll_iocontrol_call(struct inode *inode, struct file *file,
3664 unsigned int cmd, unsigned long arg, int *rcp)
3666 enum llioc_iter ret = LLIOC_CONT;
3667 struct llioc_data *data;
3668 int rc = -EINVAL, i;
/* readers only: concurrent dispatch is allowed */
3670 down_read(&llioc.ioc_sem);
3671 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3672 for (i = 0; i < data->iocd_count; i++) {
3673 if (cmd != data->iocd_cmd[i])
3676 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3680 if (ret == LLIOC_STOP)
3683 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration @conf down to the cl_object stack for
 * @inode.  For OBJECT_CONF_SET the associated layout lock is made
 * matchable only after the layout is applied, and the cached layout
 * generation is updated from the new stripe md.
 */
3690 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3692 struct ll_inode_info *lli = ll_i2info(inode);
3693 struct cl_env_nest nest;
/* nothing to configure without a cl_object */
3698 if (lli->lli_clob == NULL)
3701 env = cl_env_nested_get(&nest);
3703 RETURN(PTR_ERR(env));
3705 result = cl_conf_set(env, lli->lli_clob, conf);
3706 cl_env_nested_put(&nest, env);
3708 if (conf->coc_opc == OBJECT_CONF_SET) {
3709 struct ldlm_lock *lock = conf->coc_lock;
3711 LASSERT(lock != NULL);
3712 LASSERT(ldlm_has_layout(lock));
3714 struct lustre_md *md = conf->u.coc_md;
3715 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3717 /* it can only be allowed to match after layout is
3718 * applied to inode otherwise false layout would be
3719 * seen. Applying layout shoud happen before dropping
3720 * the intent lock. */
3721 ldlm_lock_allow_match(lock);
3723 lli->lli_has_smd = lsm_has_objects(md->lsm);
3724 if (md->lsm != NULL)
3725 gen = md->lsm->lsm_layout_gen;
3728 DFID ": layout version change: %u -> %u\n",
3729 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3731 ll_layout_version_set(lli, gen);
3737 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If @lock's LVB does not already carry a ready layout, fetch the LOV
 * EA via a getxattr RPC and install it as the lock's l_lvb_data (freed
 * later by ldlm when the lock dies).  Returns 0 or a negative errno.
 */
3738 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3741 struct ll_sb_info *sbi = ll_i2sbi(inode);
3742 struct obd_capa *oc;
3743 struct ptlrpc_request *req;
3744 struct mdt_body *body;
3751 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3752 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3753 lock->l_lvb_data, lock->l_lvb_len);
/* already populated and ready: nothing to fetch */
3755 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3758 /* if layout lock was granted right away, the layout is returned
3759 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3760 * blocked and then granted via completion ast, we have to fetch
3761 * layout here. Please note that we can't use the LVB buffer in
3762 * completion AST because it doesn't have a large enough buffer */
3763 oc = ll_mdscapa_get(inode);
3764 rc = ll_get_default_mdsize(sbi, &lmmsize);
3766 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3767 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3773 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3775 GOTO(out, rc = -EPROTO);
3777 lmmsize = body->mbo_eadatasize;
3778 if (lmmsize == 0) /* empty layout */
3781 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3783 GOTO(out, rc = -EFAULT);
/* private copy: the reply buffer dies with the request */
3785 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3786 if (lvbdata == NULL)
3787 GOTO(out, rc = -ENOMEM);
3789 memcpy(lvbdata, lmm, lmmsize);
/* swap into the lock under its resource lock, freeing any old LVB */
3790 lock_res_and_lock(lock);
3791 if (lock->l_lvb_data != NULL)
3792 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3794 lock->l_lvb_data = lvbdata;
3795 lock->l_lvb_len = lmmsize;
3796 unlock_res_and_lock(lock);
3801 ptlrpc_req_finished(req);
3806 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Given a held layout lock (handle @lockh, mode @mode), fetch the
 * layout if needed, unpack it, and configure the cl_object of @inode
 * with it; *gen receives the resulting layout generation.  The lock
 * reference is dropped before returning; if the object is still busy
 * (-EBUSY) the function waits for in-flight IO via OBJECT_CONF_WAIT.
 * NOTE(review): interior lines are elided in this excerpt.
 */
3809 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3810 struct inode *inode, __u32 *gen, bool reconf)
3812 struct ll_inode_info *lli = ll_i2info(inode);
3813 struct ll_sb_info *sbi = ll_i2sbi(inode);
3814 struct ldlm_lock *lock;
3815 struct lustre_md md = { NULL };
3816 struct cl_object_conf conf;
3819 bool wait_layout = false;
3822 LASSERT(lustre_handle_is_used(lockh));
3824 lock = ldlm_handle2lock(lockh);
3825 LASSERT(lock != NULL);
3826 LASSERT(ldlm_has_layout(lock));
3828 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3829 PFID(&lli->lli_fid), inode, reconf);
3831 /* in case this is a caching lock and reinstate with new inode */
3832 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3834 lock_res_and_lock(lock);
3835 lvb_ready = ldlm_is_lvb_ready(lock);
3836 unlock_res_and_lock(lock);
3837 /* checking lvb_ready is racy but this is okay. The worst case is
3838 * that multi processes may configure the file on the same time. */
3840 if (lvb_ready || !reconf) {
3843 /* layout_gen must be valid if layout lock is not
3844 * cancelled and stripe has already set */
3845 *gen = ll_layout_version_get(lli);
3851 rc = ll_layout_fetch(inode, lock);
3855 /* for layout lock, lmm is returned in lock's lvb.
3856 * lvb_data is immutable if the lock is held so it's safe to access it
3857 * without res lock. See the description in ldlm_lock_decref_internal()
3858 * for the condition to free lvb_data of layout lock */
3859 if (lock->l_lvb_data != NULL) {
3860 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3861 lock->l_lvb_data, lock->l_lvb_len);
3863 *gen = LL_LAYOUT_GEN_EMPTY;
3865 *gen = md.lsm->lsm_layout_gen;
3868 CERROR("%s: file "DFID" unpackmd error: %d\n",
3869 ll_get_fsname(inode->i_sb, NULL, 0),
3870 PFID(&lli->lli_fid), rc);
3876 /* set layout to file. Unlikely this will fail as old layout was
3877 * surely eliminated */
3878 memset(&conf, 0, sizeof conf);
3879 conf.coc_opc = OBJECT_CONF_SET;
3880 conf.coc_inode = inode;
3881 conf.coc_lock = lock;
3882 conf.u.coc_md = &md;
3883 rc = ll_layout_conf(inode, &conf);
/* the unpacked stripe md is no longer needed after configuration */
3886 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3888 /* refresh layout failed, need to wait */
3889 wait_layout = rc == -EBUSY;
/* release the lock reference taken via ldlm_handle2lock() and the
 * caller's lock mode reference */
3893 LDLM_LOCK_PUT(lock);
3894 ldlm_lock_decref(lockh, mode);
3896 /* wait for IO to complete if it's still being used. */
3898 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3899 ll_get_fsname(inode->i_sb, NULL, 0),
3900 PFID(&lli->lli_fid), inode);
3902 memset(&conf, 0, sizeof conf);
3903 conf.coc_opc = OBJECT_CONF_WAIT;
3904 conf.coc_inode = inode;
3905 rc = ll_layout_conf(inode, &conf);
3909 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3910 ll_get_fsname(inode->i_sb, NULL, 0),
3911 PFID(&lli->lli_fid), rc);
3917 * This function checks if there exists a LAYOUT lock on the client side,
3918 * or enqueues it if it doesn't have one in cache.
3920 * This function will not hold layout lock so it may be revoked any time after
3921 * this function returns. Any operations depend on layout should be redone
3924 * This function should be called before lov_io_init() to get an uptodate
3925 * layout version, the caller should save the version number and after IO
3926 * is finished, this function should be called again to verify that layout
3927 * is not changed during IO time.
3929 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3931 struct ll_inode_info *lli = ll_i2info(inode);
3932 struct ll_sb_info *sbi = ll_i2sbi(inode);
3933 struct md_op_data *op_data;
3934 struct lookup_intent it;
3935 struct lustre_handle lockh;
3937 struct ldlm_enqueue_info einfo = {
3938 .ei_type = LDLM_IBITS,
3940 .ei_cb_bl = &ll_md_blocking_ast,
3941 .ei_cb_cp = &ldlm_completion_ast,
/* fast path: a known (non-NONE) generation or no layout-lock support */
3946 *gen = ll_layout_version_get(lli);
3947 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3951 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3952 LASSERT(S_ISREG(inode->i_mode));
3954 /* take layout lock mutex to enqueue layout lock exclusively. */
3955 mutex_lock(&lli->lli_layout_mutex);
3958 /* mostly layout lock is caching on the local side, so try to match
3959 * it before grabbing layout lock mutex. */
3960 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3961 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3962 if (mode != 0) { /* hit cached lock */
3963 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3967 mutex_unlock(&lli->lli_layout_mutex);
3971 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3972 0, 0, LUSTRE_OPC_ANY, NULL);
3973 if (IS_ERR(op_data)) {
3974 mutex_unlock(&lli->lli_layout_mutex);
3975 RETURN(PTR_ERR(op_data));
3978 /* have to enqueue one */
3979 memset(&it, 0, sizeof(it));
3980 it.it_op = IT_LAYOUT;
3981 lockh.cookie = 0ULL;
3983 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3984 ll_get_fsname(inode->i_sb, NULL, 0),
3985 PFID(&lli->lli_fid), inode);
3987 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* the intent may carry a request reference; release it */
3988 if (it.d.lustre.it_data != NULL)
3989 ptlrpc_req_finished(it.d.lustre.it_data);
3990 it.d.lustre.it_data = NULL;
3992 ll_finish_md_op_data(op_data);
/* take ownership of the granted mode out of the intent */
3994 mode = it.d.lustre.it_lock_mode;
3995 it.d.lustre.it_lock_mode = 0;
3996 ll_intent_drop_lock(&it);
3999 /* set lock data in case this is a new lock */
4000 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4001 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4005 mutex_unlock(&lli->lli_layout_mutex);
4011 * This function send a restore request to the MDT
4013 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4015 struct hsm_user_request *hur;
4019 len = sizeof(struct hsm_user_request) +
4020 sizeof(struct hsm_user_item);
4021 OBD_ALLOC(hur, len);
4025 hur->hur_request.hr_action = HUA_RESTORE;
4026 hur->hur_request.hr_archive_id = 0;
4027 hur->hur_request.hr_flags = 0;
4028 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4029 sizeof(hur->hur_user_item[0].hui_fid));
4030 hur->hur_user_item[0].hui_extent.offset = offset;
4031 hur->hur_user_item[0].hui_extent.length = length;
4032 hur->hur_request.hr_itemcount = 1;
4033 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,