4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
52 #include "cl_object.h"
55 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
57 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
60 static enum llioc_iter
61 ll_iocontrol_call(struct inode *inode, struct file *file,
62 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate per-open-file private data from the dedicated slab cache.
 * GFP_NOFS avoids re-entering the filesystem under memory pressure.
 * NOTE(review): allocation-failure path is elided in this dump — confirm
 * the caller sees NULL on OBD_SLAB_ALLOC_PTR_GFP failure. */
64 static struct ll_file_data *ll_file_data_get(void)
66 struct ll_file_data *fd;
68 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
72 fd->fd_write_failed = false;
/* Return per-open-file private data to the slab cache. Counterpart of
 * ll_file_data_get(). */
77 static void ll_file_data_put(struct ll_file_data *fd)
80 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current VFS attributes (mode, a/m/ctime, size, blocks,
 * flags), its ioepoch, the open file handle @fh and an MDS capability into
 * @op_data for a subsequent MDS close/setattr request. */
83 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
84 struct lustre_handle *fh)
86 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
87 op_data->op_attr.ia_mode = inode->i_mode;
88 op_data->op_attr.ia_atime = inode->i_atime;
89 op_data->op_attr.ia_mtime = inode->i_mtime;
90 op_data->op_attr.ia_ctime = inode->i_ctime;
91 op_data->op_attr.ia_size = i_size_read(inode);
92 op_data->op_attr_blocks = inode->i_blocks;
93 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
94 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
96 op_data->op_handle = *fh;
/* If local data modifications are pending, tell the MDS via the bias bit. */
97 op_data->op_capa1 = ll_mdscapa_get(inode);
99 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
100 op_data->op_bias |= MDS_DATA_MODIFIED;
104 * Closes the IO epoch and packs all the attributes into @op_data for
/* Always send mode and timestamps; size/blocks are only valid here when
 * SOM is disabled or the object is not a regular file (otherwise size
 * lives on the OSTs). */
107 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
108 struct obd_client_handle *och)
112 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
113 ATTR_MTIME | ATTR_MTIME_SET |
114 ATTR_CTIME | ATTR_CTIME_SET;
116 if (!(och->och_flags & FMODE_WRITE))
119 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
120 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* Close the epoch, then pack the handle/attrs for the MDS close RPC. */
122 ll_ioepoch_close(inode, op_data, &och, 0);
125 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
126 ll_prep_md_op_data(op_data, inode, NULL, NULL,
127 0, 0, LUSTRE_OPC_ANY, NULL);
/* Send the MDS close RPC for open handle @och on @inode.
 * A non-NULL @data_version turns the close into an HSM release.
 * On success with SOM enabled, may trigger a Size-on-MDS update or queue
 * DONE_WRITING handling. @och's cookie is poisoned afterwards; the request
 * reference is dropped at the end. */
131 static int ll_close_inode_openhandle(struct obd_export *md_exp,
133 struct obd_client_handle *och,
134 const __u64 *data_version)
136 struct obd_export *exp = ll_i2mdexp(inode);
137 struct md_op_data *op_data;
138 struct ptlrpc_request *req = NULL;
139 struct obd_device *obd = class_exp2obd(exp);
146 * XXX: in case of LMV, is this correct to access
149 CERROR("Invalid MDC connection handle "LPX64"\n",
150 ll_i2mdexp(inode)->exp_handle.h_cookie);
154 OBD_ALLOC_PTR(op_data);
156 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
158 ll_prepare_close(inode, op_data, och);
159 if (data_version != NULL) {
160 /* Pass in data_version implies release. */
161 op_data->op_bias |= MDS_HSM_RELEASE;
162 op_data->op_data_version = *data_version;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
166 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
167 rc = md_close(md_exp, op_data, och->och_mod, &req);
169 /* This close must have the epoch closed. */
170 LASSERT(epoch_close);
171 /* MDS has instructed us to obtain Size-on-MDS attribute from
172 * OSTs and send setattr to back to MDS. */
173 rc = ll_som_update(inode, op_data);
175 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
176 " failed: rc = %d\n",
177 ll_i2mdexp(inode)->exp_obd->obd_name,
178 PFID(ll_inode2fid(inode)), rc);
182 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
183 ll_i2mdexp(inode)->exp_obd->obd_name,
184 PFID(ll_inode2fid(inode)), rc);
187 /* DATA_MODIFIED flag was successfully sent on close, cancel data
188 * modification flag. */
189 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
190 struct ll_inode_info *lli = ll_i2info(inode);
192 spin_lock(&lli->lli_lock);
193 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
194 spin_unlock(&lli->lli_lock);
/* For HSM release, verify the server actually released the file. */
197 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
198 struct mdt_body *body;
199 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
200 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
204 ll_finish_md_op_data(op_data);
/* Epoch still open for a written regular file under SOM: defer to the
 * DONE_WRITING path instead of finishing here. */
208 if (exp_connect_som(exp) && !epoch_close &&
209 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
210 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
212 md_clear_open_replay_data(md_exp, och);
213 /* Free @och if it is not waiting for DONE_WRITING. */
214 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
217 if (req) /* This is close request */
218 ptlrpc_req_finished(req);
/* Drop the MDS open handle matching @fmode (write/exec/read) when the last
 * local user is gone. If other users still hold the handle, this is a
 * no-op. Closing is done outside lli_och_mutex via
 * ll_close_inode_openhandle(). */
222 int ll_md_real_close(struct inode *inode, fmode_t fmode)
224 struct ll_inode_info *lli = ll_i2info(inode);
225 struct obd_client_handle **och_p;
226 struct obd_client_handle *och;
/* Select the handle slot and use count for this open mode. */
231 if (fmode & FMODE_WRITE) {
232 och_p = &lli->lli_mds_write_och;
233 och_usecount = &lli->lli_open_fd_write_count;
234 } else if (fmode & FMODE_EXEC) {
235 och_p = &lli->lli_mds_exec_och;
236 och_usecount = &lli->lli_open_fd_exec_count;
238 LASSERT(fmode & FMODE_READ);
239 och_p = &lli->lli_mds_read_och;
240 och_usecount = &lli->lli_open_fd_read_count;
243 mutex_lock(&lli->lli_och_mutex);
244 if (*och_usecount > 0) {
245 /* There are still users of this handle, so skip
247 mutex_unlock(&lli->lli_och_mutex);
253 mutex_unlock(&lli->lli_och_mutex);
256 /* There might be a race and this handle may already
258 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: release group lock, lease and extra open
 * handle if present, drop this fd's reference on the mode-specific open
 * count, and only talk to the MDS (ll_md_real_close) when no matching
 * OPEN lock lets us skip it. Frees the ll_file_data at the end. */
265 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
268 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
269 struct ll_inode_info *lli = ll_i2info(inode);
273 /* clear group lock, if present */
274 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
275 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
277 if (fd->fd_lease_och != NULL) {
280 /* Usually the lease is not released when the
281 * application crashed, we need to release here. */
282 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
283 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
284 PFID(&lli->lli_fid), rc, lease_broken);
286 fd->fd_lease_och = NULL;
289 if (fd->fd_och != NULL) {
290 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
295 /* Let's see if we have good enough OPEN lock on the file and if
296 we can skip talking to MDS */
297 if (file->f_dentry->d_inode) { /* Can this ever be false? */
299 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
300 struct lustre_handle lockh;
301 struct inode *inode = file->f_dentry->d_inode;
302 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop this fd's reference from the matching open-mode counter. */
304 mutex_lock(&lli->lli_och_mutex);
305 if (fd->fd_omode & FMODE_WRITE) {
307 LASSERT(lli->lli_open_fd_write_count);
308 lli->lli_open_fd_write_count--;
309 } else if (fd->fd_omode & FMODE_EXEC) {
311 LASSERT(lli->lli_open_fd_exec_count);
312 lli->lli_open_fd_exec_count--;
315 LASSERT(lli->lli_open_fd_read_count);
316 lli->lli_open_fd_read_count--;
318 mutex_unlock(&lli->lli_och_mutex);
/* LDLM_FL_TEST_LOCK: probe only; fall back to a real MDS close when no
 * matching granted OPEN ibits lock is found. */
320 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
321 LDLM_IBITS, &policy, lockmode,
323 rc = ll_md_real_close(file->f_dentry->d_inode,
327 CERROR("released file has negative dentry: file = %p, "
328 "dentry = %p, name = %s\n",
329 file, file->f_dentry, file->f_dentry->d_name.name);
333 LUSTRE_FPRIVATE(file) = NULL;
334 ll_file_data_put(fd);
335 ll_capa_close(inode);
340 /* While this returns an error code, fput() the caller does not, so we need
341 * to make every effort to clean up all of our state here. Also, applications
342 * rarely check close errors and even if an error is returned they will not
343 * re-try the close call.
345 int ll_file_release(struct inode *inode, struct file *file)
347 struct ll_file_data *fd;
348 struct ll_sb_info *sbi = ll_i2sbi(inode);
349 struct ll_inode_info *lli = ll_i2info(inode);
353 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
354 PFID(ll_inode2fid(inode)), inode);
/* Remote-client ACL bookkeeping only applies to the root inode. */
356 #ifdef CONFIG_FS_POSIX_ACL
357 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
358 inode == inode->i_sb->s_root->d_inode) {
359 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
362 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
363 fd->fd_flags &= ~LL_FILE_RMTACL;
364 rct_del(&sbi->ll_rct, current_pid());
365 et_search_free(&sbi->ll_et, current_pid());
370 if (inode->i_sb->s_root != file->f_dentry)
371 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
372 fd = LUSTRE_FPRIVATE(file);
375 /* The last ref on @file, maybe not the the owner pid of statahead,
376 * because parent and child process can share the same file handle. */
377 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
378 ll_deauthorize_statahead(inode, fd);
/* Root inode never talks to the MDS on release; just free fd data. */
380 if (inode->i_sb->s_root == file->f_dentry) {
381 LUSTRE_FPRIVATE(file) = NULL;
382 ll_file_data_put(fd);
/* Propagate any async write errors recorded on the cl_object. */
386 if (!S_ISDIR(inode->i_mode)) {
387 if (lli->lli_clob != NULL)
388 lov_read_and_clear_async_rc(lli->lli_clob);
389 lli->lli_async_rc = 0;
392 rc = ll_md_close(sbi->ll_md_exp, inode, file);
394 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
395 libcfs_debug_dumplog();
/* Issue an intent-based open RPC to the MDS for @file.
 * @lmm/@lmmsize optionally carry striping to request; the dentry name is
 * only packed when the server lacks open-by-fid support and the name is
 * valid. On success the reply is used to update the inode and set lock
 * data; the intent's request/lock refs are dropped on exit. */
400 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
401 struct lookup_intent *itp)
403 struct dentry *de = file->f_dentry;
404 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
405 struct dentry *parent = de->d_parent;
406 const char *name = NULL;
408 struct md_op_data *op_data;
409 struct ptlrpc_request *req = NULL;
413 LASSERT(parent != NULL);
414 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
416 /* if server supports open-by-fid, or file name is invalid, don't pack
417 * name in open request */
418 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
419 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
420 name = de->d_name.name;
421 len = de->d_name.len;
424 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
425 name, len, 0, LUSTRE_OPC_ANY, NULL);
427 RETURN(PTR_ERR(op_data));
428 op_data->op_data = lmm;
429 op_data->op_data_size = lmmsize;
431 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
432 &ll_md_blocking_ast, 0);
433 ll_finish_md_op_data(op_data);
435 /* reason for keep own exit path - don`t flood log
436 * with messages with -ESTALE errors.
/* If the open was granted on the server, release the extra handle before
 * bailing out on error paths. */
438 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
439 it_open_error(DISP_OPEN_OPEN, itp))
441 ll_release_openhandle(de, itp);
445 if (it_disposition(itp, DISP_LOOKUP_NEG))
446 GOTO(out, rc = -ENOENT);
448 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
449 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
450 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
454 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
455 if (!rc && itp->d.lustre.it_lock_mode)
456 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
459 ptlrpc_req_finished(req);
460 ll_intent_drop_lock(itp);
466 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
467 * not believe attributes if a few ioepoch holders exist. Attributes for
468 * previous ioepoch if new one is opened are also skipped by MDS.
470 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
472 if (ioepoch && lli->lli_ioepoch != ioepoch) {
473 lli->lli_ioepoch = ioepoch;
474 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
475 ioepoch, PFID(&lli->lli_fid));
/* Populate @och (handle, fid, lease cookie, magic, flags) from the MDS
 * reply carried by intent @it, then register replay data for recovery.
 * Returns the md_set_open_replay_data() result. */
479 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
480 struct obd_client_handle *och)
482 struct ptlrpc_request *req = it->d.lustre.it_data;
483 struct mdt_body *body;
485 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
486 och->och_fh = body->mbo_handle;
487 och->och_fid = body->mbo_fid1;
488 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
489 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
490 och->och_flags = it->it_flags;
492 return md_set_open_replay_data(md_exp, och, it);
/* Finish an open locally: optionally fill @och from the intent reply and
 * open the returned ioepoch, then attach @fd as the file's private data
 * and initialize readahead and cl-context state. */
495 static int ll_local_open(struct file *file, struct lookup_intent *it,
496 struct ll_file_data *fd, struct obd_client_handle *och)
498 struct inode *inode = file->f_dentry->d_inode;
499 struct ll_inode_info *lli = ll_i2info(inode);
502 LASSERT(!LUSTRE_FPRIVATE(file));
507 struct ptlrpc_request *req = it->d.lustre.it_data;
508 struct mdt_body *body;
511 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
515 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
516 ll_ioepoch_open(lli, body->mbo_ioepoch);
519 LUSTRE_FPRIVATE(file) = fd;
520 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the access mode this fd was opened with for later close. */
521 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
523 /* ll_cl_context initialize */
524 rwlock_init(&fd->fd_lock);
525 INIT_LIST_HEAD(&fd->fd_lccs);
530 /* Open a file, and (for the very first open) create objects on the OSTs at
531 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
532 * creation or open until ll_lov_setstripe() ioctl is called.
534 * If we already have the stripe MD locally then we don't request it in
535 * md_open(), by passing a lmm_size = 0.
537 * It is up to the application to ensure no other processes open this file
538 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
539 * used. We might be able to avoid races of that sort by getting lli_open_sem
540 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
541 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
543 int ll_file_open(struct inode *inode, struct file *file)
545 struct ll_inode_info *lli = ll_i2info(inode);
546 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
547 .it_flags = file->f_flags };
548 struct obd_client_handle **och_p = NULL;
549 __u64 *och_usecount = NULL;
550 struct ll_file_data *fd;
554 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
555 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed in private_data by the lookup path. */
557 it = file->private_data; /* XXX: compat macro */
558 file->private_data = NULL; /* prevent ll_local_open assertion */
560 fd = ll_file_data_get();
562 GOTO(out_openerr, rc = -ENOMEM);
565 if (S_ISDIR(inode->i_mode))
566 ll_authorize_statahead(inode, fd);
/* Root inode: nothing to negotiate with the MDS, attach fd and done. */
568 if (inode->i_sb->s_root == file->f_dentry) {
569 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own open intent from f_flags. */
573 if (!it || !it->d.lustre.it_disposition) {
574 /* Convert f_flags into access mode. We cannot use file->f_mode,
575 * because everything but O_ACCMODE mask was stripped from
577 if ((oit.it_flags + 1) & O_ACCMODE)
579 if (file->f_flags & O_TRUNC)
580 oit.it_flags |= FMODE_WRITE;
582 /* kernel only call f_op->open in dentry_open. filp_open calls
583 * dentry_open after call to open_namei that checks permissions.
584 * Only nfsd_open call dentry_open directly without checking
585 * permissions and because of that this code below is safe. */
586 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
587 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
589 /* We do not want O_EXCL here, presumably we opened the file
590 * already? XXX - NFS implications? */
591 oit.it_flags &= ~O_EXCL;
593 /* bug20584, if "it_flags" contains O_CREAT, the file will be
594 * created if necessary, then "IT_CREAT" should be set to keep
595 * consistent with it */
596 if (oit.it_flags & O_CREAT)
597 oit.it_op |= IT_CREAT;
603 /* Let's see if we have file open on MDS already. */
604 if (it->it_flags & FMODE_WRITE) {
605 och_p = &lli->lli_mds_write_och;
606 och_usecount = &lli->lli_open_fd_write_count;
607 } else if (it->it_flags & FMODE_EXEC) {
608 och_p = &lli->lli_mds_exec_och;
609 och_usecount = &lli->lli_open_fd_exec_count;
611 och_p = &lli->lli_mds_read_och;
612 och_usecount = &lli->lli_open_fd_read_count;
615 mutex_lock(&lli->lli_och_mutex);
616 if (*och_p) { /* Open handle is present */
617 if (it_disposition(it, DISP_OPEN_OPEN)) {
618 /* Well, there's extra open request that we do not need,
619 let's close it somehow. This will decref request. */
620 rc = it_open_error(DISP_OPEN_OPEN, it);
622 mutex_unlock(&lli->lli_och_mutex);
623 GOTO(out_openerr, rc);
626 ll_release_openhandle(file->f_dentry, it);
/* Reuse the existing handle; NULL och means "don't fill a new one". */
630 rc = ll_local_open(file, it, fd, NULL);
633 mutex_unlock(&lli->lli_och_mutex);
634 GOTO(out_openerr, rc);
637 LASSERT(*och_usecount == 0);
638 if (!it->d.lustre.it_disposition) {
639 /* We cannot just request lock handle now, new ELC code
640 means that one of other OPEN locks for this file
641 could be cancelled, and since blocking ast handler
642 would attempt to grab och_mutex as well, that would
643 result in a deadlock */
644 mutex_unlock(&lli->lli_och_mutex);
646 * Normally called under two situations:
648 * 2. A race/condition on MDS resulting in no open
649 * handle to be returned from LOOKUP|OPEN request,
650 * for example if the target entry was a symlink.
652 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
654 * Always specify MDS_OPEN_BY_FID because we don't want
655 * to get file with different fid.
657 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
658 rc = ll_intent_file_open(file, NULL, 0, it);
660 GOTO(out_openerr, rc);
664 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
666 GOTO(out_och_free, rc = -ENOMEM);
670 /* md_intent_lock() didn't get a request ref if there was an
671 * open error, so don't do cleanup on the request here
673 /* XXX (green): Should not we bail out on any error here, not
674 * just open error? */
675 rc = it_open_error(DISP_OPEN_OPEN, it);
677 GOTO(out_och_free, rc);
679 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
680 "inode %p: disposition %x, status %d\n", inode,
681 it_disposition(it, ~0), it->d.lustre.it_status);
683 rc = ll_local_open(file, it, fd, *och_p);
685 GOTO(out_och_free, rc);
687 mutex_unlock(&lli->lli_och_mutex);
690 /* Must do this outside lli_och_mutex lock to prevent deadlock where
691 different kind of OPEN lock for this same inode gets cancelled
692 by ldlm_cancel_lru */
693 if (!S_ISREG(inode->i_mode))
694 GOTO(out_och_free, rc);
/* Delayed OST object creation: skip layout instantiation for reads or
 * when O_LOV_DELAY_CREATE was requested. */
698 if (!lli->lli_has_smd &&
699 (cl_is_lov_delay_create(file->f_flags) ||
700 (file->f_mode & FMODE_WRITE) == 0)) {
701 CDEBUG(D_INODE, "object creation was delayed\n");
702 GOTO(out_och_free, rc);
704 cl_lov_delay_create_clear(&file->f_flags);
705 GOTO(out_och_free, rc);
/* Error/cleanup paths: free the handle slot, undo statahead, free fd. */
709 if (och_p && *och_p) {
710 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
711 *och_p = NULL; /* OBD_FREE writes some magic there */
714 mutex_unlock(&lli->lli_och_mutex);
717 if (lli->lli_opendir_key == fd)
718 ll_deauthorize_statahead(inode, fd);
720 ll_file_data_put(fd);
722 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
725 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
726 ptlrpc_req_finished(it->d.lustre.it_data);
727 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on LDLM_CB_BLOCKING, asynchronously cancel
 * the lease lock (no openhandle handling here — see ll_lease_open()). */
733 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
734 struct ldlm_lock_desc *desc, void *data, int flag)
737 struct lustre_handle lockh;
741 case LDLM_CB_BLOCKING:
742 ldlm_lock2handle(lock, &lockh);
743 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
745 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
749 case LDLM_CB_CANCELING:
757 * Acquire a lease and open the file.
/* @fmode must be exactly FMODE_READ or FMODE_WRITE. When @file is given,
 * the existing openhandle is reused (only if this fd is the sole opener)
 * so the MDT can match ownership. Returns the new obd_client_handle or an
 * ERR_PTR; on failure after open, the handle is closed and NULL-ed. */
759 static struct obd_client_handle *
760 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
763 struct lookup_intent it = { .it_op = IT_OPEN };
764 struct ll_sb_info *sbi = ll_i2sbi(inode);
765 struct md_op_data *op_data;
766 struct ptlrpc_request *req = NULL;
767 struct lustre_handle old_handle = { 0 };
768 struct obd_client_handle *och = NULL;
773 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
774 RETURN(ERR_PTR(-EINVAL));
777 struct ll_inode_info *lli = ll_i2info(inode);
778 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
779 struct obd_client_handle **och_p;
782 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
783 RETURN(ERR_PTR(-EPERM));
785 /* Get the openhandle of the file */
787 mutex_lock(&lli->lli_och_mutex);
788 if (fd->fd_lease_och != NULL) {
789 mutex_unlock(&lli->lli_och_mutex);
793 if (fd->fd_och == NULL) {
794 if (file->f_mode & FMODE_WRITE) {
795 LASSERT(lli->lli_mds_write_och != NULL);
796 och_p = &lli->lli_mds_write_och;
797 och_usecount = &lli->lli_open_fd_write_count;
799 LASSERT(lli->lli_mds_read_och != NULL);
800 och_p = &lli->lli_mds_read_och;
801 och_usecount = &lli->lli_open_fd_read_count;
803 if (*och_usecount == 1) {
810 mutex_unlock(&lli->lli_och_mutex);
811 if (rc < 0) /* more than 1 opener */
814 LASSERT(fd->fd_och != NULL);
815 old_handle = fd->fd_och->och_fh;
820 RETURN(ERR_PTR(-ENOMEM));
822 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
823 LUSTRE_OPC_ANY, NULL);
825 GOTO(out, rc = PTR_ERR(op_data));
827 /* To tell the MDT this openhandle is from the same owner */
828 op_data->op_handle = old_handle;
830 it.it_flags = fmode | open_flags;
831 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
832 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
833 &ll_md_blocking_lease_ast,
834 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
835 * it can be cancelled which may mislead applications that the lease is
837 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
838 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
839 * doesn't deal with openhandle, so normal openhandle will be leaked. */
840 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
841 ll_finish_md_op_data(op_data);
842 ptlrpc_req_finished(req);
844 GOTO(out_release_it, rc);
846 if (it_disposition(&it, DISP_LOOKUP_NEG))
847 GOTO(out_release_it, rc = -ENOENT);
849 rc = it_open_error(DISP_OPEN_OPEN, &it);
851 GOTO(out_release_it, rc);
853 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
854 ll_och_fill(sbi->ll_md_exp, &it, och);
856 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
857 GOTO(out_close, rc = -EOPNOTSUPP);
859 /* already get lease, handle lease lock */
860 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
861 if (it.d.lustre.it_lock_mode == 0 ||
862 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
863 /* open lock must return for lease */
864 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
865 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
866 it.d.lustre.it_lock_bits);
867 GOTO(out_close, rc = -EPROTO);
870 ll_intent_release(&it);
874 /* Cancel open lock */
875 if (it.d.lustre.it_lock_mode != 0) {
876 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
877 it.d.lustre.it_lock_mode);
878 it.d.lustre.it_lock_mode = 0;
879 och->och_lease_handle.cookie = 0ULL;
881 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
883 CERROR("%s: error closing file "DFID": %d\n",
884 ll_get_fsname(inode->i_sb, NULL, 0),
885 PFID(&ll_i2info(inode)->lli_fid), rc2);
886 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
888 ll_intent_release(&it);
896 * Release lease and close the file.
897 * It will check if the lease has ever broken.
/* If the lease lock still exists and was not cancelled, cancel it now;
 * @lease_broken (optional) reports whether it had already been broken.
 * Finally closes the MDS openhandle. */
899 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
902 struct ldlm_lock *lock;
903 bool cancelled = true;
907 lock = ldlm_handle2lock(&och->och_lease_handle);
909 lock_res_and_lock(lock);
910 cancelled = ldlm_is_cancel(lock);
911 unlock_res_and_lock(lock);
915 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
916 PFID(&ll_i2info(inode)->lli_fid), cancelled);
919 ldlm_cli_cancel(&och->och_lease_handle, 0);
920 if (lease_broken != NULL)
921 *lease_broken = cancelled;
923 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
928 /* Fills the obdo with the attributes for the lsm */
/* Issues an async getattr over all stripes of @lsm and waits for the set.
 * @dv_flags with LL_DV_WR_FLUSH/LL_DV_RD_FLUSH requests server-side lock
 * flushing so the data version is stable. On success only the merged
 * attribute bits are kept valid in the obdo. */
929 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
930 struct obd_capa *capa, struct obdo *obdo,
931 __u64 ioepoch, int dv_flags)
933 struct ptlrpc_request_set *set;
934 struct obd_info oinfo = { { { 0 } } };
939 LASSERT(lsm != NULL);
943 oinfo.oi_oa->o_oi = lsm->lsm_oi;
944 oinfo.oi_oa->o_mode = S_IFREG;
945 oinfo.oi_oa->o_ioepoch = ioepoch;
946 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
947 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
948 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
949 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
950 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
951 OBD_MD_FLDATAVERSION;
952 oinfo.oi_capa = capa;
953 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
954 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
955 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
956 if (dv_flags & LL_DV_WR_FLUSH)
957 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
960 set = ptlrpc_prep_set();
962 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
965 rc = obd_getattr_async(exp, &oinfo, set);
967 rc = ptlrpc_set_wait(set);
968 ptlrpc_set_destroy(set);
/* Keep only the bits that were actually merged across stripes. */
971 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
972 OBD_MD_FLATIME | OBD_MD_FLMTIME |
973 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
974 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* A write-flush that the server did not acknowledge is an error. */
975 if (dv_flags & LL_DV_WR_FLUSH &&
976 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
977 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
984 * Performs the getattr on the inode and updates its fields.
985 * If @sync != 0, perform the getattr under the server-side lock.
987 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
988 __u64 ioepoch, int sync)
990 struct obd_capa *capa = ll_mdscapa_get(inode);
991 struct lov_stripe_md *lsm;
995 lsm = ccc_inode_lsm_get(inode);
996 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
997 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
/* On success, push the OST-derived attributes back into the inode. */
1000 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
1002 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1003 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1004 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1005 (unsigned long long)inode->i_blocks,
1006 1UL << inode->i_blkbits);
1008 ccc_inode_lsm_put(inode, lsm);
/* Merge inode attributes: take MDS-provided timestamps from lli, compare
 * them with the cl_object (OST-side) attributes, and keep the most recent
 * of each; also update i_size/i_blocks from the cl_object, all under the
 * inode size lock. */
1012 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1014 struct ll_inode_info *lli = ll_i2info(inode);
1015 struct cl_object *obj = lli->lli_clob;
1016 struct cl_attr *attr = ccc_env_thread_attr(env);
1024 ll_inode_size_lock(inode);
1026 /* merge timestamps the most recently obtained from mds with
1027 timestamps obtained from osts */
1028 LTIME_S(inode->i_atime) = lli->lli_atime;
1029 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1030 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1032 atime = LTIME_S(inode->i_atime);
1033 mtime = LTIME_S(inode->i_mtime);
1034 ctime = LTIME_S(inode->i_ctime);
1036 cl_object_attr_lock(obj);
1037 rc = cl_object_attr_get(env, obj, attr);
1038 cl_object_attr_unlock(obj);
1041 GOTO(out_size_unlock, rc);
/* Take whichever timestamp (MDS vs OST) is newer. */
1043 if (atime < attr->cat_atime)
1044 atime = attr->cat_atime;
1046 if (ctime < attr->cat_ctime)
1047 ctime = attr->cat_ctime;
1049 if (mtime < attr->cat_mtime)
1050 mtime = attr->cat_mtime;
1052 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1053 PFID(&lli->lli_fid), attr->cat_size);
1055 cl_isize_write_nolock(inode, attr->cat_size);
1056 inode->i_blocks = attr->cat_blocks;
1058 LTIME_S(inode->i_atime) = atime;
1059 LTIME_S(inode->i_mtime) = mtime;
1060 LTIME_S(inode->i_ctime) = ctime;
1063 ll_inode_size_unlock(inode);
/* Glimpse helper for ioctl callers: fetch OST attributes for @lsm and
 * copy size/blocks/timestamps into the user-visible stat structure. */
1068 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1071 struct obdo obdo = { 0 };
1074 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1076 st->st_size = obdo.o_size;
1077 st->st_blocks = obdo.o_blocks;
1078 st->st_mtime = obdo.o_mtime;
1079 st->st_atime = obdo.o_atime;
1080 st->st_ctime = obdo.o_ctime;
/* True when atime updates should be suppressed for @file, following the
 * same checks as the kernel's file_accessed()/touch_atime() (O_NOATIME,
 * S_NOATIME, mount/superblock noatime and nodiratime flags). */
1085 static bool file_is_noatime(const struct file *file)
1087 const struct vfsmount *mnt = file->f_path.mnt;
1088 const struct inode *inode = file->f_path.dentry->d_inode;
1090 /* Adapted from file_accessed() and touch_atime().*/
1091 if (file->f_flags & O_NOATIME)
1094 if (inode->i_flags & S_NOATIME)
1097 if (IS_NOATIME(inode))
1100 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1103 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1106 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io from the file's open flags: nonblock/append/sync for
 * writes, lock-request policy (never for nolock files, mandatory for
 * O_APPEND, maybe otherwise) and the noatime hint. */
1112 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1114 struct inode *inode = file->f_dentry->d_inode;
1116 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1118 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1119 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1120 file->f_flags & O_DIRECT ||
1123 io->ci_obj = ll_i2info(inode)->lli_clob;
1124 io->ci_lockreq = CILR_MAYBE;
1125 if (ll_file_nolock(file)) {
1126 io->ci_lockreq = CILR_NEVER;
1127 io->ci_no_srvlock = 1;
1128 } else if (file->f_flags & O_APPEND) {
1129 io->ci_lockreq = CILR_MANDATORY;
1132 io->ci_noatime = file_is_noatime(file);
/* Common read/write engine: set up a cl_io for @iot at @ppos/@count, take
 * the per-file range lock for non-grouplock writes, run the cl_io loop,
 * and update @ppos and the read/write byte statistics. Restarts the IO
 * when the cl layer requests it and nothing was transferred yet. */
1136 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1137 struct file *file, enum cl_io_type iot,
1138 loff_t *ppos, size_t count)
1140 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1141 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1144 struct range_lock range;
1147 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1148 file->f_dentry->d_name.name, iot, *ppos, count);
1151 io = ccc_env_thread_io(env);
1152 ll_io_init(io, file, iot == CIT_WRITE);
1154 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1155 struct vvp_io *vio = vvp_env_io(env);
1156 struct ccc_io *cio = ccc_env_io(env);
1157 bool range_locked = false;
/* O_APPEND writes must lock to EOF since the final offset is unknown. */
1159 if (file->f_flags & O_APPEND)
1160 range_lock_init(&range, 0, LUSTRE_EOF);
1162 range_lock_init(&range, *ppos, *ppos + count - 1);
1163 cio->cui_fd = LUSTRE_FPRIVATE(file);
1164 vio->cui_io_subtype = args->via_io_subtype;
1166 switch (vio->cui_io_subtype) {
1168 cio->cui_iov = args->u.normal.via_iov;
1169 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1170 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1171 cio->cui_iocb = args->u.normal.via_iocb;
1172 if ((iot == CIT_WRITE) &&
1173 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1174 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1176 result = range_lock(&lli->lli_write_tree,
1181 range_locked = true;
1183 down_read(&lli->lli_trunc_sem);
1186 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1187 vio->u.splice.cui_flags = args->u.splice.via_flags;
1190 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1194 ll_cl_add(file, env, io);
1195 result = cl_io_loop(env, io);
1196 ll_cl_remove(file, env);
1198 if (args->via_io_subtype == IO_NORMAL)
1199 up_read(&lli->lli_trunc_sem);
1201 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1203 range_unlock(&lli->lli_write_tree, &range);
1206 /* cl_io_rw_init() handled IO */
1207 result = io->ci_result;
1210 if (io->ci_nob > 0) {
1211 result = io->ci_nob;
1212 *ppos = io->u.ci_wr.wr.crw_pos;
1216 cl_io_fini(env, io);
1217 /* If any bit been read/written (result != 0), we just return
1218 * short read/write instead of restart io. */
1219 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1220 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zu\n",
1221 iot == CIT_READ ? "read" : "write",
1222 file->f_dentry->d_name.name, *ppos, count);
1223 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1227 if (iot == CIT_READ) {
1229 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1230 LPROC_LL_READ_BYTES, result);
1231 } else if (iot == CIT_WRITE) {
1233 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1234 LPROC_LL_WRITE_BYTES, result);
1235 fd->fd_write_failed = false;
1236 } else if (result != -ERESTARTSYS) {
1237 fd->fd_write_failed = true;
1240 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1247 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate a user-supplied iovec array and compute the total byte count.
 * Mirrors the checks done by the kernel's __generic_file_aio_write_nolock()
 * (see the XXX note above): rejects negative segment lengths / wrapping
 * totals, and verifies each segment's user buffer with access_ok().
 */
1249 static int ll_file_get_iov_count(const struct iovec *iov,
1250 unsigned long *nr_segs, size_t *count)
1255 for (seg = 0; seg < *nr_segs; seg++) {
1256 const struct iovec *iv = &iov[seg];
1259 * If any segment has a negative length, or the cumulative
1260 * length ever wraps negative then return -EINVAL.
1263 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1265 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
/* Bad segment: exclude it from the count (partial-iov semantics,
 * as in the kernel original). */
1270 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio_read entry point: validate the iovec, then drive the read through
 * the generic cl_io path (ll_file_io_generic with CIT_READ).
 * The per-thread lu_env is taken from/returned to the cl_env cache.
 */
1277 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1278 unsigned long nr_segs, loff_t pos)
1281 struct vvp_io_args *args;
1287 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1291 env = cl_env_get(&refcheck);
1293 RETURN(PTR_ERR(env));
/* Pack the iovec/iocb into the thread-local IO argument block
 * consumed by ll_file_io_generic(). */
1295 args = vvp_env_args(env, IO_NORMAL);
1296 args->u.normal.via_iov = (struct iovec *)iov;
1297 args->u.normal.via_nrsegs = nr_segs;
1298 args->u.normal.via_iocb = iocb;
1300 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1301 &iocb->ki_pos, count);
1302 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) entry point: wraps the user buffer in a single-segment
 * iovec plus a synchronous kiocb (both thread-local, from vvp_env_info) and
 * delegates to ll_file_aio_read(). The file position is propagated back
 * from kiocb->ki_pos on return.
 */
1306 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1310 struct iovec *local_iov;
1311 struct kiocb *kiocb;
1316 env = cl_env_get(&refcheck);
1318 RETURN(PTR_ERR(env));
1320 local_iov = &vvp_env_info(env)->vti_local_iov;
1321 kiocb = &vvp_env_info(env)->vti_kiocb;
1322 local_iov->iov_base = (void __user *)buf;
1323 local_iov->iov_len = count;
1324 init_sync_kiocb(kiocb, file);
1325 kiocb->ki_pos = *ppos;
/* Kernel-version compat: the remaining-byte field was renamed from
 * ki_left to ki_nbytes. */
1326 #ifdef HAVE_KIOCB_KI_LEFT
1327 kiocb->ki_left = count;
1329 kiocb->ki_nbytes = count;
1332 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1333 *ppos = kiocb->ki_pos;
1335 cl_env_put(env, &refcheck);
1340 * Write to a file (through the page cache).
/*
 * aio_write entry point: mirror image of ll_file_aio_read() — validate the
 * iovec, pack the thread-local IO args, and run the generic cl_io write
 * path (CIT_WRITE).
 */
1343 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1344 unsigned long nr_segs, loff_t pos)
1347 struct vvp_io_args *args;
1353 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1357 env = cl_env_get(&refcheck);
1359 RETURN(PTR_ERR(env));
1361 args = vvp_env_args(env, IO_NORMAL);
1362 args->u.normal.via_iov = (struct iovec *)iov;
1363 args->u.normal.via_nrsegs = nr_segs;
1364 args->u.normal.via_iocb = iocb;
1366 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1367 &iocb->ki_pos, count);
1368 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) entry point: builds a one-segment iovec and a sync
 * kiocb around the user buffer, then delegates to ll_file_aio_write().
 * Updates *ppos from kiocb->ki_pos afterwards.
 */
1372 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1373 size_t count, loff_t *ppos)
1376 struct iovec *local_iov;
1377 struct kiocb *kiocb;
1382 env = cl_env_get(&refcheck);
1384 RETURN(PTR_ERR(env));
1386 local_iov = &vvp_env_info(env)->vti_local_iov;
1387 kiocb = &vvp_env_info(env)->vti_kiocb;
1388 local_iov->iov_base = (void __user *)buf;
1389 local_iov->iov_len = count;
1390 init_sync_kiocb(kiocb, file);
1391 kiocb->ki_pos = *ppos;
/* Kernel-version compat: ki_left vs. ki_nbytes naming. */
1392 #ifdef HAVE_KIOCB_KI_LEFT
1393 kiocb->ki_left = count;
1395 kiocb->ki_nbytes = count;
1398 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1399 *ppos = kiocb->ki_pos;
1401 cl_env_put(env, &refcheck);
1406 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: route pagecache data into a pipe via the
 * generic cl_io read path, using the IO_SPLICE argument variant instead
 * of an iovec.
 */
1408 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1409 struct pipe_inode_info *pipe, size_t count,
1413 struct vvp_io_args *args;
1418 env = cl_env_get(&refcheck);
1420 RETURN(PTR_ERR(env));
1422 args = vvp_env_args(env, IO_SPLICE);
1423 args->u.splice.via_pipe = pipe;
1424 args->u.splice.via_flags = flags;
1426 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1427 cl_env_put(env, &refcheck);
/*
 * Set striping (LOV EA) on a file by re-opening it with an IT_OPEN intent
 * that carries the user's lov_user_md. Fails with -EEXIST if the inode
 * already has a layout. On success the transient open handle is released
 * again via ll_release_openhandle().
 */
1431 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1432 __u64 flags, struct lov_user_md *lum,
1435 struct lov_stripe_md *lsm = NULL;
1436 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* A non-NULL lsm means a layout already exists: striping can only be
 * set once, so bail out with -EEXIST. */
1440 lsm = ccc_inode_lsm_get(inode);
1442 ccc_inode_lsm_put(inode, lsm);
1443 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1444 PFID(ll_inode2fid(inode)));
1445 GOTO(out, rc = -EEXIST);
1448 ll_inode_size_lock(inode);
1449 oit.it_flags |= MDS_OPEN_BY_FID;
1450 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1452 GOTO(out_unlock, rc);
1453 rc = oit.d.lustre.it_status;
1455 GOTO(out_req_free, rc);
/* The intent open succeeded only to install the layout; close the
 * MDS open handle it created. */
1457 ll_release_openhandle(file->f_dentry, &oit);
1460 ll_inode_size_unlock(inode);
1461 ll_intent_release(&oit);
1462 ccc_inode_lsm_put(inode, lsm);
1464 cl_lov_delay_create_clear(&file->f_flags);
1467 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping metadata) of @filename from the MDS via
 * md_getattr_name(). On success *lmmp points into the reply buffer
 * (caller must keep *request alive and eventually finish it) and
 * *lmm_size is set. If the client is big-endian the EA is byte-swapped
 * to host order before being handed to userspace.
 */
1471 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1472 struct lov_mds_md **lmmp, int *lmm_size,
1473 struct ptlrpc_request **request)
1475 struct ll_sb_info *sbi = ll_i2sbi(inode);
1476 struct mdt_body *body;
1477 struct lov_mds_md *lmm = NULL;
1478 struct ptlrpc_request *req = NULL;
1479 struct md_op_data *op_data;
1482 rc = ll_get_default_mdsize(sbi, &lmmsize);
1486 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1487 strlen(filename), lmmsize,
1488 LUSTRE_OPC_ANY, NULL);
1489 if (IS_ERR(op_data))
1490 RETURN(PTR_ERR(op_data));
1492 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1493 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1494 ll_finish_md_op_data(op_data);
1496 CDEBUG(D_INFO, "md_getattr_name failed "
1497 "on %s: rc %d\n", filename, rc);
1501 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1502 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1504 lmmsize = body->mbo_eadatasize;
/* No EA present (or zero-sized) -> no striping to report. */
1506 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1508 GOTO(out, rc = -ENODATA);
1511 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1512 LASSERT(lmm != NULL);
/* Only plain v1/v3 LOV magics are accepted here. */
1514 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1515 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1516 GOTO(out, rc = -EPROTO);
1520 * This is coming from the MDS, so is probably in
1521 * little endian. We convert it to host endian before
1522 * passing it to userspace.
/* This condition is true only on big-endian hosts, where the
 * on-wire little-endian EA must be swabbed. */
1524 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1527 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1528 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1531 /* if function called for directory - we should
1532 * avoid swab not existent lsm objects */
1533 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1534 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1535 if (S_ISREG(body->mbo_mode))
1536 lustre_swab_lov_user_md_objects(
1537 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1539 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1540 lustre_swab_lov_user_md_v3(
1541 (struct lov_user_md_v3 *)lmm);
1542 if (S_ISREG(body->mbo_mode))
1543 lustre_swab_lov_user_md_objects(
1544 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1551 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: root-only path that copies a lov_user_md
 * (with one OST data entry) from userspace and installs it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS set.
 */
1556 static int ll_lov_setea(struct inode *inode, struct file *file,
1559 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1560 struct lov_user_md *lump;
1561 int lum_size = sizeof(struct lov_user_md) +
1562 sizeof(struct lov_user_ost_data);
/* Setting an EA with pre-existing objects is an administrative
 * operation: require CAP_SYS_ADMIN. */
1566 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1569 OBD_ALLOC_LARGE(lump, lum_size);
1573 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1574 OBD_FREE_LARGE(lump, lum_size);
1578 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1580 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md, apply it
 * through ll_lov_setstripe_ea_info(), then refresh the layout and copy
 * the resulting striping back to the user's buffer via the
 * LL_IOC_LOV_GETSTRIPE obd_iocontrol path.
 */
1584 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1587 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1588 struct lov_user_md *klum;
1590 __u64 flags = FMODE_WRITE;
1593 rc = ll_copy_user_md(lum, &klum);
1598 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1600 struct lov_stripe_md *lsm;
/* 0 stripe_count tells the GETSTRIPE path to fill in the actual
 * count rather than validating a caller-supplied one. */
1603 put_user(0, &lum->lmm_stripe_count);
1605 ll_layout_refresh(inode, &gen);
1606 lsm = ccc_inode_lsm_get(inode);
1607 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1609 ccc_inode_lsm_put(inode, lsm);
1612 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_LOV_GETSTRIPE handler: take a reference on the inode's layout
 * and let the LOV obd_iocontrol copy the striping out to userspace.
 */
1616 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1618 struct lov_stripe_md *lsm;
1622 lsm = ccc_inode_lsm_get(inode);
1624 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1625 lsm, (void __user *)arg);
1626 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: acquire a group (GID) extent lock for this
 * file descriptor. @arg is the group id (must be non-zero). The lock is
 * taken outside lli_lock, so a second check under the spinlock detects a
 * concurrent winner and drops the duplicate lock.
 */
1631 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1633 struct ll_inode_info *lli = ll_i2info(inode);
1634 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1635 struct ccc_grouplock grouplock;
1640 CWARN("group id for group lock must not be 0\n");
/* Group locks are meaningless when locking is disabled for the file. */
1644 if (ll_file_nolock(file))
1645 RETURN(-EOPNOTSUPP);
1647 spin_lock(&lli->lli_lock);
1648 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1649 CWARN("group lock already existed with gid %lu\n",
1650 fd->fd_grouplock.cg_gid);
1651 spin_unlock(&lli->lli_lock);
1654 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1655 spin_unlock(&lli->lli_lock);
/* Blocking acquisition unless the file was opened O_NONBLOCK. */
1657 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1658 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the lock: another thread may have installed a
 * group lock while we were acquiring ours. */
1662 spin_lock(&lli->lli_lock);
1663 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1664 spin_unlock(&lli->lli_lock);
1665 CERROR("another thread just won the race\n");
1666 cl_put_grouplock(&grouplock);
1670 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1671 fd->fd_grouplock = grouplock;
1672 spin_unlock(&lli->lli_lock);
1674 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held on this file
 * descriptor, verifying that one is held and that its gid matches @arg.
 * The lock state is detached from fd under lli_lock; the actual release
 * (cl_put_grouplock) happens after dropping the spinlock.
 */
1678 static int ll_put_grouplock(struct inode *inode, struct file *file,
1681 struct ll_inode_info *lli = ll_i2info(inode);
1682 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1683 struct ccc_grouplock grouplock;
1686 spin_lock(&lli->lli_lock);
1687 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1688 spin_unlock(&lli->lli_lock);
1689 CWARN("no group lock held\n");
1692 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1694 if (fd->fd_grouplock.cg_gid != arg) {
1695 CWARN("group lock %lu doesn't match current id %lu\n",
1696 arg, fd->fd_grouplock.cg_gid);
1697 spin_unlock(&lli->lli_lock);
/* Detach the lock from fd state first, then release it outside
 * the spinlock. */
1701 grouplock = fd->fd_grouplock;
1702 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1703 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1704 spin_unlock(&lli->lli_lock);
1706 cl_put_grouplock(&grouplock);
1707 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1712 * Close inode open handle
1714 * \param dentry [in] dentry which contains the inode
1715 * \param it [in,out] intent which contains open info and result
1718 * \retval <0 failure
/*
 * Close the MDS open handle carried by an open intent (see the \param
 * doc comment above). No-ops for the filesystem root and for intents
 * without a DISP_OPEN_OPEN disposition.
 */
1720 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1722 struct inode *inode = dentry->d_inode;
1723 struct obd_client_handle *och;
1729 /* Root ? Do nothing. */
1730 if (dentry->d_inode->i_sb->s_root == dentry)
1733 /* No open handle to close? Move away */
1734 if (!it_disposition(it, DISP_OPEN_OPEN))
1737 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1739 OBD_ALLOC(och, sizeof(*och));
1741 GOTO(out, rc = -ENOMEM);
/* Fill the client handle from the intent, then close it on the MDS. */
1743 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1745 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1748 /* this one is in place of ll_file_open */
1749 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1750 ptlrpc_req_finished(it->d.lustre.it_data);
1751 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1757 * Get size for inode for which FIEMAP mapping is requested.
1758 * Make the FIEMAP get_info call and returns the result.
/*
 * Build a fiemap info key from the inode's layout and forward the FIEMAP
 * request to the data export via obd_get_info(KEY_FIEMAP). Handles the
 * FIEMAP_FLAG_SYNC pre-flush, rejects unsupported flags (reporting the
 * supported set back in fm_flags), and short-circuits zero-size files.
 */
1760 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1763 struct obd_export *exp = ll_i2dtexp(inode);
1764 struct lov_stripe_md *lsm = NULL;
1765 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1766 __u32 vallen = num_bytes;
1770 /* Checks for fiemap flags */
1771 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back the flags we do support so userspace can retry. */
1772 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1776 /* Check for FIEMAP_FLAG_SYNC */
1777 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1778 rc = filemap_fdatawrite(inode->i_mapping);
1783 lsm = ccc_inode_lsm_get(inode);
1787 /* If the stripe_count > 1 and the application does not understand
1788 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1790 if (lsm->lsm_stripe_count > 1 &&
1791 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1792 GOTO(out, rc = -EOPNOTSUPP);
1794 fm_key.oa.o_oi = lsm->lsm_oi;
1795 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1797 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1798 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1799 /* If filesize is 0, then there would be no objects for mapping */
1800 if (fm_key.oa.o_size == 0) {
1801 fiemap->fm_mapped_extents = 0;
1805 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1807 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1810 CERROR("obd_get_info failed: rc = %d\n", rc);
1813 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC
 * (obd_iocontrol). Requires CAP_DAC_READ_SEARCH unless the mount allows
 * user fid2path (LL_SBI_USER_FID2PATH). The output buffer is sized from
 * the user-supplied gf_pathlen (bounded by PATH_MAX).
 */
1817 int ll_fid2path(struct inode *inode, void __user *arg)
1819 struct obd_export *exp = ll_i2mdexp(inode);
1820 const struct getinfo_fid2path __user *gfin = arg;
1822 struct getinfo_fid2path *gfout;
1828 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1829 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1832 /* Only need to get the buflen */
1833 if (get_user(pathlen, &gfin->gf_pathlen))
1836 if (pathlen > PATH_MAX)
1839 outsize = sizeof(*gfout) + pathlen;
1840 OBD_ALLOC(gfout, outsize);
/* Copy only the fixed header in; the MDC fills the path tail. */
1844 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1845 GOTO(gf_free, rc = -EFAULT);
1847 /* Call mdc_iocontrol */
1848 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1852 if (copy_to_user(arg, gfout, outsize))
1856 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size and allocate a kernel fiemap buffer
 * from the user's fm_extent_count (with overflow check against SIZE_MAX),
 * copy the request (and, when continuing a mapping, the first extent) in,
 * run ll_do_fiemap(), and copy header plus mapped extents back out.
 */
1860 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1862 struct ll_user_fiemap *fiemap_s;
1863 size_t num_bytes, ret_bytes;
1864 unsigned int extent_count;
1867 /* Get the extent count so we can calculate the size of
1868 * required fiemap buffer */
1869 if (get_user(extent_count,
1870 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* Guard the num_bytes multiplication below against overflow. */
1874 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1876 num_bytes = sizeof(*fiemap_s) + (extent_count *
1877 sizeof(struct ll_fiemap_extent));
1879 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1880 if (fiemap_s == NULL)
1883 /* get the fiemap value */
1884 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1886 GOTO(error, rc = -EFAULT);
1888 /* If fm_extent_count is non-zero, read the first extent since
1889 * it is used to calculate end_offset and device from previous
1892 if (copy_from_user(&fiemap_s->fm_extents[0],
1893 (char __user *)arg + sizeof(*fiemap_s),
1894 sizeof(struct ll_fiemap_extent)))
1895 GOTO(error, rc = -EFAULT);
1898 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy back the header plus only the extents actually mapped. */
1902 ret_bytes = sizeof(struct ll_user_fiemap);
1904 if (extent_count != 0)
1905 ret_bytes += (fiemap_s->fm_mapped_extents *
1906 sizeof(struct ll_fiemap_extent));
1908 if (copy_to_user((void __user *)arg, fiemap_s, ret_bytes))
1912 OBD_FREE_LARGE(fiemap_s, num_bytes);
1917 * Read the data_version for inode.
1919 * This value is computed using stripe object version on OST.
1920 * Version is computed using server side locking.
1922 * @param sync if do sync on the OST side;
1924 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1925 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Fetch the OST-computed data version of @inode into *data_version (see
 * the flag documentation above: LL_DV_RD_FLUSH / LL_DV_WR_FLUSH).
 * Files without objects report version 0.
 */
1927 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1929 struct lov_stripe_md *lsm = NULL;
1930 struct ll_sb_info *sbi = ll_i2sbi(inode);
1931 struct obdo *obdo = NULL;
1935 /* If no stripe, we consider version is 0. */
1936 lsm = ccc_inode_lsm_get(inode);
1937 if (!lsm_has_objects(lsm)) {
1939 CDEBUG(D_INODE, "No object for inode\n");
1943 OBD_ALLOC_PTR(obdo);
1945 GOTO(out, rc = -ENOMEM);
1947 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
/* The OSTs must actually have returned a data version. */
1949 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1952 *data_version = obdo->o_data_version;
1958 ccc_inode_lsm_put(inode, lsm);
1963 * Trigger a HSM release request for the provided inode.
/*
 * HSM release: take a write lease with MDS_OPEN_RELEASE, flush and grab
 * the latest data_version, merge attributes, then close the open handle
 * which performs the actual release on the MDT. The lease is closed on
 * the error path if it was successfully opened.
 */
1965 int ll_hsm_release(struct inode *inode)
1967 struct cl_env_nest nest;
1969 struct obd_client_handle *och = NULL;
1970 __u64 data_version = 0;
1974 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1975 ll_get_fsname(inode->i_sb, NULL, 0),
1976 PFID(&ll_i2info(inode)->lli_fid));
1978 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1980 GOTO(out, rc = PTR_ERR(och));
1982 /* Grab latest data_version and [am]time values */
1983 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1987 env = cl_env_nested_get(&nest);
1989 GOTO(out, rc = PTR_ERR(env));
1991 ll_merge_attr(env, inode);
1992 cl_env_nested_put(&nest, env);
1994 /* Release the file.
1995 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1996 * we still need it to pack l_remote_handle to MDT. */
1997 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2003 if (och != NULL && !IS_ERR(och)) /* close the file */
2004 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): saved [am]times (ia1/ia2), the two
 * inodes in canonical (FID) order, and whether each side's data version
 * must be verified before the swap. */
2009 struct ll_swap_stack {
2010 struct iattr ia1, ia2;
2012 struct inode *inode1, *inode2;
2013 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically swap the layouts of
 * two regular files on the same filesystem via the MDT. Optionally takes
 * group locks to flush dirty cache, verifies data versions if requested,
 * and can preserve mtime/atime across the swap (SWAP_LAYOUTS_KEEP_*).
 * Inodes are ordered by FID to avoid lock-ordering deadlocks.
 */
2016 static int ll_swap_layouts(struct file *file1, struct file *file2,
2017 struct lustre_swap_layouts *lsl)
2019 struct mdc_swap_layouts msl;
2020 struct md_op_data *op_data;
2023 struct ll_swap_stack *llss = NULL;
2026 OBD_ALLOC_PTR(llss);
2030 llss->inode1 = file1->f_dentry->d_inode;
2031 llss->inode2 = file2->f_dentry->d_inode;
2033 if (!S_ISREG(llss->inode2->i_mode))
2034 GOTO(free, rc = -EINVAL);
/* Both files must be writable by the caller. */
2036 if (inode_permission(llss->inode1, MAY_WRITE) ||
2037 inode_permission(llss->inode2, MAY_WRITE))
2038 GOTO(free, rc = -EPERM);
2040 if (llss->inode2->i_sb != llss->inode1->i_sb)
2041 GOTO(free, rc = -EXDEV);
2043 /* we use 2 bool because it is easier to swap than 2 bits */
2044 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2045 llss->check_dv1 = true;
2047 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2048 llss->check_dv2 = true;
2050 /* we cannot use lsl->sl_dvX directly because we may swap them */
2051 llss->dv1 = lsl->sl_dv1;
2052 llss->dv2 = lsl->sl_dv2;
2054 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2055 if (rc == 0) /* same file, done! */
/* Canonicalize ordering by FID so concurrent swaps take locks in a
 * consistent order. */
2058 if (rc < 0) { /* sequentialize it */
2059 swap(llss->inode1, llss->inode2);
2061 swap(llss->dv1, llss->dv2);
2062 swap(llss->check_dv1, llss->check_dv2);
2066 if (gid != 0) { /* application asks to flush dirty cache */
2067 rc = ll_get_grouplock(llss->inode1, file1, gid);
2071 rc = ll_get_grouplock(llss->inode2, file2, gid);
2073 ll_put_grouplock(llss->inode1, file1, gid);
2078 /* to be able to restore mtime and atime after swap
2079 * we need to first save them */
2081 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2082 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2083 llss->ia1.ia_atime = llss->inode1->i_atime;
2084 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2085 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2086 llss->ia2.ia_atime = llss->inode2->i_atime;
2087 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2090 /* ultimate check, before swapping the layouts we check if
2091 * dataversion has changed (if requested) */
2092 if (llss->check_dv1) {
2093 rc = ll_data_version(llss->inode1, &dv, 0);
2096 if (dv != llss->dv1)
2097 GOTO(putgl, rc = -EAGAIN);
2100 if (llss->check_dv2) {
2101 rc = ll_data_version(llss->inode2, &dv, 0);
2104 if (dv != llss->dv2)
2105 GOTO(putgl, rc = -EAGAIN);
2108 /* struct md_op_data is used to send the swap args to the mdt
2109 * only flags is missing, so we use struct mdc_swap_layouts
2110 * through the md_op_data->op_data */
2111 /* flags from user space have to be converted before they are sent to
2112 * the server; no flag is sent today, they are only used on the client */
2115 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2116 0, LUSTRE_OPC_ANY, &msl)
2117 if (IS_ERR(op_data))
2118 GOTO(free, rc = PTR_ERR(op_data));
2120 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2121 sizeof(*op_data), op_data, NULL);
2122 ll_finish_md_op_data(op_data);
2126 ll_put_grouplock(llss->inode2, file2, gid);
2127 ll_put_grouplock(llss->inode1, file1, gid);
2130 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2134 /* clear useless flags */
2135 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2136 llss->ia1.ia_valid &= ~ATTR_MTIME;
2137 llss->ia2.ia_valid &= ~ATTR_MTIME;
2140 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2141 llss->ia1.ia_valid &= ~ATTR_ATIME;
2142 llss->ia2.ia_valid &= ~ATTR_ATIME;
2145 /* update time if requested */
/* Note the crossed application: ia2 (saved from inode2) is applied to
 * file1 and vice versa, because the layouts (and data) were swapped. */
2147 if (llss->ia2.ia_valid != 0) {
2148 mutex_lock(&llss->inode1->i_mutex);
2149 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2150 mutex_unlock(&llss->inode1->i_mutex);
2153 if (llss->ia1.ia_valid != 0) {
2156 mutex_lock(&llss->inode2->i_mutex);
2157 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2158 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Set/clear HSM state flags on @inode via LL_IOC_HSM_STATE_SET on the MD
 * export. Flags outside HSM_USER_MASK are admin-only (CAP_SYS_ADMIN).
 */
2170 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2172 struct md_op_data *op_data;
2175 /* Non-root users are forbidden to set or clear flags which are
2176 * NOT defined in HSM_USER_MASK. */
2177 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2178 !cfs_capable(CFS_CAP_SYS_ADMIN))
2181 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2182 LUSTRE_OPC_ANY, hss);
2183 if (IS_ERR(op_data))
2184 RETURN(PTR_ERR(op_data));
2186 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2187 sizeof(*op_data), op_data, NULL);
2189 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file as archived+exists+released in the given
 * archive, then force its attributes (mode/uid/gid/size/times) to the
 * values recorded in @hui so the file looks like the archived copy.
 */
2194 static int ll_hsm_import(struct inode *inode, struct file *file,
2195 struct hsm_user_import *hui)
2197 struct hsm_state_set *hss = NULL;
2198 struct iattr *attr = NULL;
2202 if (!S_ISREG(inode->i_mode))
2208 GOTO(out, rc = -ENOMEM);
/* Step 1: set the HSM state so the file is treated as released. */
2210 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2211 hss->hss_archive_id = hui->hui_archive_id;
2212 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2213 rc = ll_hsm_state_set(inode, hss);
2217 OBD_ALLOC_PTR(attr);
2219 GOTO(out, rc = -ENOMEM);
/* Step 2: restore the archived attributes; ATTR_FORCE bypasses the
 * usual permission checks on the setattr. */
2221 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2222 attr->ia_mode |= S_IFREG;
2223 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2224 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2225 attr->ia_size = hui->hui_size;
2226 attr->ia_mtime.tv_sec = hui->hui_mtime;
2227 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2228 attr->ia_atime.tv_sec = hui->hui_atime;
2229 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2231 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2232 ATTR_UID | ATTR_GID |
2233 ATTR_MTIME | ATTR_MTIME_SET |
2234 ATTR_ATIME | ATTR_ATIME_SET;
2236 mutex_lock(&inode->i_mutex);
2238 rc = ll_setattr_raw(file->f_dentry, attr, true);
2242 mutex_unlock(&inode->i_mutex);
/* Map an open fmode to the LL_LEASE_{RD,WR}LCK bitmask reported to
 * userspace by the lease ioctls. */
2254 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2256 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2257 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main ioctl dispatcher for regular files: file flags, striping, layout
 * swap, fiemap, group locks, FID/path translation, data version, HSM
 * state/action/import, and lease get/set. Unrecognized commands are
 * offered to registered llioc handlers, then passed to the data export.
 */
2261 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2263 struct inode *inode = file->f_dentry->d_inode;
2264 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2268 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2269 PFID(ll_inode2fid(inode)), inode, cmd);
2270 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2272 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2273 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2277 case LL_IOC_GETFLAGS:
2278 /* Get the current value of the file flags */
2279 return put_user(fd->fd_flags, (int __user *)arg);
2280 case LL_IOC_SETFLAGS:
2281 case LL_IOC_CLRFLAGS:
2282 /* Set or clear specific file flags */
2283 /* XXX This probably needs checks to ensure the flags are
2284 * not abused, and to handle any flag side effects.
2286 if (get_user(flags, (int __user *) arg))
2289 if (cmd == LL_IOC_SETFLAGS) {
/* Locking can only be disabled for O_DIRECT files. */
2290 if ((flags & LL_FILE_IGNORE_LOCK) &&
2291 !(file->f_flags & O_DIRECT)) {
2292 CERROR("%s: unable to disable locking on "
2293 "non-O_DIRECT file\n", current->comm);
2297 fd->fd_flags |= flags;
2299 fd->fd_flags &= ~flags;
2302 case LL_IOC_LOV_SETSTRIPE:
2303 RETURN(ll_lov_setstripe(inode, file, arg));
2304 case LL_IOC_LOV_SETEA:
2305 RETURN(ll_lov_setea(inode, file, arg));
2306 case LL_IOC_LOV_SWAP_LAYOUTS: {
2308 struct lustre_swap_layouts lsl;
2310 if (copy_from_user(&lsl, (char __user *)arg,
2311 sizeof(struct lustre_swap_layouts)))
/* Both fds must be writable for a layout swap. */
2314 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2317 file2 = fget(lsl.sl_fd);
2322 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2323 rc = ll_swap_layouts(file, file2, &lsl);
2327 case LL_IOC_LOV_GETSTRIPE:
2328 RETURN(ll_lov_getstripe(inode, arg));
2329 case FSFILT_IOC_FIEMAP:
2330 RETURN(ll_ioctl_fiemap(inode, arg));
2331 case FSFILT_IOC_GETFLAGS:
2332 case FSFILT_IOC_SETFLAGS:
2333 RETURN(ll_iocontrol(inode, file, cmd, arg));
2334 case FSFILT_IOC_GETVERSION_OLD:
2335 case FSFILT_IOC_GETVERSION:
2336 RETURN(put_user(inode->i_generation, (int __user *)arg));
2337 case LL_IOC_GROUP_LOCK:
2338 RETURN(ll_get_grouplock(inode, file, arg));
2339 case LL_IOC_GROUP_UNLOCK:
2340 RETURN(ll_put_grouplock(inode, file, arg));
2341 case IOC_OBD_STATFS:
2342 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2344 /* We need to special case any other ioctls we want to handle,
2345 * to send them to the MDS/OST as appropriate and to properly
2346 * network encode the arg field.
2347 case FSFILT_IOC_SETVERSION_OLD:
2348 case FSFILT_IOC_SETVERSION:
2350 case LL_IOC_FLUSHCTX:
2351 RETURN(ll_flush_ctx(inode));
2352 case LL_IOC_PATH2FID: {
2353 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2354 sizeof(struct lu_fid)))
2359 case LL_IOC_GETPARENT:
2360 RETURN(ll_getparent(file, (void __user *)arg));
2362 case OBD_IOC_FID2PATH:
2363 RETURN(ll_fid2path(inode, (void __user *)arg));
2364 case LL_IOC_DATA_VERSION: {
2365 struct ioc_data_version idv;
2368 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the two flush flags are honored from userspace. */
2371 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2372 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2375 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2381 case LL_IOC_GET_MDTIDX: {
2384 mdtidx = ll_get_mdt_idx(inode);
2388 if (put_user((int)mdtidx, (int __user *)arg))
2393 case OBD_IOC_GETDTNAME:
2394 case OBD_IOC_GETMDNAME:
2395 RETURN(ll_get_obd_name(inode, cmd, arg));
2396 case LL_IOC_HSM_STATE_GET: {
2397 struct md_op_data *op_data;
2398 struct hsm_user_state *hus;
2405 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2406 LUSTRE_OPC_ANY, hus);
2407 if (IS_ERR(op_data)) {
2409 RETURN(PTR_ERR(op_data));
2412 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2415 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2418 ll_finish_md_op_data(op_data);
2422 case LL_IOC_HSM_STATE_SET: {
2423 struct hsm_state_set *hss;
2430 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2435 rc = ll_hsm_state_set(inode, hss);
2440 case LL_IOC_HSM_ACTION: {
2441 struct md_op_data *op_data;
2442 struct hsm_current_action *hca;
2449 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2450 LUSTRE_OPC_ANY, hca);
2451 if (IS_ERR(op_data)) {
2453 RETURN(PTR_ERR(op_data));
2456 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2459 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2462 ll_finish_md_op_data(op_data);
2466 case LL_IOC_SET_LEASE: {
2467 struct ll_inode_info *lli = ll_i2info(inode);
2468 struct obd_client_handle *och = NULL;
/* Requested lease mode must be compatible with the fd's open mode. */
2473 case LL_LEASE_WRLCK:
2474 if (!(file->f_mode & FMODE_WRITE))
2476 fmode = FMODE_WRITE;
2478 case LL_LEASE_RDLCK:
2479 if (!(file->f_mode & FMODE_READ))
2483 case LL_LEASE_UNLCK:
2484 mutex_lock(&lli->lli_och_mutex);
2485 if (fd->fd_lease_och != NULL) {
2486 och = fd->fd_lease_och;
2487 fd->fd_lease_och = NULL;
2489 mutex_unlock(&lli->lli_och_mutex);
2494 fmode = och->och_flags;
2495 rc = ll_lease_close(och, inode, &lease_broken);
2502 RETURN(ll_lease_type_from_fmode(fmode));
2507 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2509 /* apply for lease */
2510 och = ll_lease_open(inode, file, fmode, 0);
2512 RETURN(PTR_ERR(och));
2515 mutex_lock(&lli->lli_och_mutex);
2516 if (fd->fd_lease_och == NULL) {
2517 fd->fd_lease_och = och;
2520 mutex_unlock(&lli->lli_och_mutex);
2522 /* impossible now that only excl is supported for now */
2523 ll_lease_close(och, inode, &lease_broken);
2528 case LL_IOC_GET_LEASE: {
2529 struct ll_inode_info *lli = ll_i2info(inode);
2530 struct ldlm_lock *lock = NULL;
2533 mutex_lock(&lli->lli_och_mutex);
2534 if (fd->fd_lease_och != NULL) {
2535 struct obd_client_handle *och = fd->fd_lease_och;
/* Report the lease mode only while the DLM lease lock is
 * still valid (not cancelled). */
2537 lock = ldlm_handle2lock(&och->och_lease_handle);
2539 lock_res_and_lock(lock);
2540 if (!ldlm_is_cancel(lock))
2541 fmode = och->och_flags;
2543 unlock_res_and_lock(lock);
2544 LDLM_LOCK_PUT(lock);
2547 mutex_unlock(&lli->lli_och_mutex);
2549 RETURN(ll_lease_type_from_fmode(fmode));
2551 case LL_IOC_HSM_IMPORT: {
2552 struct hsm_user_import *hui;
2558 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2563 rc = ll_hsm_import(inode, file, hui);
/* Fall-through default: give registered llioc handlers a chance,
 * then forward to the data export. */
2573 ll_iocontrol_call(inode, file, cmd, arg, &err))
2576 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2577 (void __user *)arg));
2582 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Commit a computed seek offset: reject negative offsets (unless the fd
 * allows unsigned offsets) and offsets past maxsize, and update f_pos /
 * reset f_version only when the position actually changes.
 * Local fallback used when the kernel lacks generic_file_llseek_size().
 */
2583 static inline loff_t
2584 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2586 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2588 if (offset > maxsize)
2591 if (offset != file->f_pos) {
2592 file->f_pos = offset;
2593 file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size() for kernels
 * without HAVE_FILE_LLSEEK_SIZE: handles SEEK_CUR (including the f_pos
 * query fast path), SEEK_DATA and SEEK_HOLE against the supplied @eof,
 * and bounds the result by @maxsize via llseek_execute().
 */
2599 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2600 loff_t maxsize, loff_t eof)
2602 struct inode *inode = file->f_dentry->d_inode;
2610 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2611 * position-querying operation. Avoid rewriting the "same"
2612 * f_pos value back to the file because a concurrent read(),
2613 * write() or lseek() might have altered it
2618 * f_lock protects against read/modify/write race with other
2619 * SEEK_CURs. Note that parallel writes and reads behave
2622 mutex_lock(&inode->i_mutex);
2623 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2624 mutex_unlock(&inode->i_mutex);
2628 * In the generic case the entire file is data, so as long as
2629 * offset isn't at the end of the file then the offset is data.
2636 * There is a virtual hole at the end of the file, so as long as
2637 * offset isn't i_size or larger, return i_size.
2645 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA the file size must
 * be current, so glimpse it from the OSTs first; then delegate to the
 * generic llseek-size helper bounded by the filesystem's max file size.
 */
2649 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2651 struct inode *inode = file->f_dentry->d_inode;
2652 loff_t retval, eof = 0;
2655 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2656 (origin == SEEK_CUR) ? file->f_pos : 0);
2657 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2658 PFID(ll_inode2fid(inode)), inode, retval, retval,
2660 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* These origins depend on an up-to-date i_size: glimpse it first. */
2662 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2663 retval = ll_glimpse_size(inode);
2666 eof = i_size_read(inode);
2669 retval = ll_generic_file_llseek_size(file, offset, origin,
2670 ll_file_maxbytes(inode), eof);
/*
 * flush (close(2)-time) handler: report any async writeback error that
 * was recorded for this inode/object, but suppress it if this fd was
 * already told about a write failure. Returns -EIO on error, else 0.
 */
2674 static int ll_flush(struct file *file, fl_owner_t id)
2676 struct inode *inode = file->f_dentry->d_inode;
2677 struct ll_inode_info *lli = ll_i2info(inode);
2678 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2681 LASSERT(!S_ISDIR(inode->i_mode));
2683 /* catch async errors that were recorded back when async writeback
2684 * failed for pages in this mapping. */
2685 rc = lli->lli_async_rc;
2686 lli->lli_async_rc = 0;
2687 if (lli->lli_clob != NULL) {
2688 err = lov_read_and_clear_async_rc(lli->lli_clob);
2693 /* The application has been told write failure already.
2694 * Do not report failure again. */
2695 if (fd->fd_write_failed)
2697 return rc ? -EIO : 0;
/*
 * Flush [start, end] of the file through the cl_io CIT_FSYNC machinery.
 * @mode selects local flush vs. OST_SYNC vs. discard (validated below);
 * on success the number of pages written (fio->fi_nr_written) is returned.
 * NOTE(review): fragmentary excerpt — declarations of env/io/result and
 * some error paths are on lines missing from this view.
 */
2701 * Called to make sure a portion of file has been written out.
2702 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2704 * Return how many pages have been written.
2706 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2707 enum cl_fsync_mode mode, int ignore_layout)
2709 struct cl_env_nest nest;
2712 struct obd_capa *capa = NULL;
2713 struct cl_fsync_io *fio;
/* Reject any mode outside the known CL_FSYNC_* set. */
2717 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2718 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2721 env = cl_env_nested_get(&nest);
2723 RETURN(PTR_ERR(env));
2725 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2727 io = ccc_env_thread_io(env);
2728 io->ci_obj = cl_i2info(inode)->lli_clob;
2729 io->ci_ignore_layout = ignore_layout;
2731 /* initialize parameters for sync */
2732 fio = &io->u.ci_fsync;
2733 fio->fi_capa = capa;
2734 fio->fi_start = start;
2736 fio->fi_fid = ll_inode2fid(inode);
2737 fio->fi_mode = mode;
2738 fio->fi_nr_written = 0;
2740 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2741 result = cl_io_loop(env, io);
2743 result = io->ci_result;
2745 result = fio->fi_nr_written;
2746 cl_io_fini(env, io);
2747 cl_env_nested_put(&nest, env);
/*
 * fsync()/fdatasync() handler.  Three kernel-API variants are selected by
 * HAVE_FILE_FSYNC_4ARGS / HAVE_FILE_FSYNC_2ARGS.  Waits for dirty pages,
 * surfaces recorded async writeback errors, syncs metadata via md_fsync()
 * and, for regular files, data via cl_sync_file_range(CL_FSYNC_ALL).
 * NOTE(review): fragmentary excerpt — several error-handling and return
 * lines are missing from this view.
 */
2755 * When dentry is provided (the 'else' case), *file->f_dentry may be
2756 * null and dentry must be used directly rather than pulled from
2757 * *file->f_dentry as is done otherwise.
2760 #ifdef HAVE_FILE_FSYNC_4ARGS
2761 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2763 struct dentry *dentry = file->f_dentry;
2764 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2765 int ll_fsync(struct file *file, int datasync)
2767 struct dentry *dentry = file->f_dentry;
2769 loff_t end = LLONG_MAX;
2771 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2774 loff_t end = LLONG_MAX;
2776 struct inode *inode = dentry->d_inode;
2777 struct ll_inode_info *lli = ll_i2info(inode);
2778 struct ptlrpc_request *req;
2779 struct obd_capa *oc;
2783 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2784 PFID(ll_inode2fid(inode)), inode);
2785 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2787 #ifdef HAVE_FILE_FSYNC_4ARGS
2788 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2789 mutex_lock(&inode->i_mutex);
2791 /* fsync's caller has already called _fdata{sync,write}, we want
2792 * that IO to finish before calling the osc and mdc sync methods */
2793 rc = filemap_fdatawait(inode->i_mapping);
2796 /* catch async errors that were recorded back when async writeback
2797 * failed for pages in this mapping. */
2798 if (!S_ISDIR(inode->i_mode)) {
2799 err = lli->lli_async_rc;
2800 lli->lli_async_rc = 0;
2803 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync the metadata (MDT) side first, then the data (OST) side. */
2808 oc = ll_mdscapa_get(inode);
2809 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2815 ptlrpc_req_finished(req);
2817 if (S_ISREG(inode->i_mode)) {
2818 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2820 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2821 if (rc == 0 && err < 0)
/* Track per-fd write status so ll_flush() reports an error only once. */
2824 fd->fd_write_failed = true;
2826 fd->fd_write_failed = false;
2829 #ifdef HAVE_FILE_FSYNC_4ARGS
2830 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() byte-range lock handler.  Translates a VFS file_lock
 * (FL_FLOCK or FL_POSIX) into an LDLM_FLOCK enqueue against the MDT and
 * mirrors the result into the local lock tables so the kernel's deadlock
 * detection and bookkeeping stay consistent.
 * NOTE(review): fragmentary excerpt — several case labels, flag setup and
 * return lines are on source lines missing from this view.
 */
2836 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2838 struct inode *inode = file->f_dentry->d_inode;
2839 struct ll_sb_info *sbi = ll_i2sbi(inode);
2840 struct ldlm_enqueue_info einfo = {
2841 .ei_type = LDLM_FLOCK,
2842 .ei_cb_cp = ldlm_flock_completion_ast,
2843 .ei_cbdata = file_lock,
2845 struct md_op_data *op_data;
2846 struct lustre_handle lockh = {0};
2847 ldlm_policy_data_t flock = {{0}};
/* Saved so the TEST-lock path below can restore the caller's fl_type. */
2848 int fl_type = file_lock->fl_type;
2854 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2855 PFID(ll_inode2fid(inode)), file_lock);
2857 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2859 if (file_lock->fl_flags & FL_FLOCK) {
2860 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2861 /* flocks are whole-file locks */
2862 flock.l_flock.end = OFFSET_MAX;
2863 /* For flocks owner is determined by the local file descriptor */
2864 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2865 } else if (file_lock->fl_flags & FL_POSIX) {
2866 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2867 flock.l_flock.start = file_lock->fl_start;
2868 flock.l_flock.end = file_lock->fl_end;
2872 flock.l_flock.pid = file_lock->fl_pid;
2874 /* Somewhat ugly workaround for svc lockd.
2875 * lockd installs custom fl_lmops->lm_compare_owner that checks
2876 * for the fl_owner to be the same (which it always is on local node
2877 * I guess between lockd processes) and then compares pid.
2878 * As such we assign pid to the owner field to make it all work,
2879 * conflict with normal locks is unlikely since pid space and
2880 * pointer space for current->files are not intersecting */
2881 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2882 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type onto an LDLM mode (PR=read, PW=write, NL=unlock). */
2886 einfo.ei_mode = LCK_PR;
2889 /* An unlock request may or may not have any relation to
2890 * existing locks so we may not be able to pass a lock handle
2891 * via a normal ldlm_lock_cancel() request. The request may even
2892 * unlock a byte range in the middle of an existing lock. In
2893 * order to process an unlock request we need all of the same
2894 * information that is given with a normal read or write record
2895 * lock request. To avoid creating another ldlm unlock (cancel)
2896 * message we'll treat a LCK_NL flock request as an unlock. */
2897 einfo.ei_mode = LCK_NL;
2900 einfo.ei_mode = LCK_PW;
2903 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
2918 flags = LDLM_FL_BLOCK_NOWAIT;
2924 flags = LDLM_FL_TEST_LOCK;
2927 CERROR("unknown fcntl lock command: %d\n", cmd);
2931 /* Save the old mode so that if the mode in the lock changes we
2932 * can decrement the appropriate reader or writer refcount. */
2933 file_lock->fl_type = einfo.ei_mode;
2935 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2936 LUSTRE_OPC_ANY, NULL);
2937 if (IS_ERR(op_data))
2938 RETURN(PTR_ERR(op_data));
2940 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2941 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2942 flock.l_flock.pid, flags, einfo.ei_mode,
2943 flock.l_flock.start, flock.l_flock.end);
2945 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2948 /* Restore the file lock type if not TEST lock. */
2949 if (!(flags & LDLM_FL_TEST_LOCK))
2950 file_lock->fl_type = fl_type;
/* Mirror a granted (or unlock) result into the kernel's local lock lists. */
2952 if ((file_lock->fl_flags & FL_FLOCK) &&
2953 (rc == 0 || file_lock->fl_type == F_UNLCK))
2954 rc2 = flock_lock_file_wait(file, file_lock);
2955 if ((file_lock->fl_flags & FL_POSIX) &&
2956 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2957 !(flags & LDLM_FL_TEST_LOCK))
2958 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed: release the server-side lock to stay in sync. */
2960 if (rc2 && file_lock->fl_type != F_UNLCK) {
2961 einfo.ei_mode = LCK_NL;
2962 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2967 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via an MDS getattr-by-name RPC.
 * On success *fid is filled from the reply's MDT body.
 * NOTE(review): fragmentary excerpt — rc checks and RETURN lines are
 * missing from this view.
 */
2972 int ll_get_fid_by_name(struct inode *parent, const char *name,
2973 int namelen, struct lu_fid *fid)
2975 struct md_op_data *op_data = NULL;
2976 struct mdt_body *body;
2977 struct ptlrpc_request *req;
2981 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2982 LUSTRE_OPC_ANY, NULL);
2983 if (IS_ERR(op_data))
2984 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the getattr reply. */
2986 op_data->op_valid = OBD_MD_FLID;
2987 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2988 ll_finish_md_op_data(op_data);
2992 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2994 GOTO(out_req, rc = -EFAULT);
2996 *fid = body->mbo_fid1;
2998 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx using a
 * CLI_MIGRATE rename onto itself.  The child FID is found via the dcache
 * when possible, otherwise by an MDS lookup; a child already on the target
 * MDT is treated as success.
 * NOTE(review): fragmentary excerpt — qstr setup, some branches and the
 * iput/RETURN cleanup are on lines missing from this view.
 */
3002 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3003 const char *name, int namelen)
3005 struct dentry *dchild = NULL;
3006 struct inode *child_inode = NULL;
3007 struct md_op_data *op_data;
3008 struct ptlrpc_request *request = NULL;
3013 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3014 name, PFID(ll_inode2fid(parent)), mdtidx);
3016 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3017 0, LUSTRE_OPC_ANY, NULL);
3018 if (IS_ERR(op_data))
3019 RETURN(PTR_ERR(op_data));
3021 /* Get child FID first */
3022 qstr.hash = full_name_hash(name, namelen);
/* Prefer the cached dentry; fall back to an MDS lookup below. */
3025 dchild = d_lookup(file->f_dentry, &qstr);
3026 if (dchild != NULL && dchild->d_inode != NULL) {
3027 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3028 if (dchild->d_inode != NULL) {
3029 child_inode = igrab(dchild->d_inode);
3030 ll_invalidate_aliases(child_inode);
3034 rc = ll_get_fid_by_name(parent, name, namelen,
3040 if (!fid_is_sane(&op_data->op_fid3)) {
3041 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3042 ll_get_fsname(parent->i_sb, NULL, 0), name,
3043 PFID(&op_data->op_fid3));
3044 GOTO(out_free, rc = -EINVAL);
3047 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* Already on the requested MDT: nothing to migrate. */
3052 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3053 PFID(&op_data->op_fid3), mdtidx);
3054 GOTO(out_free, rc = 0);
3057 op_data->op_mds = mdtidx;
3058 op_data->op_cli_flags = CLI_MIGRATE;
3059 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3060 namelen, name, namelen, &request);
3062 ll_update_times(request, parent);
3064 ptlrpc_req_finished(request);
3069 if (child_inode != NULL) {
3070 clear_nlink(child_inode);
3074 ll_finish_md_op_data(op_data);
/*
 * Lock handler installed for the -o noflock mount option (see the
 * ll_file_operations_noflock table below).  Body not visible in this
 * fragmentary excerpt — presumably returns an error for all lock
 * requests; TODO confirm against the full source.
 */
3079 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * NOTE(review): fragmentary excerpt — loop braces and RETURN are on
 * lines missing from this view.
 */
3087 * test if some locks matching bits and l_req_mode are acquired
3088 * - bits can be in different locks
3089 * - if found clear the common lock bits in *bits
3090 * - the bits not found, are kept in *bits
3092 * \param bits [IN] searched lock bits [IN]
3093 * \param l_req_mode [IN] searched lock mode
3094 * \retval boolean, true iff all bits are found
3096 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3098 struct lustre_handle lockh;
3099 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four normal modes. */
3100 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3101 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3110 fid = &ll_i2info(inode)->lli_fid;
3111 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3112 ldlm_lockname[mode]);
3114 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe one inodebit at a time until all requested bits are accounted for. */
3115 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3116 policy.l_inodebits.bits = *bits & (1 << i);
3117 if (policy.l_inodebits.bits == 0)
3120 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3121 &policy, mode, &lockh)) {
3122 struct ldlm_lock *lock;
3124 lock = ldlm_handle2lock(&lockh);
3127 ~(lock->l_policy_data.l_inodebits.bits);
3128 LDLM_LOCK_PUT(lock);
3130 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a granted MDT inodebits lock
 * covering @bits; returns the matched mode (0 if none) with the handle
 * in *lockh.  NOTE(review): fragmentary excerpt — mode parameter and
 * RETURN are on lines missing from this view.
 */
3137 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3138 struct lustre_handle *lockh, __u64 flags,
3141 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3146 fid = &ll_i2info(inode)->lli_fid;
3147 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3149 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3150 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process a revalidate RPC result: -ENOENT on a (non-regular,
 * non-directory) inode is converted to success since the object was
 * simply unlinked; other errors are logged.
 * NOTE(review): fragmentary excerpt — the nlink update and return are
 * on lines missing from this view.
 */
3155 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3157 /* Already unlinked. Just update nlink and return success */
3158 if (rc == -ENOENT) {
3160 /* This path cannot be hit for regular files unless in
3161 * case of obscure races, so no need to to validate
3163 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3165 } else if (rc != 0) {
/* Expected races (-EACCES/-EIDRM) are logged quietly, others loudly. */
3166 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3167 "%s: revalidate FID "DFID" error: rc = %d\n",
3168 ll_get_fsname(inode->i_sb, NULL, 0),
3169 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate a dentry's MDS attributes.  With OBD_CONNECT_ATTRFID the
 * getattr goes through an intent lock (IT_GETATTR/IT_LOOKUP); otherwise,
 * if no matching MD lock is cached, a plain md_getattr RPC refreshes the
 * inode via ll_prep_inode().
 * NOTE(review): fragmentary excerpt — several error checks, braces and
 * the final RETURN are on lines missing from this view.
 */
3175 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3177 struct inode *inode = dentry->d_inode;
3178 struct ptlrpc_request *req = NULL;
3179 struct obd_export *exp;
3183 LASSERT(inode != NULL);
3185 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3186 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3188 exp = ll_i2mdexp(inode);
3190 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3191 * But under CMD case, it caused some lock issues, should be fixed
3192 * with new CMD ibits lock. See bug 12718 */
3193 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3194 struct lookup_intent oit = { .it_op = IT_GETATTR };
3195 struct md_op_data *op_data;
3197 if (ibits == MDS_INODELOCK_LOOKUP)
3198 oit.it_op = IT_LOOKUP;
3200 /* Call getattr by fid, so do not provide name at all. */
3201 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3202 dentry->d_inode, NULL, 0, 0,
3203 LUSTRE_OPC_ANY, NULL);
3204 if (IS_ERR(op_data))
3205 RETURN(PTR_ERR(op_data));
3207 rc = md_intent_lock(exp, op_data, &oit, &req,
3208 &ll_md_blocking_ast, 0);
3209 ll_finish_md_op_data(op_data);
3211 rc = ll_inode_revalidate_fini(inode, rc);
3215 rc = ll_revalidate_it_finish(req, &oit, dentry);
3217 ll_intent_release(&oit);
3221 /* Unlinked? Unhash dentry, so it is not picked up later by
3222 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3223 here to preserve get_cwd functionality on 2.6.
3225 if (!dentry->d_inode->i_nlink)
3226 d_lustre_invalidate(dentry, 0);
3228 ll_lookup_finish_locks(&oit, dentry);
3229 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3230 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3231 obd_valid valid = OBD_MD_FLGETATTR;
3232 struct md_op_data *op_data;
/* Regular files also need striping EA data sized from the default mdsize. */
3235 if (S_ISREG(inode->i_mode)) {
3236 rc = ll_get_default_mdsize(sbi, &ealen);
3239 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3242 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3243 0, ealen, LUSTRE_OPC_ANY,
3245 if (IS_ERR(op_data))
3246 RETURN(PTR_ERR(op_data));
3248 op_data->op_valid = valid;
3249 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3250 * capa for this inode. Because we only keep capas of dirs
3252 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3253 ll_finish_md_op_data(op_data);
3255 rc = ll_inode_revalidate_fini(inode, rc);
3259 rc = ll_prep_inode(&inode, req, NULL, NULL);
3262 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge per-stripe attributes from all MDTs via
 * md_merge_attr() and cache size/nlink/times in the ll_inode_info.
 * NOTE(review): fragmentary excerpt — rc check and RETURN are on lines
 * missing from this view.
 */
3266 static int ll_merge_md_attr(struct inode *inode)
3268 struct cl_attr attr = { 0 };
3271 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3272 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3277 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3278 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3280 ll_i2info(inode)->lli_atime = attr.cat_atime;
3281 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3282 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Revalidate MDS attributes, then bring size up to date: striped
 * directories get merged MD attributes, other non-regular inodes get
 * cached times copied in, and regular files get a glimpse unless a
 * restore is in progress (the MDT already supplied the right size).
 * NOTE(review): fragmentary excerpt — error checks and RETURN are on
 * lines missing from this view.
 */
3288 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3290 struct inode *inode = dentry->d_inode;
3294 rc = __ll_inode_revalidate(dentry, ibits);
3298 /* if object isn't regular file, don't validate size */
3299 if (!S_ISREG(inode->i_mode)) {
3300 if (S_ISDIR(inode->i_mode) &&
3301 ll_i2info(inode)->lli_lsm_md != NULL) {
3302 rc = ll_merge_md_attr(inode);
3307 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3308 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3309 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3311 /* In case of restore, the MDT has the right size and has
3312 * already send it back without granting the layout lock,
3313 * inode is up-to-date so glimpse is useless.
3314 * Also to glimpse we need the layout, in case of a running
3315 * restore the MDT holds the layout lock so the glimpse will
3316 * block up to the end of restore (getattr will block)
3318 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3319 rc = ll_glimpse_size(inode);
/*
 * getattr() handler: revalidate UPDATE|LOOKUP ibits then fill *stat from
 * the inode.  Striped directories report the merged size/nlink cached by
 * ll_merge_md_attr(); 32-bit API clients get a squashed inode number.
 * NOTE(review): fragmentary excerpt — the res error check and return are
 * on lines missing from this view.
 */
3324 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3326 struct inode *inode = de->d_inode;
3327 struct ll_sb_info *sbi = ll_i2sbi(inode);
3328 struct ll_inode_info *lli = ll_i2info(inode);
3331 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3332 MDS_INODELOCK_LOOKUP);
3333 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3338 stat->dev = inode->i_sb->s_dev;
3339 if (ll_need_32bit_api(sbi))
3340 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3342 stat->ino = inode->i_ino;
3343 stat->mode = inode->i_mode;
3344 stat->uid = inode->i_uid;
3345 stat->gid = inode->i_gid;
3346 stat->rdev = inode->i_rdev;
3347 stat->atime = inode->i_atime;
3348 stat->mtime = inode->i_mtime;
3349 stat->ctime = inode->i_ctime;
3350 stat->blksize = 1 << inode->i_blkbits;
3351 stat->blocks = inode->i_blocks;
3353 if (S_ISDIR(inode->i_mode) &&
3354 ll_i2info(inode)->lli_lsm_md != NULL) {
3355 stat->nlink = lli->lli_stripe_dir_nlink;
3356 stat->size = lli->lli_stripe_dir_size;
3358 stat->nlink = inode->i_nlink;
3359 stat->size = i_size_read(inode);
/*
 * fiemap() handler: marshal the kernel's fiemap_extent_info into a
 * ll_user_fiemap buffer, run ll_do_fiemap(), and copy the mapped extents
 * back to the caller's buffer.  The buffer is sized for fi_extents_max
 * extents plus the header.
 * NOTE(review): fragmentary excerpt — allocation-failure and rc checks
 * are on lines missing from this view.
 */
3365 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3366 __u64 start, __u64 len)
3370 struct ll_user_fiemap *fiemap;
3371 unsigned int extent_count = fieinfo->fi_extents_max;
3373 num_bytes = sizeof(*fiemap) + (extent_count *
3374 sizeof(struct ll_fiemap_extent));
3375 OBD_ALLOC_LARGE(fiemap, num_bytes);
3380 fiemap->fm_flags = fieinfo->fi_flags;
3381 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3382 fiemap->fm_start = start;
3383 fiemap->fm_length = len;
/* Seed only the first extent from userspace (used to resume a mapping). */
3384 if (extent_count > 0)
3385 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3386 sizeof(struct ll_fiemap_extent));
3388 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3390 fieinfo->fi_flags = fiemap->fm_flags;
3391 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3392 if (extent_count > 0)
3393 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3394 fiemap->fm_mapped_extents *
3395 sizeof(struct ll_fiemap_extent));
3397 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode; the VFS
 * releases the reference after its permission check.
 * NOTE(review): fragmentary excerpt — the return statement is on a line
 * missing from this view.
 */
3401 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3403 struct ll_inode_info *lli = ll_i2info(inode);
3404 struct posix_acl *acl = NULL;
3407 spin_lock(&lli->lli_lock);
3408 /* VFS' acl_permission_check->check_acl will release the refcount */
3409 acl = posix_acl_dup(lli->lli_posix_acl);
3410 spin_unlock(&lli->lli_lock);
/*
 * ACL check callback for older kernels lacking generic_permission with
 * a check_acl argument (HAVE_GENERIC_PERMISSION_2ARGS).  RCU walks bail
 * out; otherwise the cached ACL is consulted via posix_acl_permission().
 * NOTE(review): fragmentary excerpt — several return paths are on lines
 * missing from this view.
 */
3415 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3417 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3418 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3420 ll_check_acl(struct inode *inode, int mask)
3423 # ifdef CONFIG_FS_POSIX_ACL
3424 struct posix_acl *acl;
3428 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3429 if (flags & IPERM_FLAG_RCU)
3432 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3437 rc = posix_acl_permission(inode, acl, mask);
3438 posix_acl_release(acl);
3441 # else /* !CONFIG_FS_POSIX_ACL */
3443 # endif /* CONFIG_FS_POSIX_ACL */
3445 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission() handler.  Revalidates the root inode on first access,
 * optionally squashes root credentials (root_squash) by overriding
 * fsuid/fsgid and dropping FS capabilities for the duration of the check,
 * then delegates to remote-perm or generic permission checking.
 * NOTE(review): fragmentary excerpt — RCU early-returns, error checks and
 * the final RETURN are on lines missing from this view.
 */
3447 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3448 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3450 # ifdef HAVE_INODE_PERMISION_2ARGS
3451 int ll_inode_permission(struct inode *inode, int mask)
3453 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3458 struct ll_sb_info *sbi;
3459 struct root_squash_info *squash;
3460 struct cred *cred = NULL;
3461 const struct cred *old_cred = NULL;
3463 bool squash_id = false;
3466 #ifdef MAY_NOT_BLOCK
3467 if (mask & MAY_NOT_BLOCK)
3469 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3470 if (flags & IPERM_FLAG_RCU)
3474 /* as root inode are NOT getting validated in lookup operation,
3475 * need to do it before permission check. */
3477 if (inode == inode->i_sb->s_root->d_inode) {
3478 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3479 MDS_INODELOCK_LOOKUP);
3484 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3485 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3487 /* squash fsuid/fsgid if needed */
3488 sbi = ll_i2sbi(inode);
3489 squash = &sbi->ll_squash;
3490 if (unlikely(squash->rsi_uid != 0 &&
3491 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3492 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3496 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3497 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3498 squash->rsi_uid, squash->rsi_gid);
3500 /* update current process's credentials
3501 * and FS capability */
3502 cred = prepare_creds();
3506 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3507 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop all filesystem-related capabilities while squashed. */
3508 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3509 if ((1 << cap) & CFS_CAP_FS_MASK)
3510 cap_lower(cred->cap_effective, cap);
3512 old_cred = override_creds(cred);
3515 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3517 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3518 rc = lustre_check_remote_perm(inode, mask);
3520 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3522 /* restore current process's credentials and FS capability */
3524 revert_creds(old_cred);
/*
 * Default file operations table (-o localflock mount): no .flock/.lock
 * entries, so flock/POSIX locks are handled locally by the VFS only.
 */
3531 /* -o localflock - only provides locally consistent flock locks */
3532 struct file_operations ll_file_operations = {
3533 .read = ll_file_read,
3534 .aio_read = ll_file_aio_read,
3535 .write = ll_file_write,
3536 .aio_write = ll_file_aio_write,
3537 .unlocked_ioctl = ll_file_ioctl,
3538 .open = ll_file_open,
3539 .release = ll_file_release,
3540 .mmap = ll_file_mmap,
3541 .llseek = ll_file_seek,
3542 .splice_read = ll_file_splice_read,
/*
 * File operations table with cluster-coherent locking: .flock and .lock
 * both route through ll_file_flock() (LDLM_FLOCK enqueue on the MDT).
 */
3547 struct file_operations ll_file_operations_flock = {
3548 .read = ll_file_read,
3549 .aio_read = ll_file_aio_read,
3550 .write = ll_file_write,
3551 .aio_write = ll_file_aio_write,
3552 .unlocked_ioctl = ll_file_ioctl,
3553 .open = ll_file_open,
3554 .release = ll_file_release,
3555 .mmap = ll_file_mmap,
3556 .llseek = ll_file_seek,
3557 .splice_read = ll_file_splice_read,
3560 .flock = ll_file_flock,
3561 .lock = ll_file_flock
/*
 * File operations table for -o noflock mounts: .flock/.lock are wired to
 * ll_file_noflock so lock requests are rejected rather than granted
 * locally.
 */
3564 /* These are for -o noflock - to return ENOSYS on flock calls */
3565 struct file_operations ll_file_operations_noflock = {
3566 .read = ll_file_read,
3567 .aio_read = ll_file_aio_read,
3568 .write = ll_file_write,
3569 .aio_write = ll_file_aio_write,
3570 .unlocked_ioctl = ll_file_ioctl,
3571 .open = ll_file_open,
3572 .release = ll_file_release,
3573 .mmap = ll_file_mmap,
3574 .llseek = ll_file_seek,
3575 .splice_read = ll_file_splice_read,
3578 .flock = ll_file_noflock,
3579 .lock = ll_file_noflock
/*
 * Inode operations for regular files: attribute get/set, xattrs, fiemap
 * and (when the kernel supports .get_acl) ACL retrieval.
 */
3582 struct inode_operations ll_file_inode_operations = {
3583 .setattr = ll_setattr,
3584 .getattr = ll_getattr,
3585 .permission = ll_inode_permission,
3586 .setxattr = ll_setxattr,
3587 .getxattr = ll_getxattr,
3588 .listxattr = ll_listxattr,
3589 .removexattr = ll_removexattr,
3590 .fiemap = ll_fiemap,
3591 #ifdef HAVE_IOP_GET_ACL
3592 .get_acl = ll_get_acl,
/*
 * Registry for dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by a read/write semaphore.  Each entry
 * carries a callback and a flexible array of ioctl command numbers.
 */
3596 /* dynamic ioctl number support routins */
3597 static struct llioc_ctl_data {
3598 struct rw_semaphore ioc_sem;
3599 struct list_head ioc_head;
3601 __RWSEM_INITIALIZER(llioc.ioc_sem),
3602 LIST_HEAD_INIT(llioc.ioc_head)
3607 struct list_head iocd_list;
/* iocd_size: total allocation size, needed to free the entry later. */
3608 unsigned int iocd_size;
3609 llioc_callback_t iocd_cb;
3610 unsigned int iocd_count;
/* Flexible array of registered ioctl command numbers (iocd_count long). */
3611 unsigned int iocd_cmd[0];
/*
 * Register @count ioctl commands with callback @cb.  Returns an opaque
 * handle (the allocated llioc_data) used later by ll_iocontrol_unregister.
 * NOTE(review): fragmentary excerpt — the NULL-return paths and final
 * return are on lines missing from this view.
 */
3614 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3617 struct llioc_data *in_data = NULL;
3620 if (cb == NULL || cmd == NULL ||
3621 count > LLIOC_MAX_CMD || count < 0)
3624 size = sizeof(*in_data) + count * sizeof(unsigned int);
3625 OBD_ALLOC(in_data, size);
3626 if (in_data == NULL)
3629 memset(in_data, 0, sizeof(*in_data));
3630 in_data->iocd_size = size;
3631 in_data->iocd_cb = cb;
3632 in_data->iocd_count = count;
3633 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3635 down_write(&llioc.ioc_sem);
3636 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3637 up_write(&llioc.ioc_sem);
/*
 * Unregister a handler previously returned by ll_iocontrol_register().
 * Finds the entry matching @magic under the write lock, unlinks and frees
 * it; warns if no matching registration exists.
 * NOTE(review): fragmentary excerpt — the magic comparison and early
 * return are on lines missing from this view.
 */
3642 void ll_iocontrol_unregister(void *magic)
3644 struct llioc_data *tmp;
3649 down_write(&llioc.ioc_sem);
3650 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Size must be read before the entry is freed. */
3652 unsigned int size = tmp->iocd_size;
3654 list_del(&tmp->iocd_list);
3655 up_write(&llioc.ioc_sem);
3657 OBD_FREE(tmp, size);
3661 up_write(&llioc.ioc_sem);
3663 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3666 EXPORT_SYMBOL(ll_iocontrol_register);
3667 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl to the dynamically registered handlers.
 * Iterates all registrations under the read lock; a handler returning
 * LLIOC_STOP terminates the scan and its *rcp result is used.
 * NOTE(review): fragmentary excerpt — the *rcp store and return are on
 * lines missing from this view.
 */
3669 static enum llioc_iter
3670 ll_iocontrol_call(struct inode *inode, struct file *file,
3671 unsigned int cmd, unsigned long arg, int *rcp)
3673 enum llioc_iter ret = LLIOC_CONT;
3674 struct llioc_data *data;
3675 int rc = -EINVAL, i;
3677 down_read(&llioc.ioc_sem);
3678 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3679 for (i = 0; i < data->iocd_count; i++) {
3680 if (cmd != data->iocd_cmd[i])
3683 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3687 if (ret == LLIOC_STOP)
3690 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack via cl_conf_set().
 * On OBJECT_CONF_SET the layout lock is allowed to match only after the
 * layout is applied, and the cached layout generation is updated from the
 * new LSM (LL_LAYOUT_GEN_EMPTY if there is none).
 * NOTE(review): fragmentary excerpt — result checks and RETURN are on
 * lines missing from this view.
 */
3697 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3699 struct ll_inode_info *lli = ll_i2info(inode);
3700 struct cl_env_nest nest;
3705 if (lli->lli_clob == NULL)
3708 env = cl_env_nested_get(&nest);
3710 RETURN(PTR_ERR(env));
3712 result = cl_conf_set(env, lli->lli_clob, conf);
3713 cl_env_nested_put(&nest, env);
3715 if (conf->coc_opc == OBJECT_CONF_SET) {
3716 struct ldlm_lock *lock = conf->coc_lock;
3718 LASSERT(lock != NULL);
3719 LASSERT(ldlm_has_layout(lock));
3721 struct lustre_md *md = conf->u.coc_md;
3722 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3724 /* it can only be allowed to match after layout is
3725 * applied to inode otherwise false layout would be
3726 * seen. Applying layout shoud happen before dropping
3727 * the intent lock. */
3728 ldlm_lock_allow_match(lock);
3730 lli->lli_has_smd = lsm_has_objects(md->lsm);
3731 if (md->lsm != NULL)
3732 gen = md->lsm->lsm_layout_gen;
3735 DFID ": layout version change: %u -> %u\n",
3736 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3738 ll_layout_version_set(lli, gen);
/*
 * NOTE(review): fragmentary excerpt — some error checks (req validity,
 * rc handling) are on lines missing from this view.
 */
3744 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3745 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3748 struct ll_sb_info *sbi = ll_i2sbi(inode);
3749 struct obd_capa *oc;
3750 struct ptlrpc_request *req;
3751 struct mdt_body *body;
3758 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3759 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3760 lock->l_lvb_data, lock->l_lvb_len);
/* Nothing to do if the lock already carries a ready LVB. */
3762 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3765 /* if layout lock was granted right away, the layout is returned
3766 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3767 * blocked and then granted via completion ast, we have to fetch
3768 * layout here. Please note that we can't use the LVB buffer in
3769 * completion AST because it doesn't have a large enough buffer */
3770 oc = ll_mdscapa_get(inode);
3771 rc = ll_get_default_mdsize(sbi, &lmmsize);
3773 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3774 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3780 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3782 GOTO(out, rc = -EPROTO);
3784 lmmsize = body->mbo_eadatasize;
3785 if (lmmsize == 0) /* empty layout */
3788 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3790 GOTO(out, rc = -EFAULT);
/* Copy the fetched LOV EA into a fresh buffer and install it as the LVB. */
3792 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3793 if (lvbdata == NULL)
3794 GOTO(out, rc = -ENOMEM);
3796 memcpy(lvbdata, lmm, lmmsize);
3797 lock_res_and_lock(lock);
3798 if (lock->l_lvb_data != NULL)
3799 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3801 lock->l_lvb_data = lvbdata;
3802 lock->l_lvb_len = lmmsize;
3803 unlock_res_and_lock(lock);
3808 ptlrpc_req_finished(req);
/*
 * NOTE(review): fragmentary excerpt — GOTO targets, some rc checks and
 * the final RETURN are on lines missing from this view.
 */
3813 * Apply the layout to the inode. Layout lock is held and will be released
3816 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3817 struct inode *inode, __u32 *gen, bool reconf)
3819 struct ll_inode_info *lli = ll_i2info(inode);
3820 struct ll_sb_info *sbi = ll_i2sbi(inode);
3821 struct ldlm_lock *lock;
3822 struct lustre_md md = { NULL };
3823 struct cl_object_conf conf;
3826 bool wait_layout = false;
3829 LASSERT(lustre_handle_is_used(lockh));
3831 lock = ldlm_handle2lock(lockh);
3832 LASSERT(lock != NULL);
3833 LASSERT(ldlm_has_layout(lock));
3835 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3836 PFID(&lli->lli_fid), inode, reconf);
3838 /* in case this is a caching lock and reinstate with new inode */
3839 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3841 lock_res_and_lock(lock);
3842 lvb_ready = ldlm_is_lvb_ready(lock);
3843 unlock_res_and_lock(lock);
3844 /* checking lvb_ready is racy but this is okay. The worst case is
3845 * that multi processes may configure the file on the same time. */
3847 if (lvb_ready || !reconf) {
3850 /* layout_gen must be valid if layout lock is not
3851 * cancelled and stripe has already set */
3852 *gen = ll_layout_version_get(lli);
/* Layout not ready in the LVB: fetch it from the MDT first. */
3858 rc = ll_layout_fetch(inode, lock);
3862 /* for layout lock, lmm is returned in lock's lvb.
3863 * lvb_data is immutable if the lock is held so it's safe to access it
3864 * without res lock. See the description in ldlm_lock_decref_internal()
3865 * for the condition to free lvb_data of layout lock */
3866 if (lock->l_lvb_data != NULL) {
3867 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3868 lock->l_lvb_data, lock->l_lvb_len);
3870 *gen = LL_LAYOUT_GEN_EMPTY;
3872 *gen = md.lsm->lsm_layout_gen;
3875 CERROR("%s: file "DFID" unpackmd error: %d\n",
3876 ll_get_fsname(inode->i_sb, NULL, 0),
3877 PFID(&lli->lli_fid), rc);
3883 /* set layout to file. Unlikely this will fail as old layout was
3884 * surely eliminated */
3885 memset(&conf, 0, sizeof conf);
3886 conf.coc_opc = OBJECT_CONF_SET;
3887 conf.coc_inode = inode;
3888 conf.coc_lock = lock;
3889 conf.u.coc_md = &md;
3890 rc = ll_layout_conf(inode, &conf);
3893 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3895 /* refresh layout failed, need to wait */
3896 wait_layout = rc == -EBUSY;
3900 LDLM_LOCK_PUT(lock);
3901 ldlm_lock_decref(lockh, mode);
3903 /* wait for IO to complete if it's still being used. */
3905 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3906 ll_get_fsname(inode->i_sb, NULL, 0),
3907 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until outstanding IO under the old layout ends. */
3909 memset(&conf, 0, sizeof conf);
3910 conf.coc_opc = OBJECT_CONF_WAIT;
3911 conf.coc_inode = inode;
3912 rc = ll_layout_conf(inode, &conf);
3916 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3917 ll_get_fsname(inode->i_sb, NULL, 0),
3918 PFID(&lli->lli_fid), rc);
/*
 * NOTE(review): fragmentary excerpt — einfo mode field, some rc checks
 * and the retry path are on lines missing from this view.
 */
3924 * This function checks if there exists a LAYOUT lock on the client side,
3925 * or enqueues it if it doesn't have one in cache.
3927 * This function will not hold layout lock so it may be revoked any time after
3928 * this function returns. Any operations depend on layout should be redone
3931 * This function should be called before lov_io_init() to get an uptodate
3932 * layout version, the caller should save the version number and after IO
3933 * is finished, this function should be called again to verify that layout
3934 * is not changed during IO time.
3936 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3938 struct ll_inode_info *lli = ll_i2info(inode);
3939 struct ll_sb_info *sbi = ll_i2sbi(inode);
3940 struct md_op_data *op_data;
3941 struct lookup_intent it;
3942 struct lustre_handle lockh;
3944 struct ldlm_enqueue_info einfo = {
3945 .ei_type = LDLM_IBITS,
3947 .ei_cb_bl = &ll_md_blocking_ast,
3948 .ei_cb_cp = &ldlm_completion_ast,
/* Fast path: layout locks disabled, or a valid generation is cached. */
3953 *gen = ll_layout_version_get(lli);
3954 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3958 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3959 LASSERT(S_ISREG(inode->i_mode));
3961 /* take layout lock mutex to enqueue layout lock exclusively. */
3962 mutex_lock(&lli->lli_layout_mutex);
3965 /* mostly layout lock is caching on the local side, so try to match
3966 * it before grabbing layout lock mutex. */
3967 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3968 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3969 if (mode != 0) { /* hit cached lock */
3970 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3974 mutex_unlock(&lli->lli_layout_mutex);
3978 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3979 0, 0, LUSTRE_OPC_ANY, NULL);
3980 if (IS_ERR(op_data)) {
3981 mutex_unlock(&lli->lli_layout_mutex);
3982 RETURN(PTR_ERR(op_data));
3985 /* have to enqueue one */
3986 memset(&it, 0, sizeof(it));
3987 it.it_op = IT_LAYOUT;
3988 lockh.cookie = 0ULL;
3990 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3991 ll_get_fsname(inode->i_sb, NULL, 0),
3992 PFID(&lli->lli_fid), inode);
3994 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
3995 if (it.d.lustre.it_data != NULL)
3996 ptlrpc_req_finished(it.d.lustre.it_data);
3997 it.d.lustre.it_data = NULL;
3999 ll_finish_md_op_data(op_data);
4001 mode = it.d.lustre.it_lock_mode;
4002 it.d.lustre.it_lock_mode = 0;
4003 ll_intent_drop_lock(&it);
4006 /* set lock data in case this is a new lock */
4007 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4008 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4012 mutex_unlock(&lli->lli_layout_mutex);
4018 * This function sends a restore request to the MDT
4020 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4022 struct hsm_user_request *hur;
4026 len = sizeof(struct hsm_user_request) +
4027 sizeof(struct hsm_user_item);
4028 OBD_ALLOC(hur, len);
4032 hur->hur_request.hr_action = HUA_RESTORE;
4033 hur->hur_request.hr_archive_id = 0;
4034 hur->hur_request.hr_flags = 0;
4035 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4036 sizeof(hur->hur_user_item[0].hui_fid));
4037 hur->hur_user_item[0].hui_extent.offset = offset;
4038 hur->hur_user_item[0].hui_extent.length = length;
4039 hur->hur_request.hr_itemcount = 1;
4040 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,