4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2014, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
52 #include "cl_object.h"
/*
 * Forward declarations for helpers defined later in this file.
 * NOTE(review): this extraction elides lines, so some leading
 * storage-class/return-type tokens are not visible here.
 */
55 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
57 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
60 static enum llioc_iter
61 ll_iocontrol_call(struct inode *inode, struct file *file,
62 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate and initialize a per-open ll_file_data from its slab.
 * GFP_NOFS: avoid re-entering the filesystem during reclaim.
 * (Error-check and return elided in this extraction.)
 */
64 static struct ll_file_data *ll_file_data_get(void)
66 struct ll_file_data *fd;
68 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
72 fd->fd_write_failed = false;
/* Release a ll_file_data back to its slab cache. */
77 static void ll_file_data_put(struct ll_file_data *fd)
80 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes (mode/times/size/blocks/flags),
 * the IO epoch, the open handle @fh and the MDS capability into
 * @op_data for an MDS request.
 */
83 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
84 struct lustre_handle *fh)
86 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
87 op_data->op_attr.ia_mode = inode->i_mode;
88 op_data->op_attr.ia_atime = inode->i_atime;
89 op_data->op_attr.ia_mtime = inode->i_mtime;
90 op_data->op_attr.ia_ctime = inode->i_ctime;
91 op_data->op_attr.ia_size = i_size_read(inode);
92 op_data->op_attr_blocks = inode->i_blocks;
93 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
94 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
96 op_data->op_handle = *fh;
97 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Tell the MDS that data was modified so it can clear the flag on close. */
99 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
100 op_data->op_bias |= MDS_DATA_MODIFIED;
104 * Closes the IO epoch and packs all the attributes into @op_data for
/* the close RPC; size/blocks are only sent when SOM is disabled or the
 * file is not a regular file (otherwise the MDS fetches them itself). */
107 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
108 struct obd_client_handle *och)
112 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
113 ATTR_MTIME | ATTR_MTIME_SET |
114 ATTR_CTIME | ATTR_CTIME_SET;
/* (Body between here and the SOM check elided in this extraction.) */
116 if (!(och->och_flags & FMODE_WRITE))
119 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
120 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
122 ll_ioepoch_close(inode, op_data, &och, 0);
125 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
126 ll_prep_md_op_data(op_data, inode, NULL, NULL,
127 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS_CLOSE for @och.  If @data_version is non-NULL this is an
 * HSM release close (MDS_HSM_RELEASE bias).  On success with
 * MF_EPOCH_CLOSE, a Size-on-MDS update is sent back to the MDS.
 * Frees @och (cookie poisoned with DEAD_HANDLE_MAGIC) unless it must
 * wait for DONE_WRITING.  Several lines are elided in this extraction.
 */
131 static int ll_close_inode_openhandle(struct obd_export *md_exp,
133 struct obd_client_handle *och,
134 const __u64 *data_version)
136 struct obd_export *exp = ll_i2mdexp(inode);
137 struct md_op_data *op_data;
138 struct ptlrpc_request *req = NULL;
139 struct obd_device *obd = class_exp2obd(exp);
146 * XXX: in case of LMV, is this correct to access
149 CERROR("Invalid MDC connection handle "LPX64"\n",
150 ll_i2mdexp(inode)->exp_handle.h_cookie);
154 OBD_ALLOC_PTR(op_data);
156 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
158 ll_prepare_close(inode, op_data, och);
159 if (data_version != NULL) {
160 /* Pass in data_version implies release. */
161 op_data->op_bias |= MDS_HSM_RELEASE;
162 op_data->op_data_version = *data_version;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
166 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
167 rc = md_close(md_exp, op_data, och->och_mod, &req);
169 /* This close must have the epoch closed. */
170 LASSERT(epoch_close);
171 /* MDS has instructed us to obtain Size-on-MDS attribute from
172 * OSTs and send setattr to back to MDS. */
173 rc = ll_som_update(inode, op_data);
175 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
176 " failed: rc = %d\n",
177 ll_i2mdexp(inode)->exp_obd->obd_name,
178 PFID(ll_inode2fid(inode)), rc);
182 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
183 ll_i2mdexp(inode)->exp_obd->obd_name,
184 PFID(ll_inode2fid(inode)), rc);
187 /* DATA_MODIFIED flag was successfully sent on close, cancel data
188 * modification flag. */
189 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
190 struct ll_inode_info *lli = ll_i2info(inode);
192 spin_lock(&lli->lli_lock);
193 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
194 spin_unlock(&lli->lli_lock);
/* HSM release: verify the server actually released the file. */
197 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
198 struct mdt_body *body;
199 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
200 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
204 ll_finish_md_op_data(op_data);
/* SOM open epoch still pending: defer via DONE_WRITING queue. */
208 if (exp_connect_som(exp) && !epoch_close &&
209 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
210 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
212 md_clear_open_replay_data(md_exp, och);
213 /* Free @och if it is not waiting for DONE_WRITING. */
214 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
217 if (req) /* This is close request */
218 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle of the given mode (write/exec/read) on
 * @inode, but only if no other users of that handle remain
 * (och_usecount).  Under lli_och_mutex; the actual close RPC is sent
 * via ll_close_inode_openhandle().  Lines elided in this extraction.
 */
222 int ll_md_real_close(struct inode *inode, fmode_t fmode)
224 struct ll_inode_info *lli = ll_i2info(inode);
225 struct obd_client_handle **och_p;
226 struct obd_client_handle *och;
/* Select the per-mode handle slot and its use counter. */
231 if (fmode & FMODE_WRITE) {
232 och_p = &lli->lli_mds_write_och;
233 och_usecount = &lli->lli_open_fd_write_count;
234 } else if (fmode & FMODE_EXEC) {
235 och_p = &lli->lli_mds_exec_och;
236 och_usecount = &lli->lli_open_fd_exec_count;
238 LASSERT(fmode & FMODE_READ);
239 och_p = &lli->lli_mds_read_och;
240 och_usecount = &lli->lli_open_fd_read_count;
243 mutex_lock(&lli->lli_och_mutex);
244 if (*och_usecount > 0) {
245 /* There are still users of this handle, so skip
247 mutex_unlock(&lli->lli_och_mutex);
253 mutex_unlock(&lli->lli_och_mutex);
256 /* There might be a race and this handle may already
258 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-struct-file close: drop group lock and lease if held, decrement
 * the per-mode open count, and call ll_md_real_close() unless a
 * cached OPEN DLM lock lets us skip talking to the MDS.  Frees the
 * ll_file_data.  Lines elided in this extraction.
 */
265 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
268 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
269 struct ll_inode_info *lli = ll_i2info(inode);
273 /* clear group lock, if present */
274 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
275 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
277 if (fd->fd_lease_och != NULL) {
280 /* Usually the lease is not released when the
281 * application crashed, we need to release here. */
282 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
283 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
284 PFID(&lli->lli_fid), rc, lease_broken);
286 fd->fd_lease_och = NULL;
289 if (fd->fd_och != NULL) {
290 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
295 /* Let's see if we have good enough OPEN lock on the file and if
296 we can skip talking to MDS */
297 if (file->f_dentry->d_inode) { /* Can this ever be false? */
299 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
300 struct lustre_handle lockh;
301 struct inode *inode = file->f_dentry->d_inode;
302 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
304 mutex_lock(&lli->lli_och_mutex);
305 if (fd->fd_omode & FMODE_WRITE) {
307 LASSERT(lli->lli_open_fd_write_count);
308 lli->lli_open_fd_write_count--;
309 } else if (fd->fd_omode & FMODE_EXEC) {
311 LASSERT(lli->lli_open_fd_exec_count);
312 lli->lli_open_fd_exec_count--;
315 LASSERT(lli->lli_open_fd_read_count);
316 lli->lli_open_fd_read_count--;
318 mutex_unlock(&lli->lli_och_mutex);
/* No matching OPEN lock cached: must do the real MDS close. */
320 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
321 LDLM_IBITS, &policy, lockmode,
323 rc = ll_md_real_close(file->f_dentry->d_inode,
327 CERROR("released file has negative dentry: file = %p, "
328 "dentry = %p, name = %s\n",
329 file, file->f_dentry, file->f_dentry->d_name.name);
333 LUSTRE_FPRIVATE(file) = NULL;
334 ll_file_data_put(fd);
335 ll_capa_close(inode);
340 /* While this returns an error code, fput() the caller does not, so we need
341 * to make every effort to clean up all of our state here. Also, applications
342 * rarely check close errors and even if an error is returned they will not
343 * re-try the close call.
 *
 * VFS ->release() handler: remote-ACL teardown on the root inode,
 * statahead deauthorization, async-rc harvesting from the clio object,
 * then ll_md_close().  Lines elided in this extraction.
345 int ll_file_release(struct inode *inode, struct file *file)
347 struct ll_file_data *fd;
348 struct ll_sb_info *sbi = ll_i2sbi(inode);
349 struct ll_inode_info *lli = ll_i2info(inode);
353 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
354 PFID(ll_inode2fid(inode)), inode);
356 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL state is keyed off the root inode; tear it down. */
357 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
358 inode == inode->i_sb->s_root->d_inode) {
359 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
362 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
363 fd->fd_flags &= ~LL_FILE_RMTACL;
364 rct_del(&sbi->ll_rct, current_pid());
365 et_search_free(&sbi->ll_et, current_pid());
370 if (inode->i_sb->s_root != file->f_dentry)
371 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
372 fd = LUSTRE_FPRIVATE(file);
375 /* The last ref on @file, maybe not the the owner pid of statahead,
376 * because parent and child process can share the same file handle. */
377 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
378 ll_deauthorize_statahead(inode, fd);
/* Root inode has no MDS open handle to close: just drop fd. */
380 if (inode->i_sb->s_root == file->f_dentry) {
381 LUSTRE_FPRIVATE(file) = NULL;
382 ll_file_data_put(fd);
386 if (!S_ISDIR(inode->i_mode)) {
387 if (lli->lli_clob != NULL)
388 lov_read_and_clear_async_rc(lli->lli_clob);
389 lli->lli_async_rc = 0;
392 rc = ll_md_close(sbi->ll_md_exp, inode, file);
394 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
395 libcfs_debug_dumplog();
/*
 * Send an IT_OPEN intent lock request to the MDS for @file.  The name
 * is packed only when the server lacks OBD_CONNECT_OPEN_BY_FID and the
 * dentry name is valid.  On success, updates the inode from the reply
 * and sets the lock data.  Lines elided in this extraction.
 */
400 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
401 struct lookup_intent *itp)
403 struct dentry *de = file->f_dentry;
404 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
405 struct dentry *parent = de->d_parent;
406 const char *name = NULL;
408 struct md_op_data *op_data;
409 struct ptlrpc_request *req = NULL;
413 LASSERT(parent != NULL);
414 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
416 /* if server supports open-by-fid, or file name is invalid, don't pack
417 * name in open request */
418 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
419 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
420 name = de->d_name.name;
421 len = de->d_name.len;
424 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
425 name, len, 0, LUSTRE_OPC_ANY, NULL);
427 RETURN(PTR_ERR(op_data));
428 op_data->op_data = lmm;
429 op_data->op_data_size = lmmsize;
431 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
432 &ll_md_blocking_ast, 0);
433 ll_finish_md_op_data(op_data);
435 /* reason for keep own exit path - don`t flood log
436 * with messages with -ESTALE errors.
 */
438 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
439 it_open_error(DISP_OPEN_OPEN, itp))
441 ll_release_openhandle(de, itp);
445 if (it_disposition(itp, DISP_LOOKUP_NEG))
446 GOTO(out, rc = -ENOENT);
448 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
449 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
450 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
454 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
455 if (!rc && itp->d.lustre.it_lock_mode)
456 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
459 ptlrpc_req_finished(req);
460 ll_intent_drop_lock(itp);
466 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
467 * not believe attributes if a few ioepoch holders exist. Attributes for
468 * previous ioepoch if new one is opened are also skipped by MDS.
470 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a new, non-zero epoch; re-opening the same epoch is a no-op. */
472 if (ioepoch && lli->lli_ioepoch != ioepoch) {
473 lli->lli_ioepoch = ioepoch;
474 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
475 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the MDT reply body attached to the
 * intent (fh, fid, lease cookie, flags), then register it for open
 * replay.  Returns the md_set_open_replay_data() result.
 */
479 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
480 struct obd_client_handle *och)
482 struct ptlrpc_request *req = it->d.lustre.it_data;
483 struct mdt_body *body;
485 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
486 och->och_fh = body->mbo_handle;
487 och->och_fid = body->mbo_fid1;
488 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
489 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
490 och->och_flags = it->it_flags;
492 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-local part of an open: fill @och from the intent
 * (when given), open the IO epoch, attach @fd as the file's private
 * data and initialize readahead/cl-context state.
 * Lines elided in this extraction.
 */
495 static int ll_local_open(struct file *file, struct lookup_intent *it,
496 struct ll_file_data *fd, struct obd_client_handle *och)
498 struct inode *inode = file->f_dentry->d_inode;
499 struct ll_inode_info *lli = ll_i2info(inode);
502 LASSERT(!LUSTRE_FPRIVATE(file));
507 struct ptlrpc_request *req = it->d.lustre.it_data;
508 struct mdt_body *body;
511 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
515 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
516 ll_ioepoch_open(lli, body->mbo_ioepoch);
519 LUSTRE_FPRIVATE(file) = fd;
520 ll_readahead_init(inode, &fd->fd_ras);
521 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
523 /* ll_cl_context initialize */
524 rwlock_init(&fd->fd_lock);
525 INIT_LIST_HEAD(&fd->fd_lccs);
530 /* Open a file, and (for the very first open) create objects on the OSTs at
531 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
532 * creation or open until ll_lov_setstripe() ioctl is called.
534 * If we already have the stripe MD locally then we don't request it in
535 * md_open(), by passing a lmm_size = 0.
537 * It is up to the application to ensure no other processes open this file
538 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
539 * used. We might be able to avoid races of that sort by getting lli_open_sem
540 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
541 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 *
 * NOTE(review): many lines of this function are elided in this
 * extraction (error labels, closing braces, some conditions).
543 int ll_file_open(struct inode *inode, struct file *file)
545 struct ll_inode_info *lli = ll_i2info(inode);
546 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
547 .it_flags = file->f_flags };
548 struct obd_client_handle **och_p = NULL;
549 __u64 *och_usecount = NULL;
550 struct ll_file_data *fd;
554 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
555 PFID(ll_inode2fid(inode)), inode, file->f_flags);
557 it = file->private_data; /* XXX: compat macro */
558 file->private_data = NULL; /* prevent ll_local_open assertion */
560 fd = ll_file_data_get();
562 GOTO(out_openerr, rc = -ENOMEM);
565 if (S_ISDIR(inode->i_mode))
566 ll_authorize_statahead(inode, fd);
/* Root inode: no MDS open handle needed, just attach the fd. */
568 if (inode->i_sb->s_root == file->f_dentry) {
569 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN from f_flags. */
573 if (!it || !it->d.lustre.it_disposition) {
574 /* Convert f_flags into access mode. We cannot use file->f_mode,
575 * because everything but O_ACCMODE mask was stripped from
 */
577 if ((oit.it_flags + 1) & O_ACCMODE)
579 if (file->f_flags & O_TRUNC)
580 oit.it_flags |= FMODE_WRITE;
582 /* kernel only call f_op->open in dentry_open. filp_open calls
583 * dentry_open after call to open_namei that checks permissions.
584 * Only nfsd_open call dentry_open directly without checking
585 * permissions and because of that this code below is safe. */
586 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
587 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
589 /* We do not want O_EXCL here, presumably we opened the file
590 * already? XXX - NFS implications? */
591 oit.it_flags &= ~O_EXCL;
593 /* bug20584, if "it_flags" contains O_CREAT, the file will be
594 * created if necessary, then "IT_CREAT" should be set to keep
595 * consistent with it */
596 if (oit.it_flags & O_CREAT)
597 oit.it_op |= IT_CREAT;
603 /* Let's see if we have file open on MDS already. */
604 if (it->it_flags & FMODE_WRITE) {
605 och_p = &lli->lli_mds_write_och;
606 och_usecount = &lli->lli_open_fd_write_count;
607 } else if (it->it_flags & FMODE_EXEC) {
608 och_p = &lli->lli_mds_exec_och;
609 och_usecount = &lli->lli_open_fd_exec_count;
611 och_p = &lli->lli_mds_read_och;
612 och_usecount = &lli->lli_open_fd_read_count;
615 mutex_lock(&lli->lli_och_mutex);
616 if (*och_p) { /* Open handle is present */
617 if (it_disposition(it, DISP_OPEN_OPEN)) {
618 /* Well, there's extra open request that we do not need,
619 let's close it somehow. This will decref request. */
620 rc = it_open_error(DISP_OPEN_OPEN, it);
622 mutex_unlock(&lli->lli_och_mutex);
623 GOTO(out_openerr, rc);
626 ll_release_openhandle(file->f_dentry, it);
630 rc = ll_local_open(file, it, fd, NULL);
633 mutex_unlock(&lli->lli_och_mutex);
634 GOTO(out_openerr, rc);
637 LASSERT(*och_usecount == 0);
638 if (!it->d.lustre.it_disposition) {
639 /* We cannot just request lock handle now, new ELC code
640 means that one of other OPEN locks for this file
641 could be cancelled, and since blocking ast handler
642 would attempt to grab och_mutex as well, that would
643 result in a deadlock */
644 mutex_unlock(&lli->lli_och_mutex);
646 * Normally called under two situations:
648 * 2. A race/condition on MDS resulting in no open
649 * handle to be returned from LOOKUP|OPEN request,
650 * for example if the target entry was a symlink.
652 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
654 * Always specify MDS_OPEN_BY_FID because we don't want
655 * to get file with different fid.
 */
657 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
658 rc = ll_intent_file_open(file, NULL, 0, it);
660 GOTO(out_openerr, rc);
664 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
666 GOTO(out_och_free, rc = -ENOMEM);
670 /* md_intent_lock() didn't get a request ref if there was an
671 * open error, so don't do cleanup on the request here
 */
673 /* XXX (green): Should not we bail out on any error here, not
674 * just open error? */
675 rc = it_open_error(DISP_OPEN_OPEN, it);
677 GOTO(out_och_free, rc);
679 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
680 "inode %p: disposition %x, status %d\n", inode,
681 it_disposition(it, ~0), it->d.lustre.it_status);
683 rc = ll_local_open(file, it, fd, *och_p);
685 GOTO(out_och_free, rc);
687 mutex_unlock(&lli->lli_och_mutex);
690 /* Must do this outside lli_och_mutex lock to prevent deadlock where
691 different kind of OPEN lock for this same inode gets cancelled
692 by ldlm_cancel_lru */
693 if (!S_ISREG(inode->i_mode))
694 GOTO(out_och_free, rc);
/* Delay OST object creation for O_LOV_DELAY_CREATE / read-only opens. */
698 if (!lli->lli_has_smd &&
699 (cl_is_lov_delay_create(file->f_flags) ||
700 (file->f_mode & FMODE_WRITE) == 0)) {
701 CDEBUG(D_INODE, "object creation was delayed\n");
702 GOTO(out_och_free, rc);
704 cl_lov_delay_create_clear(&file->f_flags);
705 GOTO(out_och_free, rc);
709 if (och_p && *och_p) {
710 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
711 *och_p = NULL; /* OBD_FREE writes some magic there */
714 mutex_unlock(&lli->lli_och_mutex);
717 if (lli->lli_opendir_key == fd)
718 ll_deauthorize_statahead(inode, fd);
720 ll_file_data_put(fd);
722 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
725 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
726 ptlrpc_req_finished(it->d.lustre.it_data);
727 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING, cancel the lock
 * asynchronously (the lease is considered broken).  The CANCELING arm
 * is elided in this extraction.
 */
733 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
734 struct ldlm_lock_desc *desc, void *data, int flag)
737 struct lustre_handle lockh;
741 case LDLM_CB_BLOCKING:
742 ldlm_lock2handle(lock, &lockh);
743 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
745 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
749 case LDLM_CB_CANCELING:
757 * Acquire a lease and open the file.
 *
 * Sends IT_OPEN with MDS_OPEN_LEASE; @fmode must be exactly FMODE_READ
 * or FMODE_WRITE.  When @file is given, reuses its existing openhandle
 * (passed in op_data->op_handle) so the MDT sees the same owner.
 * Returns the new obd_client_handle or ERR_PTR().  Lines elided in
 * this extraction.
759 static struct obd_client_handle *
760 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
763 struct lookup_intent it = { .it_op = IT_OPEN };
764 struct ll_sb_info *sbi = ll_i2sbi(inode);
765 struct md_op_data *op_data;
766 struct ptlrpc_request *req = NULL;
767 struct lustre_handle old_handle = { 0 };
768 struct obd_client_handle *och = NULL;
773 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
774 RETURN(ERR_PTR(-EINVAL));
777 struct ll_inode_info *lli = ll_i2info(inode);
778 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
779 struct obd_client_handle **och_p;
782 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
783 RETURN(ERR_PTR(-EPERM));
785 /* Get the openhandle of the file */
787 mutex_lock(&lli->lli_och_mutex);
/* A lease is already held on this fd: refuse a second one. */
788 if (fd->fd_lease_och != NULL) {
789 mutex_unlock(&lli->lli_och_mutex);
793 if (fd->fd_och == NULL) {
794 if (file->f_mode & FMODE_WRITE) {
795 LASSERT(lli->lli_mds_write_och != NULL);
796 och_p = &lli->lli_mds_write_och;
797 och_usecount = &lli->lli_open_fd_write_count;
799 LASSERT(lli->lli_mds_read_och != NULL);
800 och_p = &lli->lli_mds_read_och;
801 och_usecount = &lli->lli_open_fd_read_count;
803 if (*och_usecount == 1) {
810 mutex_unlock(&lli->lli_och_mutex);
811 if (rc < 0) /* more than 1 opener */
814 LASSERT(fd->fd_och != NULL);
815 old_handle = fd->fd_och->och_fh;
820 RETURN(ERR_PTR(-ENOMEM));
822 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
823 LUSTRE_OPC_ANY, NULL);
825 GOTO(out, rc = PTR_ERR(op_data));
827 /* To tell the MDT this openhandle is from the same owner */
828 op_data->op_handle = old_handle;
830 it.it_flags = fmode | open_flags;
831 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
832 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
833 &ll_md_blocking_lease_ast,
834 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
835 * it can be cancelled which may mislead applications that the lease is
 * still valid.
837 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
838 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
839 * doesn't deal with openhandle, so normal openhandle will be leaked. */
840 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
841 ll_finish_md_op_data(op_data);
842 ptlrpc_req_finished(req);
844 GOTO(out_release_it, rc);
846 if (it_disposition(&it, DISP_LOOKUP_NEG))
847 GOTO(out_release_it, rc = -ENOENT);
849 rc = it_open_error(DISP_OPEN_OPEN, &it);
851 GOTO(out_release_it, rc);
853 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
854 ll_och_fill(sbi->ll_md_exp, &it, och);
856 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
857 GOTO(out_close, rc = -EOPNOTSUPP);
859 /* already get lease, handle lease lock */
860 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
861 if (it.d.lustre.it_lock_mode == 0 ||
862 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
863 /* open lock must return for lease */
864 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
865 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
866 it.d.lustre.it_lock_bits);
867 GOTO(out_close, rc = -EPROTO);
870 ll_intent_release(&it);
874 /* Cancel open lock */
875 if (it.d.lustre.it_lock_mode != 0) {
876 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
877 it.d.lustre.it_lock_mode);
878 it.d.lustre.it_lock_mode = 0;
879 och->och_lease_handle.cookie = 0ULL;
881 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
883 CERROR("%s: error closing file "DFID": %d\n",
884 ll_get_fsname(inode->i_sb, NULL, 0),
885 PFID(&ll_i2info(inode)->lli_fid), rc2);
886 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
888 ll_intent_release(&it);
896 * Release lease and close the file.
897 * It will check if the lease has ever broken.
 *
 * @lease_broken (optional out): set to whether the lease lock had
 * already been cancelled.  If not broken, cancel the lease lock
 * ourselves, then close the openhandle.  Lines elided here.
899 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
902 struct ldlm_lock *lock;
903 bool cancelled = true;
907 lock = ldlm_handle2lock(&och->och_lease_handle);
909 lock_res_and_lock(lock);
910 cancelled = ldlm_is_cancel(lock);
911 unlock_res_and_lock(lock);
915 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
916 PFID(&ll_i2info(inode)->lli_fid), cancelled);
919 ldlm_cli_cancel(&och->och_lease_handle, 0);
920 if (lease_broken != NULL)
921 *lease_broken = cancelled;
923 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
928 /* Fills the obdo with the attributes for the lsm */
 /* Async getattr over all stripes of @lsm via a ptlrpc set; @dv_flags
  * (LL_DV_RD_FLUSH/LL_DV_WR_FLUSH) request server-side lock flushes for
  * data-version reads.  Lines elided in this extraction. */
929 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
930 struct obd_capa *capa, struct obdo *obdo,
931 __u64 ioepoch, int dv_flags)
933 struct ptlrpc_request_set *set;
934 struct obd_info oinfo = { { { 0 } } };
939 LASSERT(lsm != NULL);
943 oinfo.oi_oa->o_oi = lsm->lsm_oi;
944 oinfo.oi_oa->o_mode = S_IFREG;
945 oinfo.oi_oa->o_ioepoch = ioepoch;
946 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
947 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
948 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
949 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
950 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
951 OBD_MD_FLDATAVERSION;
952 oinfo.oi_capa = capa;
/* OBD_FL_SRVLOCK forces the OST to take the lock server-side. */
953 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
954 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
955 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
956 if (dv_flags & LL_DV_WR_FLUSH)
957 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
960 set = ptlrpc_prep_set();
962 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
965 rc = obd_getattr_async(exp, &oinfo, set);
967 rc = ptlrpc_set_wait(set);
968 ptlrpc_set_destroy(set);
/* Mask reply to the attributes we actually trust from the OSTs. */
971 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
972 OBD_MD_FLATIME | OBD_MD_FLMTIME |
973 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
974 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
975 if (dv_flags & LL_DV_WR_FLUSH &&
976 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
977 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
984 * Performs the getattr on the inode and updates its fields.
985 * If @sync != 0, perform the getattr under the server-side lock.
 * (Delegates to ll_lsm_getattr() and refreshes the inode from the
 * returned obdo.  Lines elided in this extraction.)
987 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
988 __u64 ioepoch, int sync)
990 struct obd_capa *capa = ll_mdscapa_get(inode);
991 struct lov_stripe_md *lsm;
995 lsm = ccc_inode_lsm_get(inode);
996 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
997 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1000 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
1002 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1003 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1004 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1005 (unsigned long long)inode->i_blocks,
1006 1UL << inode->i_blkbits);
1008 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-cached timestamps (lli_{a,m,c}time) with the attributes
 * from the cl_object (OST side), keeping the newest of each, and
 * update i_size/i_blocks under the inode size lock.
 * Lines elided in this extraction.
 */
1012 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1014 struct ll_inode_info *lli = ll_i2info(inode);
1015 struct cl_object *obj = lli->lli_clob;
1016 struct cl_attr *attr = ccc_env_thread_attr(env);
1024 ll_inode_size_lock(inode);
1026 /* merge timestamps the most recently obtained from mds with
1027 timestamps obtained from osts */
1028 LTIME_S(inode->i_atime) = lli->lli_atime;
1029 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1030 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1032 atime = LTIME_S(inode->i_atime);
1033 mtime = LTIME_S(inode->i_mtime);
1034 ctime = LTIME_S(inode->i_ctime);
1036 cl_object_attr_lock(obj);
1037 rc = cl_object_attr_get(env, obj, attr);
1038 cl_object_attr_unlock(obj);
1041 GOTO(out_size_unlock, rc);
/* Keep the most recent of the MDS and OST timestamps. */
1043 if (atime < attr->cat_atime)
1044 atime = attr->cat_atime;
1046 if (ctime < attr->cat_ctime)
1047 ctime = attr->cat_ctime;
1049 if (mtime < attr->cat_mtime)
1050 mtime = attr->cat_mtime;
1052 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1053 PFID(&lli->lli_fid), attr->cat_size);
1055 i_size_write(inode, attr->cat_size);
1056 inode->i_blocks = attr->cat_blocks;
1058 LTIME_S(inode->i_atime) = atime;
1059 LTIME_S(inode->i_mtime) = mtime;
1060 LTIME_S(inode->i_ctime) = ctime;
1063 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctls: fetch size/blocks/times for @lsm from the
 * OSTs and copy them into the caller's stat structure.
 * Lines elided in this extraction.
 */
1068 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1071 struct obdo obdo = { 0 };
1074 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1076 st->st_size = obdo.o_size;
1077 st->st_blocks = obdo.o_blocks;
1078 st->st_mtime = obdo.o_mtime;
1079 st->st_atime = obdo.o_atime;
1080 st->st_ctime = obdo.o_ctime;
/*
 * Return true when atime updates should be suppressed for @file,
 * mirroring the kernel's file_accessed()/touch_atime() checks
 * (O_NOATIME, S_NOATIME, read-only or noatime mounts, nodiratime on
 * directories).  Return statements are elided in this extraction.
 */
1085 static bool file_is_noatime(const struct file *file)
1087 const struct vfsmount *mnt = file->f_path.mnt;
1088 const struct inode *inode = file->f_path.dentry->d_inode;
1090 /* Adapted from file_accessed() and touch_atime().*/
1091 if (file->f_flags & O_NOATIME)
1094 if (inode->i_flags & S_NOATIME)
1097 if (IS_NOATIME(inode))
1100 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1103 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1106 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io from the file's open flags: nonblock/append/sync
 * hints, the clio object, lock-request mode (never for nolock files,
 * mandatory for O_APPEND) and the noatime flag.
 */
1112 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1114 struct inode *inode = file->f_dentry->d_inode;
1116 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1118 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1119 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1120 file->f_flags & O_DIRECT ||
1123 io->ci_obj = ll_i2info(inode)->lli_clob;
1124 io->ci_lockreq = CILR_MAYBE;
1125 if (ll_file_nolock(file)) {
1126 io->ci_lockreq = CILR_NEVER;
1127 io->ci_no_srvlock = 1;
1128 } else if (file->f_flags & O_APPEND) {
1129 io->ci_lockreq = CILR_MANDATORY;
1132 io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine: set up the cl_io, take the per-file range
 * lock for normal writes (whole-file for O_APPEND), run cl_io_loop(),
 * retry on ci_need_restart, and tally read/write byte stats.
 * NOTE(review): the return type line and several statements are elided
 * in this extraction.
 */
1136 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1137 struct file *file, enum cl_io_type iot,
1138 loff_t *ppos, size_t count)
1140 struct inode *inode = file->f_dentry->d_inode;
1141 struct ll_inode_info *lli = ll_i2info(inode);
1143 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1146 struct range_lock range;
1149 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1150 file->f_dentry->d_name.name, iot, *ppos, count);
1153 io = ccc_env_thread_io(env);
1154 ll_io_init(io, file, iot == CIT_WRITE);
1156 /* The maximum Lustre file size is variable, based on the
1157 * OST maximum object size and number of stripes. This
1158 * needs another check in addition to the VFS checks earlier. */
1159 end = (io->u.ci_wr.wr_append ? i_size_read(inode) : *ppos) + count;
1160 if (end > ll_file_maxbytes(inode)) {
1162 CDEBUG(D_INODE, "%s: file "DFID" offset %llu > maxbytes "LPU64
1163 ": rc = %zd\n", ll_get_fsname(inode->i_sb, NULL, 0),
1164 PFID(&lli->lli_fid), end, ll_file_maxbytes(inode),
 rc);
1169 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1170 struct vvp_io *vio = vvp_env_io(env);
1171 bool range_locked = false;
/* O_APPEND writes lock the whole file range. */
1173 if (file->f_flags & O_APPEND)
1174 range_lock_init(&range, 0, LUSTRE_EOF);
1176 range_lock_init(&range, *ppos, *ppos + count - 1);
1178 vio->vui_fd = LUSTRE_FPRIVATE(file);
1179 vio->vui_io_subtype = args->via_io_subtype;
1181 switch (vio->vui_io_subtype) {
1183 vio->vui_iov = args->u.normal.via_iov;
1184 vio->vui_nrsegs = args->u.normal.via_nrsegs;
1185 vio->vui_tot_nrsegs = vio->vui_nrsegs;
1186 vio->vui_iocb = args->u.normal.via_iocb;
/* Group-locked files bypass the range lock. */
1187 if ((iot == CIT_WRITE) &&
1188 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1189 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
 RL_PARA(&range));
1191 result = range_lock(&lli->lli_write_tree,
 &range);
1196 range_locked = true;
1198 down_read(&lli->lli_trunc_sem);
1201 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1202 vio->u.splice.vui_flags = args->u.splice.via_flags;
1205 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1209 ll_cl_add(file, env, io);
1210 result = cl_io_loop(env, io);
1211 ll_cl_remove(file, env);
1213 if (args->via_io_subtype == IO_NORMAL)
1214 up_read(&lli->lli_trunc_sem);
1216 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
 RL_PARA(&range));
1218 range_unlock(&lli->lli_write_tree, &range);
1221 /* cl_io_rw_init() handled IO */
1222 result = io->ci_result;
1225 if (io->ci_nob > 0) {
1226 result = io->ci_nob;
1227 *ppos = io->u.ci_wr.wr.crw_pos;
1231 cl_io_fini(env, io);
1232 /* If any bit been read/written (result != 0), we just return
1233 * short read/write instead of restart io. */
1234 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1235 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zu\n",
1236 iot == CIT_READ ? "read" : "write",
1237 file->f_dentry->d_name.name, *ppos, count);
1238 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1242 if (iot == CIT_READ) {
1244 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1245 LPROC_LL_READ_BYTES, result);
1246 } else if (iot == CIT_WRITE) {
1248 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1249 LPROC_LL_WRITE_BYTES, result);
1250 fd->fd_write_failed = false;
1251 } else if (result != -ERESTARTSYS) {
1252 fd->fd_write_failed = true;
1255 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1262 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate a user-supplied iovec array and accumulate the total byte
 * count over all segments.  Copied from the kernel's
 * __generic_file_aio_write_nolock() (see comment above).
 *
 * NOTE(review): this excerpt has elided lines; the 'cnt' initialization,
 * the -EINVAL return path and the final store into *count/*nr_segs are
 * not visible here -- confirm against the full source.
 */
1264 static int ll_file_get_iov_count(const struct iovec *iov,
1265 unsigned long *nr_segs, size_t *count)
1270 for (seg = 0; seg < *nr_segs; seg++) {
1271 const struct iovec *iv = &iov[seg];
/* Reject any segment with a negative length, or a cumulative
 * length that wraps negative (checked via the sign bit of the
 * OR of the running count and this segment's length). */
1274 * If any segment has a negative length, or the cumulative
1275 * length ever wraps negative then return -EINVAL.
1278 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* access_ok() verifies the user buffer is readable; on failure
 * the tail of the iovec is trimmed (segment "no good" below). */
1280 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1285 cnt -= iv->iov_len; /* This segment is no good */
/*
 * AIO read entry point (.aio_read).  Validates the iovec, packs it into
 * per-thread vvp_io_args (IO_NORMAL subtype) and delegates the actual
 * work to ll_file_io_generic() with CIT_READ.
 *
 * Returns bytes read or a negative errno.  (Excerpt has elided lines.)
 */
1292 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1293 unsigned long nr_segs, loff_t pos)
1296 struct vvp_io_args *args;
/* validate the iovec and compute the total byte count */
1302 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1306 env = cl_env_get(&refcheck);
1308 RETURN(PTR_ERR(env));
/* stash the user iovec/iocb in the per-thread args for vvp_io */
1310 args = vvp_env_args(env, IO_NORMAL);
1311 args->u.normal.via_iov = (struct iovec *)iov;
1312 args->u.normal.via_nrsegs = nr_segs;
1313 args->u.normal.via_iocb = iocb;
/* iocb->ki_pos is updated by the generic IO path on success */
1315 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1316 &iocb->ki_pos, count);
1317 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) entry point (.read).  Builds a single-segment
 * iovec and a synchronous kiocb in per-thread (env) storage, then
 * forwards to ll_file_aio_read().  *ppos is updated from the kiocb
 * on return.  (Excerpt has elided lines.)
 */
1321 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1325 struct iovec *local_iov;
1326 struct kiocb *kiocb;
1331 env = cl_env_get(&refcheck);
1333 RETURN(PTR_ERR(env));
/* per-thread scratch iovec/kiocb -- avoids stack allocation */
1335 local_iov = &vvp_env_info(env)->vti_local_iov;
1336 kiocb = &vvp_env_info(env)->vti_kiocb;
1337 local_iov->iov_base = (void __user *)buf;
1338 local_iov->iov_len = count;
1339 init_sync_kiocb(kiocb, file);
1340 kiocb->ki_pos = *ppos;
/* field name differs across kernel versions (ki_left vs ki_nbytes) */
1341 #ifdef HAVE_KIOCB_KI_LEFT
1342 kiocb->ki_left = count;
1344 kiocb->ki_nbytes = count;
1347 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1348 *ppos = kiocb->ki_pos;
1350 cl_env_put(env, &refcheck);
1355 * Write to a file (through the page cache).
/*
 * AIO write entry point (.aio_write), mirror of ll_file_aio_read():
 * validates the iovec, fills per-thread vvp_io_args (IO_NORMAL) and
 * runs ll_file_io_generic() with CIT_WRITE.
 *
 * Returns bytes written or a negative errno.  (Excerpt has elided lines.)
 */
1358 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1359 unsigned long nr_segs, loff_t pos)
1362 struct vvp_io_args *args;
/* validate the iovec and compute the total byte count */
1368 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1372 env = cl_env_get(&refcheck);
1374 RETURN(PTR_ERR(env));
1376 args = vvp_env_args(env, IO_NORMAL);
1377 args->u.normal.via_iov = (struct iovec *)iov;
1378 args->u.normal.via_nrsegs = nr_segs;
1379 args->u.normal.via_iocb = iocb;
1381 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1382 &iocb->ki_pos, count);
1383 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) entry point (.write), mirror of ll_file_read():
 * wraps the user buffer in a single-segment iovec plus sync kiocb and
 * forwards to ll_file_aio_write().  *ppos is updated from the kiocb.
 * (Excerpt has elided lines.)
 */
1387 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1388 size_t count, loff_t *ppos)
1391 struct iovec *local_iov;
1392 struct kiocb *kiocb;
1397 env = cl_env_get(&refcheck);
1399 RETURN(PTR_ERR(env));
/* per-thread scratch iovec/kiocb -- avoids stack allocation */
1401 local_iov = &vvp_env_info(env)->vti_local_iov;
1402 kiocb = &vvp_env_info(env)->vti_kiocb;
1403 local_iov->iov_base = (void __user *)buf;
1404 local_iov->iov_len = count;
1405 init_sync_kiocb(kiocb, file);
1406 kiocb->ki_pos = *ppos;
/* field name differs across kernel versions (ki_left vs ki_nbytes) */
1407 #ifdef HAVE_KIOCB_KI_LEFT
1408 kiocb->ki_left = count;
1410 kiocb->ki_nbytes = count;
1413 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1414 *ppos = kiocb->ki_pos;
1416 cl_env_put(env, &refcheck);
1421 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: send file content (through the page cache)
 * to a pipe.  Uses the IO_SPLICE subtype of vvp_io_args and runs the
 * generic IO path with CIT_READ.  (Excerpt has elided lines.)
 */
1423 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1424 struct pipe_inode_info *pipe, size_t count,
1428 struct vvp_io_args *args;
1433 env = cl_env_get(&refcheck);
1435 RETURN(PTR_ERR(env));
/* splice variant carries the target pipe and splice flags */
1437 args = vvp_env_args(env, IO_SPLICE);
1438 args->u.splice.via_pipe = pipe;
1439 args->u.splice.via_flags = flags;
1441 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1442 cl_env_put(env, &refcheck);
/*
 * Set the LOV striping EA on a file by (re)opening it with an intent
 * that carries the user's lov_user_md.  Fails with -EEXIST if the file
 * already has a layout.  Serialized against size changes via the inode
 * size lock.  (Excerpt has elided lines, including the error label
 * bodies between "out_unlock" and "out".)
 */
1446 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1447 __u64 flags, struct lov_user_md *lum,
1450 struct lov_stripe_md *lsm = NULL;
/* open by FID so the MDS resolves the same object we hold */
1451 struct lookup_intent oit = {
1453 .it_flags = flags | MDS_OPEN_BY_FID,
/* a layout already exists -- striping can only be set once */
1458 lsm = ccc_inode_lsm_get(inode);
1460 ccc_inode_lsm_put(inode, lsm);
1461 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1462 PFID(ll_inode2fid(inode)));
1463 GOTO(out, rc = -EEXIST);
1466 ll_inode_size_lock(inode);
/* the open intent carries lum to the MDS, which creates the layout */
1467 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1469 GOTO(out_unlock, rc);
1471 rc = oit.d.lustre.it_status;
1473 GOTO(out_unlock, rc);
/* we only needed the intent open to install the EA; close it now */
1475 ll_release_openhandle(file->f_dentry, &oit);
1478 ll_inode_size_unlock(inode);
1479 ll_intent_release(&oit);
1480 ccc_inode_lsm_put(inode, lsm);
/* allow subsequent opens to create the layout normally again */
1482 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV metadata (striping EA) of @filename from the MDS.
 *
 * \param inode    [in]  parent directory inode
 * \param filename [in]  name of the child whose layout is wanted
 * \param lmmp     [out] pointer into the reply buffer holding the LOV MD
 * \param lmm_size [out] size of the returned LOV MD
 * \param request  [out] the ptlrpc request; caller must release it
 *                       (the lmm points into its reply buffer)
 *
 * The EA arrives in little-endian wire format; on big-endian hosts it
 * is swabbed in place to host endianness before being handed back.
 * (Excerpt has elided lines.)
 */
1487 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1488 struct lov_mds_md **lmmp, int *lmm_size,
1489 struct ptlrpc_request **request)
1491 struct ll_sb_info *sbi = ll_i2sbi(inode);
1492 struct mdt_body *body;
1493 struct lov_mds_md *lmm = NULL;
1494 struct ptlrpc_request *req = NULL;
1495 struct md_op_data *op_data;
/* size the getattr reply buffer for the largest possible EA */
1498 rc = ll_get_default_mdsize(sbi, &lmmsize);
1502 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1503 strlen(filename), lmmsize,
1504 LUSTRE_OPC_ANY, NULL);
1505 if (IS_ERR(op_data))
1506 RETURN(PTR_ERR(op_data));
1508 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1509 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1510 ll_finish_md_op_data(op_data);
1512 CDEBUG(D_INFO, "md_getattr_name failed "
1513 "on %s: rc %d\n", filename, rc);
1517 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1518 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1520 lmmsize = body->mbo_eadatasize;
/* no striping EA present at all -> -ENODATA */
1522 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1524 GOTO(out, rc = -ENODATA);
1527 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1528 LASSERT(lmm != NULL);
/* only plain V1/V3 layouts are understood here */
1530 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1531 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1532 GOTO(out, rc = -EPROTO);
1536 * This is coming from the MDS, so is probably in
1537 * little endian. We convert it to host endian before
1538 * passing it to userspace.
/* swab only on big-endian hosts (LOV_MAGIC != its LE form) */
1540 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1543 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1544 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1547 /* if function called for directory - we should
1548 * avoid swab not existent lsm objects */
1549 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1550 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
/* only regular files carry per-stripe objects to swab */
1551 if (S_ISREG(body->mbo_mode))
1552 lustre_swab_lov_user_md_objects(
1553 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1555 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1556 lustre_swab_lov_user_md_v3(
1557 (struct lov_user_md_v3 *)lmm);
1558 if (S_ISREG(body->mbo_mode))
1559 lustre_swab_lov_user_md_objects(
1560 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1567 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: set a raw striping EA that already names
 * its objects (MDS_OPEN_HAS_OBJS).  Admin-only (CAP_SYS_ADMIN) since
 * it trusts caller-supplied object IDs.  (Excerpt has elided lines.)
 */
1572 static int ll_lov_setea(struct inode *inode, struct file *file,
1575 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1576 struct lov_user_md *lump;
/* one lov_user_md header plus a single OST object entry */
1577 int lum_size = sizeof(struct lov_user_md) +
1578 sizeof(struct lov_user_ost_data);
1582 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1585 OBD_ALLOC_LARGE(lump, lum_size);
1589 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1590 OBD_FREE_LARGE(lump, lum_size);
1594 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1596 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the user buffer @lum by
 * asking the cl_object layer (cl_object_getstripe).
 * (Excerpt has elided lines.)
 */
1600 static int ll_file_getstripe(struct inode *inode,
1601 struct lov_user_md __user *lum)
1608 env = cl_env_get(&refcheck);
1610 RETURN(PTR_ERR(env));
1612 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1613 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md in, create
 * the layout via ll_lov_setstripe_ea_info(), then refresh the layout
 * generation and echo the resulting stripe info back to userspace.
 * (Excerpt has elided lines.)
 */
1617 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1620 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1621 struct lov_user_md *klum;
1623 __u64 flags = FMODE_WRITE;
/* copies and validates the user md into a kernel buffer */
1626 rc = ll_copy_user_md(lum, &klum);
1631 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
/* zero the user's stripe_count so getstripe below fills everything */
1635 put_user(0, &lum->lmm_stripe_count);
1637 ll_layout_refresh(inode, &gen);
1638 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1641 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a group (GID-based) lock on the file.
 * Only one group lock per file descriptor; gid 0 is rejected.  The
 * fd_flags/fd_grouplock pair is protected by lli->lli_lock, and the
 * check is redone after the (blocking) cl_get_grouplock() call to
 * handle a racing thread.  (Excerpt has elided lines.)
 */
1646 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1648 struct ll_inode_info *lli = ll_i2info(inode);
1649 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1650 struct ccc_grouplock grouplock;
1655 CWARN("group id for group lock must not be 0\n");
/* flock-disabled mounts cannot take group locks */
1659 if (ll_file_nolock(file))
1660 RETURN(-EOPNOTSUPP);
1662 spin_lock(&lli->lli_lock);
1663 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1664 CWARN("group lock already existed with gid %lu\n",
1665 fd->fd_grouplock.cg_gid);
1666 spin_unlock(&lli->lli_lock);
1669 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1670 spin_unlock(&lli->lli_lock);
/* may block unless O_NONBLOCK; lli_lock must be dropped first */
1672 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1673 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have won the race
 * while we were enqueuing */
1677 spin_lock(&lli->lli_lock);
1678 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1679 spin_unlock(&lli->lli_lock);
1680 CERROR("another thread just won the race\n");
1681 cl_put_grouplock(&grouplock);
1685 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1686 fd->fd_grouplock = grouplock;
1687 spin_unlock(&lli->lli_lock);
1689 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: drop the group lock held on this file
 * descriptor, verifying that @arg matches the gid it was taken with.
 * State is detached under lli->lli_lock, then the lock is released
 * outside the spinlock.  (Excerpt has elided lines.)
 */
1693 static int ll_put_grouplock(struct inode *inode, struct file *file,
1696 struct ll_inode_info *lli = ll_i2info(inode);
1697 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1698 struct ccc_grouplock grouplock;
1701 spin_lock(&lli->lli_lock);
1702 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1703 spin_unlock(&lli->lli_lock);
1704 CWARN("no group lock held\n");
1707 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* the gid must match the one the lock was taken with */
1709 if (fd->fd_grouplock.cg_gid != arg) {
1710 CWARN("group lock %lu doesn't match current id %lu\n",
1711 arg, fd->fd_grouplock.cg_gid);
1712 spin_unlock(&lli->lli_lock);
/* detach under the spinlock, release outside it */
1716 grouplock = fd->fd_grouplock;
1717 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1718 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1719 spin_unlock(&lli->lli_lock);
1721 cl_put_grouplock(&grouplock);
1722 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1727 * Close inode open handle
1729 * \param dentry [in] dentry which contains the inode
1730 * \param it [in,out] intent which contains open info and result
1733 * \retval <0 failure
/*
 * Close the MDS open handle carried by an intent (see block comment
 * above).  No-op for the filesystem root or when the intent holds no
 * open disposition.  Also drops the extra open reference taken by the
 * enqueue (DISP_ENQ_OPEN_REF).  (Excerpt has elided lines.)
 */
1735 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1737 struct inode *inode = dentry->d_inode;
1738 struct obd_client_handle *och;
1744 /* Root ? Do nothing. */
1745 if (dentry->d_inode->i_sb->s_root == dentry)
1748 /* No open handle to close? Move away */
1749 if (!it_disposition(it, DISP_OPEN_OPEN))
1752 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1754 OBD_ALLOC(och, sizeof(*och));
1756 GOTO(out, rc = -ENOMEM);
/* build the client handle from the intent's open reply */
1758 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1760 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1763 /* this one is in place of ll_file_open */
1764 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1765 ptlrpc_req_finished(it->d.lustre.it_data);
1766 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1772 * Get size for inode for which FIEMAP mapping is requested.
1773 * Make the FIEMAP get_info call and returns the result.
/*
 * Perform the FIEMAP extent-mapping call against the data (OST) export.
 * Validates the request flags, optionally flushes dirty pages
 * (FIEMAP_FLAG_SYNC), fills the fiemap key from the file's stripe md
 * and size, then issues obd_get_info(KEY_FIEMAP).  The mapped extents
 * land back in @fiemap.  (Excerpt has elided lines.)
 */
1775 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1778 struct obd_export *exp = ll_i2dtexp(inode);
1779 struct lov_stripe_md *lsm = NULL;
1780 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1781 __u32 vallen = num_bytes;
/* report unsupported flags back to the caller via fm_flags */
1785 /* Checks for fiemap flags */
1786 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1787 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1791 /* Check for FIEMAP_FLAG_SYNC */
1792 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1793 rc = filemap_fdatawrite(inode->i_mapping);
1798 lsm = ccc_inode_lsm_get(inode);
1802 /* If the stripe_count > 1 and the application does not understand
1803 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1805 if (lsm->lsm_stripe_count > 1 &&
1806 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1807 GOTO(out, rc = -EOPNOTSUPP);
1809 fm_key.oa.o_oi = lsm->lsm_oi;
1810 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
/* make sure we have an up-to-date size before deciding "empty" */
1812 if (i_size_read(inode) == 0) {
1813 rc = ll_glimpse_size(inode);
1818 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1819 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1820 /* If filesize is 0, then there would be no objects for mapping */
1821 if (fm_key.oa.o_size == 0) {
1822 fiemap->fm_mapped_extents = 0;
1826 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1828 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1831 CERROR("obd_get_info failed: rc = %d\n", rc);
1834 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Permission: CAP_DAC_READ_SEARCH, or mounts with LL_SBI_USER_FID2PATH.
 * The user-supplied gf_pathlen sizes the output buffer (capped at
 * PATH_MAX); the whole getinfo_fid2path struct round-trips through
 * obd_iocontrol().  (Excerpt has elided lines.)
 */
1838 int ll_fid2path(struct inode *inode, void __user *arg)
1840 struct obd_export *exp = ll_i2mdexp(inode);
1841 const struct getinfo_fid2path __user *gfin = arg;
1843 struct getinfo_fid2path *gfout;
1849 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1850 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1853 /* Only need to get the buflen */
1854 if (get_user(pathlen, &gfin->gf_pathlen))
/* bound the allocation requested by userspace */
1857 if (pathlen > PATH_MAX)
1860 outsize = sizeof(*gfout) + pathlen;
1861 OBD_ALLOC(gfout, outsize);
1865 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1866 GOTO(gf_free, rc = -EFAULT);
1868 /* Call mdc_iocontrol */
1869 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1873 if (copy_to_user(arg, gfout, outsize))
1877 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: allocate a kernel fiemap buffer sized for
 * the user's fm_extent_count (overflow-checked against SIZE_MAX), copy
 * the request in, run ll_do_fiemap(), and copy the header plus mapped
 * extents back out.  (Excerpt has elided lines.)
 */
1883 struct ll_user_fiemap *fiemap_s;
1884 size_t num_bytes, ret_bytes;
1885 unsigned int extent_count;
1888 /* Get the extent count so we can calculate the size of
1889 * required fiemap buffer */
1890 if (get_user(extent_count,
1891 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* reject extent counts whose buffer size would overflow size_t */
1895 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1897 num_bytes = sizeof(*fiemap_s) + (extent_count *
1898 sizeof(struct ll_fiemap_extent));
1900 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1901 if (fiemap_s == NULL)
1904 /* get the fiemap value */
1905 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1907 GOTO(error, rc = -EFAULT);
1909 /* If fm_extent_count is non-zero, read the first extent since
1910 * it is used to calculate end_offset and device from previous
1913 if (copy_from_user(&fiemap_s->fm_extents[0],
1914 (char __user *)arg + sizeof(*fiemap_s),
1915 sizeof(struct ll_fiemap_extent)))
1916 GOTO(error, rc = -EFAULT);
1919 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* copy back the header, plus the extents actually mapped */
1923 ret_bytes = sizeof(struct ll_user_fiemap);
1925 if (extent_count != 0)
1926 ret_bytes += (fiemap_s->fm_mapped_extents *
1927 sizeof(struct ll_fiemap_extent));
1929 if (copy_to_user((void __user *)arg, fiemap_s, ret_bytes))
1933 OBD_FREE_LARGE(fiemap_s, num_bytes);
1938 * Read the data_version for inode.
1940 * This value is computed using stripe object version on OST.
1941 * Version is computed using server side locking.
1943 * @param sync if do sync on the OST side;
1945 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1946 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Fetch the file's data version from the OSTs (see block comment
 * above for the @flags / sync semantics).  A file without objects is
 * treated as version 0.  The version is read via ll_lsm_getattr() and
 * only trusted when OBD_MD_FLDATAVERSION is set in the reply.
 * (Excerpt has elided lines.)
 */
1948 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1950 struct lov_stripe_md *lsm = NULL;
1951 struct ll_sb_info *sbi = ll_i2sbi(inode);
1952 struct obdo *obdo = NULL;
1956 /* If no stripe, we consider version is 0. */
1957 lsm = ccc_inode_lsm_get(inode);
1958 if (!lsm_has_objects(lsm)) {
1960 CDEBUG(D_INODE, "No object for inode\n");
1964 OBD_ALLOC_PTR(obdo);
1966 GOTO(out, rc = -ENOMEM);
1968 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
/* the OST must explicitly say the version field is valid */
1970 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1973 *data_version = obdo->o_data_version;
1979 ccc_inode_lsm_put(inode, lsm);
1984 * Trigger a HSM release request for the provided inode.
/*
 * Trigger an HSM release of the file's data: take a write lease,
 * flush and grab the latest data version, merge attributes, then close
 * the open handle with MDS_OPEN_RELEASE so the MDT can drop the
 * objects.  The lease lock handle is intentionally kept until the
 * close packs it (see comment below).  (Excerpt has elided lines.)
 */
1986 int ll_hsm_release(struct inode *inode)
1988 struct cl_env_nest nest;
1990 struct obd_client_handle *och = NULL;
1991 __u64 data_version = 0;
1995 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1996 ll_get_fsname(inode->i_sb, NULL, 0),
1997 PFID(&ll_i2info(inode)->lli_fid));
/* exclusive write lease guarantees no concurrent modification */
1999 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2001 GOTO(out, rc = PTR_ERR(och));
2003 /* Grab latest data_version and [am]time values */
2004 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2008 env = cl_env_nested_get(&nest);
2010 GOTO(out, rc = PTR_ERR(env));
2012 ll_merge_attr(env, inode);
2013 cl_env_nested_put(&nest, env);
2015 /* Release the file.
2016 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2017 * we still need it to pack l_remote_handle to MDT. */
2018 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
/* on error paths the lease open handle must still be closed */
2024 if (och != NULL && !IS_ERR(och)) /* close the file */
2025 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped,
 * saved [am]time attrs to restore afterwards, and the per-file
 * data-version check flags (kept as bools so they can be swap()ed
 * along with the inodes).
 */
2030 struct ll_swap_stack {
2031 struct iattr ia1, ia2;
2033 struct inode *inode1, *inode2;
2034 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS worker: atomically exchange the layouts of
 * two files on the MDT.
 *
 * Steps: validate (regular files, writable, same sb), order the pair
 * by FID to avoid lock inversion, optionally take group locks to flush
 * dirty cache, verify the data versions have not changed if requested,
 * send the swap through obd_iocontrol(), then restore mtime/atime if
 * the caller asked to keep them.  (Excerpt has elided lines.)
 */
2037 static int ll_swap_layouts(struct file *file1, struct file *file2,
2038 struct lustre_swap_layouts *lsl)
2040 struct mdc_swap_layouts msl;
2041 struct md_op_data *op_data;
2044 struct ll_swap_stack *llss = NULL;
2047 OBD_ALLOC_PTR(llss);
2051 llss->inode1 = file1->f_dentry->d_inode;
2052 llss->inode2 = file2->f_dentry->d_inode;
2054 if (!S_ISREG(llss->inode2->i_mode))
2055 GOTO(free, rc = -EINVAL);
/* both files must be writable by the caller */
2057 if (inode_permission(llss->inode1, MAY_WRITE) ||
2058 inode_permission(llss->inode2, MAY_WRITE))
2059 GOTO(free, rc = -EPERM);
2061 if (llss->inode2->i_sb != llss->inode1->i_sb)
2062 GOTO(free, rc = -EXDEV);
2064 /* we use 2 bool because it is easier to swap than 2 bits */
2065 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2066 llss->check_dv1 = true;
2068 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2069 llss->check_dv2 = true;
2071 /* we cannot use lsl->sl_dvX directly because we may swap them */
2072 llss->dv1 = lsl->sl_dv1;
2073 llss->dv2 = lsl->sl_dv2;
2075 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2076 if (rc == 0) /* same file, done! */
/* order by FID so concurrent swaps always lock in the same order */
2079 if (rc < 0) { /* sequentialize it */
2080 swap(llss->inode1, llss->inode2);
2082 swap(llss->dv1, llss->dv2);
2083 swap(llss->check_dv1, llss->check_dv2);
2087 if (gid != 0) { /* application asks to flush dirty cache */
2088 rc = ll_get_grouplock(llss->inode1, file1, gid);
2092 rc = ll_get_grouplock(llss->inode2, file2, gid);
2094 ll_put_grouplock(llss->inode1, file1, gid);
2099 /* to be able to restore mtime and atime after swap
2100 * we need to first save them */
2102 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2103 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2104 llss->ia1.ia_atime = llss->inode1->i_atime;
2105 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2106 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2107 llss->ia2.ia_atime = llss->inode2->i_atime;
2108 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2111 /* ultimate check, before swaping the layouts we check if
2112 * dataversion has changed (if requested) */
2113 if (llss->check_dv1) {
2114 rc = ll_data_version(llss->inode1, &dv, 0);
2117 if (dv != llss->dv1)
2118 GOTO(putgl, rc = -EAGAIN);
2121 if (llss->check_dv2) {
2122 rc = ll_data_version(llss->inode2, &dv, 0);
2125 if (dv != llss->dv2)
2126 GOTO(putgl, rc = -EAGAIN);
2129 /* struct md_op_data is used to send the swap args to the mdt
2130 * only flags is missing, so we use struct mdc_swap_layouts
2131 * through the md_op_data->op_data */
2132 /* flags from user space have to be converted before they are send to
2133 * server, no flag is sent today, they are only used on the client */
2136 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2137 0, LUSTRE_OPC_ANY, &msl);
2138 if (IS_ERR(op_data))
2139 GOTO(free, rc = PTR_ERR(op_data));
2141 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2142 sizeof(*op_data), op_data, NULL);
2143 ll_finish_md_op_data(op_data);
/* drop group locks in reverse acquisition order */
2147 ll_put_grouplock(llss->inode2, file2, gid);
2148 ll_put_grouplock(llss->inode1, file1, gid);
2151 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2155 /* clear useless flags */
2156 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2157 llss->ia1.ia_valid &= ~ATTR_MTIME;
2158 llss->ia2.ia_valid &= ~ATTR_MTIME;
2161 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2162 llss->ia1.ia_valid &= ~ATTR_ATIME;
2163 llss->ia2.ia_valid &= ~ATTR_ATIME;
2166 /* update time if requested */
/* note: ia2 is applied to file1 and ia1 to file2, because the
 * layouts (and thus the data the times describe) were exchanged */
2168 if (llss->ia2.ia_valid != 0) {
2169 mutex_lock(&llss->inode1->i_mutex);
2170 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2171 mutex_unlock(&llss->inode1->i_mutex);
2174 if (llss->ia1.ia_valid != 0) {
2177 mutex_lock(&llss->inode2->i_mutex);
2178 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2179 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Set/clear HSM state flags on @inode via the MDT.  Non-root callers
 * may only touch flags within HSM_USER_MASK; anything else requires
 * CAP_SYS_ADMIN.  (Excerpt has elided lines.)
 */
2191 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2193 struct md_op_data *op_data;
2196 /* Non-root users are forbidden to set or clear flags which are
2197 * NOT defined in HSM_USER_MASK. */
2198 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2199 !cfs_capable(CFS_CAP_SYS_ADMIN))
/* hss rides to the MDT inside op_data->op_data */
2202 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2203 LUSTRE_OPC_ANY, hss);
2204 if (IS_ERR(op_data))
2205 RETURN(PTR_ERR(op_data));
2207 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2208 sizeof(*op_data), op_data, NULL);
2210 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT worker: register an already-archived file with HSM.
 * Marks the file ARCHIVED|EXISTS|RELEASED for the given archive id,
 * then forces the saved mode/uid/gid/size/times from @hui onto the
 * inode with ll_setattr_raw().  Regular files only.
 * (Excerpt has elided lines.)
 */
2215 static int ll_hsm_import(struct inode *inode, struct file *file,
2216 struct hsm_user_import *hui)
2218 struct hsm_state_set *hss = NULL;
2219 struct iattr *attr = NULL;
2223 if (!S_ISREG(inode->i_mode))
2229 GOTO(out, rc = -ENOMEM);
/* the imported copy lives in the archive: mark it released here */
2231 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2232 hss->hss_archive_id = hui->hui_archive_id;
2233 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2234 rc = ll_hsm_state_set(inode, hss);
2238 OBD_ALLOC_PTR(attr);
2240 GOTO(out, rc = -ENOMEM);
/* restore the attributes recorded at archive time */
2242 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2243 attr->ia_mode |= S_IFREG;
2244 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2245 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2246 attr->ia_size = hui->hui_size;
2247 attr->ia_mtime.tv_sec = hui->hui_mtime;
2248 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2249 attr->ia_atime.tv_sec = hui->hui_atime;
2250 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
/* ATTR_FORCE skips the usual permission checks on this setattr */
2252 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2253 ATTR_UID | ATTR_GID |
2254 ATTR_MTIME | ATTR_MTIME_SET |
2255 ATTR_ATIME | ATTR_ATIME_SET;
2257 mutex_lock(&inode->i_mutex);
2259 rc = ll_setattr_raw(file->f_dentry, attr, true);
2263 mutex_unlock(&inode->i_mutex);
/*
 * Translate an open fmode into the LL_LEASE_{RD,WR}LCK bitmask
 * reported to userspace by the lease ioctls (0 = no lease).
 */
2275 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2277 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2278 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main ioctl dispatcher for regular files (.unlocked_ioctl).
 * Handles file flags, striping get/set, layout swap, FIEMAP, group
 * locks, FID/path translation, data version, HSM state/action/import,
 * and lease get/set; anything unrecognized falls through to the
 * dynamic ioctl table (ll_iocontrol_call) and finally to the OSC via
 * obd_iocontrol().  (Excerpt has elided lines.)
 */
2282 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2284 struct inode *inode = file->f_dentry->d_inode;
2285 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2289 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2290 PFID(ll_inode2fid(inode)), inode, cmd);
2291 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2293 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2294 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2298 case LL_IOC_GETFLAGS:
2299 /* Get the current value of the file flags */
2300 return put_user(fd->fd_flags, (int __user *)arg);
2301 case LL_IOC_SETFLAGS:
2302 case LL_IOC_CLRFLAGS:
2303 /* Set or clear specific file flags */
2304 /* XXX This probably needs checks to ensure the flags are
2305 * not abused, and to handle any flag side effects.
2307 if (get_user(flags, (int __user *) arg))
2310 if (cmd == LL_IOC_SETFLAGS) {
/* lockless IO is only safe when the page cache is bypassed */
2311 if ((flags & LL_FILE_IGNORE_LOCK) &&
2312 !(file->f_flags & O_DIRECT)) {
2313 CERROR("%s: unable to disable locking on "
2314 "non-O_DIRECT file\n", current->comm);
2318 fd->fd_flags |= flags;
2320 fd->fd_flags &= ~flags;
2323 case LL_IOC_LOV_SETSTRIPE:
2324 RETURN(ll_lov_setstripe(inode, file, arg));
2325 case LL_IOC_LOV_SETEA:
2326 RETURN(ll_lov_setea(inode, file, arg));
2327 case LL_IOC_LOV_SWAP_LAYOUTS: {
2329 struct lustre_swap_layouts lsl;
2331 if (copy_from_user(&lsl, (char __user *)arg,
2332 sizeof(struct lustre_swap_layouts)))
/* both files must be open for write to swap layouts */
2335 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2338 file2 = fget(lsl.sl_fd);
2343 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2344 rc = ll_swap_layouts(file, file2, &lsl);
2348 case LL_IOC_LOV_GETSTRIPE:
2349 RETURN(ll_file_getstripe(inode,
2350 (struct lov_user_md __user *)arg));
2351 case FSFILT_IOC_FIEMAP:
2352 RETURN(ll_ioctl_fiemap(inode, arg));
2353 case FSFILT_IOC_GETFLAGS:
2354 case FSFILT_IOC_SETFLAGS:
2355 RETURN(ll_iocontrol(inode, file, cmd, arg));
2356 case FSFILT_IOC_GETVERSION_OLD:
2357 case FSFILT_IOC_GETVERSION:
2358 RETURN(put_user(inode->i_generation, (int __user *)arg));
2359 case LL_IOC_GROUP_LOCK:
2360 RETURN(ll_get_grouplock(inode, file, arg));
2361 case LL_IOC_GROUP_UNLOCK:
2362 RETURN(ll_put_grouplock(inode, file, arg));
2363 case IOC_OBD_STATFS:
2364 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2366 /* We need to special case any other ioctls we want to handle,
2367 * to send them to the MDS/OST as appropriate and to properly
2368 * network encode the arg field.
2369 case FSFILT_IOC_SETVERSION_OLD:
2370 case FSFILT_IOC_SETVERSION:
2372 case LL_IOC_FLUSHCTX:
2373 RETURN(ll_flush_ctx(inode));
2374 case LL_IOC_PATH2FID: {
2375 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2376 sizeof(struct lu_fid)))
2381 case LL_IOC_GETPARENT:
2382 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2384 case OBD_IOC_FID2PATH:
2385 RETURN(ll_fid2path(inode, (void __user *)arg));
2386 case LL_IOC_DATA_VERSION: {
2387 struct ioc_data_version idv;
2390 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* only the flush-mode bits are meaningful from userspace */
2393 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2394 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2397 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2403 case LL_IOC_GET_MDTIDX: {
2406 mdtidx = ll_get_mdt_idx(inode);
2410 if (put_user((int)mdtidx, (int __user *)arg))
2415 case OBD_IOC_GETDTNAME:
2416 case OBD_IOC_GETMDNAME:
2417 RETURN(ll_get_obd_name(inode, cmd, arg));
2418 case LL_IOC_HSM_STATE_GET: {
2419 struct md_op_data *op_data;
2420 struct hsm_user_state *hus;
2427 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2428 LUSTRE_OPC_ANY, hus);
2429 if (IS_ERR(op_data)) {
2431 RETURN(PTR_ERR(op_data));
2434 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2437 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2440 ll_finish_md_op_data(op_data);
2444 case LL_IOC_HSM_STATE_SET: {
2445 struct hsm_state_set *hss;
2452 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2457 rc = ll_hsm_state_set(inode, hss);
2462 case LL_IOC_HSM_ACTION: {
2463 struct md_op_data *op_data;
2464 struct hsm_current_action *hca;
2471 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2472 LUSTRE_OPC_ANY, hca);
2473 if (IS_ERR(op_data)) {
2475 RETURN(PTR_ERR(op_data));
2478 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2481 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2484 ll_finish_md_op_data(op_data);
2488 case LL_IOC_SET_LEASE: {
2489 struct ll_inode_info *lli = ll_i2info(inode);
2490 struct obd_client_handle *och = NULL;
/* requested lease mode must not exceed the open mode */
2495 case LL_LEASE_WRLCK:
2496 if (!(file->f_mode & FMODE_WRITE))
2498 fmode = FMODE_WRITE;
2500 case LL_LEASE_RDLCK:
2501 if (!(file->f_mode & FMODE_READ))
2505 case LL_LEASE_UNLCK:
2506 mutex_lock(&lli->lli_och_mutex);
2507 if (fd->fd_lease_och != NULL) {
2508 och = fd->fd_lease_och;
2509 fd->fd_lease_och = NULL;
2511 mutex_unlock(&lli->lli_och_mutex);
2516 fmode = och->och_flags;
2517 rc = ll_lease_close(och, inode, &lease_broken);
2524 RETURN(ll_lease_type_from_fmode(fmode));
2529 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2531 /* apply for lease */
2532 och = ll_lease_open(inode, file, fmode, 0);
2534 RETURN(PTR_ERR(och));
/* attach the new lease to the fd unless one already exists */
2537 mutex_lock(&lli->lli_och_mutex);
2538 if (fd->fd_lease_och == NULL) {
2539 fd->fd_lease_och = och;
2542 mutex_unlock(&lli->lli_och_mutex);
2544 /* impossible now that only excl is supported for now */
2545 ll_lease_close(och, inode, &lease_broken);
2550 case LL_IOC_GET_LEASE: {
2551 struct ll_inode_info *lli = ll_i2info(inode);
2552 struct ldlm_lock *lock = NULL;
2555 mutex_lock(&lli->lli_och_mutex);
2556 if (fd->fd_lease_och != NULL) {
2557 struct obd_client_handle *och = fd->fd_lease_och;
/* the lease only counts while its DLM lock is uncancelled */
2559 lock = ldlm_handle2lock(&och->och_lease_handle);
2561 lock_res_and_lock(lock);
2562 if (!ldlm_is_cancel(lock))
2563 fmode = och->och_flags;
2565 unlock_res_and_lock(lock);
2566 LDLM_LOCK_PUT(lock);
2569 mutex_unlock(&lli->lli_och_mutex);
2571 RETURN(ll_lease_type_from_fmode(fmode));
2573 case LL_IOC_HSM_IMPORT: {
2574 struct hsm_user_import *hui;
2580 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2585 rc = ll_hsm_import(inode, file, hui);
/* unknown cmd: try dynamically-registered handlers, then the OSC */
2595 ll_iocontrol_call(inode, file, cmd, arg, &err))
2598 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2599 (void __user *)arg));
2604 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Clamp/validate @offset against @maxsize and commit it to f_pos,
 * resetting f_version on a position change.  Local fallback for
 * kernels without generic_file_llseek_size (see #ifndef above).
 * (Excerpt has elided lines: the error returns are not visible.)
 */
2605 static inline loff_t
2606 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
/* negative offsets only allowed for FMODE_UNSIGNED_OFFSET files */
2608 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2610 if (offset > maxsize)
2613 if (offset != file->f_pos) {
2614 file->f_pos = offset;
2615 file->f_version = 0;
/*
 * Local copy of the kernel's generic_file_llseek_size() for older
 * kernels (inside #ifndef HAVE_FILE_LLSEEK_SIZE): handle SEEK_CUR
 * relative seeks (under i_mutex here), SEEK_DATA/SEEK_HOLE against a
 * caller-supplied @eof, and clamp everything via llseek_execute().
 * (Excerpt has elided lines, including the switch statement itself.)
 */
2621 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2622 loff_t maxsize, loff_t eof)
2624 struct inode *inode = file->f_dentry->d_inode;
2632 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2633 * position-querying operation. Avoid rewriting the "same"
2634 * f_pos value back to the file because a concurrent read(),
2635 * write() or lseek() might have altered it
2640 * f_lock protects against read/modify/write race with other
2641 * SEEK_CURs. Note that parallel writes and reads behave
/* relative seek: f_pos read+update must be atomic wrt other seeks */
2644 mutex_lock(&inode->i_mutex);
2645 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2646 mutex_unlock(&inode->i_mutex);
2650 * In the generic case the entire file is data, so as long as
2651 * offset isn't at the end of the file then the offset is data.
2658 * There is a virtual hole at the end of the file, so as long as
2659 * offset isn't i_size or larger, return i_size.
2667 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point (.llseek).  For SEEK_END/SEEK_HOLE/SEEK_DATA the
 * true size must be fetched from the OSTs first (ll_glimpse_size);
 * the actual positioning is delegated to ll_generic_file_llseek_size()
 * bounded by the client's maximum file size.  (Excerpt has elided lines.)
 */
2671 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2673 struct inode *inode = file->f_dentry->d_inode;
2674 loff_t retval, eof = 0;
/* precompute the absolute target purely for the trace message */
2677 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2678 (origin == SEEK_CUR) ? file->f_pos : 0);
2679 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2680 PFID(ll_inode2fid(inode)), inode, retval, retval,
2682 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* these origins depend on i_size, which needs a glimpse lock */
2684 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2685 retval = ll_glimpse_size(inode);
2688 eof = i_size_read(inode);
2691 retval = ll_generic_file_llseek_size(file, offset, origin,
2692 ll_file_maxbytes(inode), eof);
/*
 * flush entry point (.flush), called on every close of a file
 * descriptor: report (and clear) async writeback errors recorded on
 * the inode/objects, returning -EIO once per error unless the write
 * failure was already reported to this fd (fd_write_failed).
 * (Excerpt has elided lines.)
 */
2696 static int ll_flush(struct file *file, fl_owner_t id)
2698 struct inode *inode = file->f_dentry->d_inode;
2699 struct ll_inode_info *lli = ll_i2info(inode);
2700 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2703 LASSERT(!S_ISDIR(inode->i_mode));
2705 /* catch async errors that were recorded back when async writeback
2706 * failed for pages in this mapping. */
/* read-and-clear: each error is reported at most once */
2707 rc = lli->lli_async_rc;
2708 lli->lli_async_rc = 0;
2709 if (lli->lli_clob != NULL) {
2710 err = lov_read_and_clear_async_rc(lli->lli_clob);
2715 /* The application has been told write failure already.
2716 * Do not report failure again. */
2717 if (fd->fd_write_failed)
2719 return rc ? -EIO : 0;
2723 * Called to make sure a portion of the file has been written out.
2724 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2726 * Return how many pages have been written.
/*
 * Write out [start, end] of @inode through a CIT_FSYNC cl_io.
 * @mode selects local flush vs. OST_SYNC vs. discard (see cl_fsync_mode).
 * Returns the number of pages written on success, negative errno on error.
 */
2728 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2729 enum cl_fsync_mode mode, int ignore_layout)
2731 struct cl_env_nest nest;
2734 struct obd_capa *capa = NULL;
2735 struct cl_fsync_io *fio;
/* Reject unknown sync modes up front. */
2739 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2740 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2743 env = cl_env_nested_get(&nest);
2745 RETURN(PTR_ERR(env));
2747 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2749 io = ccc_env_thread_io(env);
2750 io->ci_obj = ll_i2info(inode)->lli_clob;
2751 io->ci_ignore_layout = ignore_layout;
2753 /* initialize parameters for sync */
2754 fio = &io->u.ci_fsync;
2755 fio->fi_capa = capa;
2756 fio->fi_start = start;
2758 fio->fi_fid = ll_inode2fid(inode);
2759 fio->fi_mode = mode;
2760 fio->fi_nr_written = 0;
/* Run the fsync io; on success report how many pages went out. */
2762 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2763 result = cl_io_loop(env, io);
2765 result = io->ci_result;
2767 result = fio->fi_nr_written;
2768 cl_io_fini(env, io);
2769 cl_env_nested_put(&nest, env);
2777 * When dentry is provided (the 'else' case), *file->f_dentry may be
2778 * null and dentry must be used directly rather than pulled from
2779 * *file->f_dentry as is done otherwise.
/*
 * fsync()/fdatasync() handler. The three #ifdef'd prototypes cover the
 * kernel API variants (4-arg range fsync, 2-arg, and the old 3-arg form
 * that passes the dentry explicitly); older variants sync the whole file.
 * Flushes dirty pages, collects recorded async writeback errors, syncs
 * metadata via MDC and, for regular files, data via cl_sync_file_range().
 */
2782 #ifdef HAVE_FILE_FSYNC_4ARGS
2783 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2785 struct dentry *dentry = file->f_dentry;
2786 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2787 int ll_fsync(struct file *file, int datasync)
2789 struct dentry *dentry = file->f_dentry;
2791 loff_t end = LLONG_MAX;
2793 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2796 loff_t end = LLONG_MAX;
2798 struct inode *inode = dentry->d_inode;
2799 struct ll_inode_info *lli = ll_i2info(inode);
2800 struct ptlrpc_request *req;
2801 struct obd_capa *oc;
2805 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2806 PFID(ll_inode2fid(inode)), inode);
2807 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2809 #ifdef HAVE_FILE_FSYNC_4ARGS
/* Newer kernels do not flush for us before calling ->fsync(), so write
 * and wait for the requested range here, under i_mutex. */
2810 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2811 mutex_lock(&inode->i_mutex);
2813 /* fsync's caller has already called _fdata{sync,write}, we want
2814 * that IO to finish before calling the osc and mdc sync methods */
2815 rc = filemap_fdatawait(inode->i_mapping);
2818 /* catch async errors that were recorded back when async writeback
2819 * failed for pages in this mapping. */
2820 if (!S_ISDIR(inode->i_mode)) {
2821 err = lli->lli_async_rc;
2822 lli->lli_async_rc = 0;
2825 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync the metadata on the MDT. */
2830 oc = ll_mdscapa_get(inode);
2831 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2837 ptlrpc_req_finished(req);
2839 if (S_ISREG(inode->i_mode)) {
2840 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* CL_FSYNC_ALL: flush pages and send OST_SYNC RPCs to the OSTs.
 * Track per-descriptor write failure state for ll_flush(). */
2842 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2843 if (rc == 0 && err < 0)
2846 fd->fd_write_failed = true;
2848 fd->fd_write_failed = false;
2851 #ifdef HAVE_FILE_FSYNC_4ARGS
2852 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() byte-range lock handler. Translates the VFS file_lock
 * into an LDLM_FLOCK enqueue on the MDT, then mirrors the result into the
 * local lock tables (flock_lock_file_wait / posix_lock_file_wait) so the
 * kernel's bookkeeping matches the cluster-wide state.
 */
2858 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2860 struct inode *inode = file->f_dentry->d_inode;
2861 struct ll_sb_info *sbi = ll_i2sbi(inode);
2862 struct ldlm_enqueue_info einfo = {
2863 .ei_type = LDLM_FLOCK,
2864 .ei_cb_cp = ldlm_flock_completion_ast,
2865 .ei_cbdata = file_lock,
2867 struct md_op_data *op_data;
2868 struct lustre_handle lockh = {0};
2869 ldlm_policy_data_t flock = {{0}};
2870 int fl_type = file_lock->fl_type;
2876 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2877 PFID(ll_inode2fid(inode)), file_lock);
2879 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2881 if (file_lock->fl_flags & FL_FLOCK) {
2882 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2883 /* flocks are whole-file locks */
2884 flock.l_flock.end = OFFSET_MAX;
2885 /* For flocks owner is determined by the local file descriptor */
2886 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2887 } else if (file_lock->fl_flags & FL_POSIX) {
2888 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2889 flock.l_flock.start = file_lock->fl_start;
2890 flock.l_flock.end = file_lock->fl_end;
2894 flock.l_flock.pid = file_lock->fl_pid;
2896 /* Somewhat ugly workaround for svc lockd.
2897 * lockd installs custom fl_lmops->lm_compare_owner that checks
2898 * for the fl_owner to be the same (which it always is on local node
2899 * I guess between lockd processes) and then compares pid.
2900 * As such we assign pid to the owner field to make it all work,
2901 * conflict with normal locks is unlikely since pid space and
2902 * pointer space for current->files are not intersecting */
2903 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2904 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type onto an LDLM mode (PR=read, PW=write). */
2908 einfo.ei_mode = LCK_PR;
2911 /* An unlock request may or may not have any relation to
2912 * existing locks so we may not be able to pass a lock handle
2913 * via a normal ldlm_lock_cancel() request. The request may even
2914 * unlock a byte range in the middle of an existing lock. In
2915 * order to process an unlock request we need all of the same
2916 * information that is given with a normal read or write record
2917 * lock request. To avoid creating another ldlm unlock (cancel)
2918 * message we'll treat a LCK_NL flock request as an unlock. */
2919 einfo.ei_mode = LCK_NL;
2922 einfo.ei_mode = LCK_PW;
2925 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map the fcntl command onto enqueue flags. */
2940 flags = LDLM_FL_BLOCK_NOWAIT;
2946 flags = LDLM_FL_TEST_LOCK;
2949 CERROR("unknown fcntl lock command: %d\n", cmd);
2953 /* Save the old mode so that if the mode in the lock changes we
2954 * can decrement the appropriate reader or writer refcount. */
2955 file_lock->fl_type = einfo.ei_mode;
2957 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2958 LUSTRE_OPC_ANY, NULL);
2959 if (IS_ERR(op_data))
2960 RETURN(PTR_ERR(op_data));
2962 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2963 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2964 flock.l_flock.pid, flags, einfo.ei_mode,
2965 flock.l_flock.start, flock.l_flock.end);
/* Enqueue the flock on the MDT; this may block for F_SETLKW. */
2967 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2970 /* Restore the file lock type if not TEST lock. */
2971 if (!(flags & LDLM_FL_TEST_LOCK))
2972 file_lock->fl_type = fl_type;
/* Mirror a successful grant (or unlock) into the kernel's local
 * lock lists so fcntl/flock state stays consistent. */
2974 if ((file_lock->fl_flags & FL_FLOCK) &&
2975 (rc == 0 || file_lock->fl_type == F_UNLCK))
2976 rc2 = flock_lock_file_wait(file, file_lock);
2977 if ((file_lock->fl_flags & FL_POSIX) &&
2978 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2979 !(flags & LDLM_FL_TEST_LOCK))
2980 rc2 = posix_lock_file_wait(file, file_lock);
2982 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* Local bookkeeping failed: drop the cluster lock again by
 * enqueueing an LCK_NL (unlock) request. */
2983 einfo.ei_mode = LCK_NL;
2984 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2989 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name (length @namelen) under @parent via a
 * getattr-by-name RPC to the MDT. On success *fid is filled in.
 * Returns 0 or negative errno.
 */
2994 int ll_get_fid_by_name(struct inode *parent, const char *name,
2995 int namelen, struct lu_fid *fid)
2997 struct md_op_data *op_data = NULL;
2998 struct mdt_body *body;
2999 struct ptlrpc_request *req;
3003 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3004 LUSTRE_OPC_ANY, NULL);
3005 if (IS_ERR(op_data))
3006 RETURN(PTR_ERR(op_data));
/* We only need the FID back, nothing else. */
3008 op_data->op_valid = OBD_MD_FLID;
3009 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3010 ll_finish_md_op_data(op_data);
3014 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3016 GOTO(out_req, rc = -EFAULT);
3018 *fid = body->mbo_fid1;
3020 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx (DNE).
 * Resolves the child's FID (from the dcache if possible, otherwise by
 * RPC), invalidates local aliases, and issues a CLI_MIGRATE rename.
 */
3024 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3025 const char *name, int namelen)
3027 struct dentry *dchild = NULL;
3028 struct inode *child_inode = NULL;
3029 struct md_op_data *op_data;
3030 struct ptlrpc_request *request = NULL;
3035 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3036 name, PFID(ll_inode2fid(parent)), mdtidx);
3038 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3039 0, LUSTRE_OPC_ANY, NULL);
3040 if (IS_ERR(op_data))
3041 RETURN(PTR_ERR(op_data));
3043 /* Get child FID first */
3044 qstr.hash = full_name_hash(name, namelen);
3047 dchild = d_lookup(file->f_dentry, &qstr);
3048 if (dchild != NULL) {
3049 if (dchild->d_inode != NULL) {
/* Hold the child inode and its i_mutex across the migration
 * so concurrent users see a consistent view. */
3050 child_inode = igrab(dchild->d_inode);
3051 if (child_inode != NULL) {
3052 mutex_lock(&child_inode->i_mutex);
3053 op_data->op_fid3 = *ll_inode2fid(child_inode);
3054 ll_invalidate_aliases(child_inode);
/* Not in the dcache: ask the MDT for the FID. */
3059 rc = ll_get_fid_by_name(parent, name, namelen,
3065 if (!fid_is_sane(&op_data->op_fid3)) {
3066 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3067 ll_get_fsname(parent->i_sb, NULL, 0), name,
3068 PFID(&op_data->op_fid3));
3069 GOTO(out_free, rc = -EINVAL);
/* Nothing to do if the object already lives on the target MDT. */
3072 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3077 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3078 PFID(&op_data->op_fid3), mdtidx);
3079 GOTO(out_free, rc = 0);
/* Migration is implemented as a same-name rename with CLI_MIGRATE. */
3082 op_data->op_mds = mdtidx;
3083 op_data->op_cli_flags = CLI_MIGRATE;
3084 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3085 namelen, name, namelen, &request);
3087 ll_update_times(request, parent);
3089 ptlrpc_req_finished(request);
3094 if (child_inode != NULL) {
/* The old inode was replaced by the migrated copy; drop it. */
3095 clear_nlink(child_inode);
3096 mutex_unlock(&child_inode->i_mutex);
3100 ll_finish_md_op_data(op_data);
3105 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3113 * test if some locks matching bits and l_req_mode are acquired
3114 * - bits can be in different locks
3115 * - if found clear the common lock bits in *bits
3116 * - the bits not found, are kept in *bits
3118 * \param bits [IN] searched lock bits
3119 * \param l_req_mode [IN] searched lock mode
3120 * \retval boolean, true iff all bits are found
/*
 * Test whether MD ibits locks covering *bits are already cached locally
 * (LDLM_FL_TEST_LOCK match, one bit at a time). Matched bits are cleared
 * from *bits; unmatched bits remain. Returns true iff all bits matched.
 */
3122 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3124 struct lustre_handle lockh;
3125 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode will do". */
3126 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3127 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3136 fid = &ll_i2info(inode)->lli_fid;
3137 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3138 ldlm_lockname[mode]);
3140 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3141 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3142 policy.l_inodebits.bits = *bits & (1 << i);
3143 if (policy.l_inodebits.bits == 0)
3146 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3147 &policy, mode, &lockh)) {
3148 struct ldlm_lock *lock;
3150 lock = ldlm_handle2lock(&lockh);
/* Clear every bit the matched lock actually carries, not
 * just the one we probed with. */
3153 ~(lock->l_policy_data.l_inodebits.bits);
3154 LDLM_LOCK_PUT(lock);
3156 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and take a reference on) a cached MD ibits lock covering @bits.
 * On success *lockh is filled and the granted mode is returned; 0 if no
 * matching lock is cached.
 */
3163 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3164 struct lustre_handle *lockh, __u64 flags,
3167 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3172 fid = &ll_i2info(inode)->lli_fid;
3173 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3175 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3176 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: turn -ENOENT on
 * non-regular/non-directory inodes into success (object already unlinked),
 * and log other failures.
 */
3181 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3183 /* Already unlinked. Just update nlink and return success */
3184 if (rc == -ENOENT) {
3186 /* This path cannot be hit for regular files unless in
3187 * case of obscure races, so no need to validate
3189 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3191 } else if (rc != 0) {
/* EACCES/EIDRM are expected under permission/identity races;
 * log those quietly, everything else loudly. */
3192 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3193 "%s: revalidate FID "DFID" error: rc = %d\n",
3194 ll_get_fsname(inode->i_sb, NULL, 0),
3195 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes against the MDT for the lock bits
 * in @ibits. Two strategies: an intent lock enqueue (IT_GETATTR/IT_LOOKUP)
 * when the server supports OBD_CONNECT_ATTRFID, otherwise a plain
 * md_getattr() guarded by ll_have_md_lock() to skip the RPC when a cached
 * ibits lock already guarantees fresh attributes.
 */
3201 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3203 struct inode *inode = dentry->d_inode;
3204 struct ptlrpc_request *req = NULL;
3205 struct obd_export *exp;
3209 LASSERT(inode != NULL);
3211 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3212 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3214 exp = ll_i2mdexp(inode);
3216 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3217 * But under CMD case, it caused some lock issues, should be fixed
3218 * with new CMD ibits lock. See bug 12718 */
3219 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3220 struct lookup_intent oit = { .it_op = IT_GETATTR };
3221 struct md_op_data *op_data;
3223 if (ibits == MDS_INODELOCK_LOOKUP)
3224 oit.it_op = IT_LOOKUP;
3226 /* Call getattr by fid, so do not provide name at all. */
3227 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3228 dentry->d_inode, NULL, 0, 0,
3229 LUSTRE_OPC_ANY, NULL);
3230 if (IS_ERR(op_data))
3231 RETURN(PTR_ERR(op_data));
3233 rc = md_intent_lock(exp, op_data, &oit, &req,
3234 &ll_md_blocking_ast, 0);
3235 ll_finish_md_op_data(op_data);
3237 rc = ll_inode_revalidate_fini(inode, rc);
3241 rc = ll_revalidate_it_finish(req, &oit, dentry);
3243 ll_intent_release(&oit);
3247 /* Unlinked? Unhash dentry, so it is not picked up later by
3248 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3249 here to preserve get_cwd functionality on 2.6.
3251 if (!dentry->d_inode->i_nlink)
3252 d_lustre_invalidate(dentry, 0);
3254 ll_lookup_finish_locks(&oit, dentry);
3255 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3256 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3257 u64 valid = OBD_MD_FLGETATTR;
3258 struct md_op_data *op_data;
/* Regular files: also fetch the striping EA sized per MDS default. */
3261 if (S_ISREG(inode->i_mode)) {
3262 rc = ll_get_default_mdsize(sbi, &ealen);
3265 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3268 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3269 0, ealen, LUSTRE_OPC_ANY,
3271 if (IS_ERR(op_data))
3272 RETURN(PTR_ERR(op_data));
3274 op_data->op_valid = valid;
3275 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3276 * capa for this inode. Because we only keep capas of dirs
3278 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3279 ll_finish_md_op_data(op_data);
3281 rc = ll_inode_revalidate_fini(inode, rc);
3285 rc = ll_prep_inode(&inode, req, NULL, NULL);
3288 ptlrpc_req_finished(req);
/*
 * For striped directories (DNE), merge the attributes of all stripes via
 * md_merge_attr() and apply the aggregate (nlink, blocks, size, times)
 * to the master inode.
 */
3292 static int ll_merge_md_attr(struct inode *inode)
3294 struct cl_attr attr = { 0 };
3297 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3298 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3299 &attr, ll_md_blocking_ast);
3303 set_nlink(inode, attr.cat_nlink);
3304 inode->i_blocks = attr.cat_blocks;
3305 i_size_write(inode, attr.cat_size);
/* Times are cached in lli_* and copied to the inode by the caller. */
3307 ll_i2info(inode)->lli_atime = attr.cat_atime;
3308 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3309 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MD attributes via __ll_inode_revalidate(),
 * then bring the size up to date — merged stripe attributes for striped
 * directories, glimpse from the OSTs for regular files (unless an HSM
 * restore is in progress, in which case the MDT-provided size is current).
 */
3315 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3317 struct inode *inode = dentry->d_inode;
3321 rc = __ll_inode_revalidate(dentry, ibits);
3325 /* if object isn't regular file, don't validate size */
3326 if (!S_ISREG(inode->i_mode)) {
3327 if (S_ISDIR(inode->i_mode) &&
3328 ll_i2info(inode)->lli_lsm_md != NULL) {
3329 rc = ll_merge_md_attr(inode);
/* Propagate the cached MD times into the inode proper. */
3334 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3335 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3336 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3338 /* In case of restore, the MDT has the right size and has
3339 * already send it back without granting the layout lock,
3340 * inode is up-to-date so glimpse is useless.
3341 * Also to glimpse we need the layout, in case of a running
3342 * restore the MDT holds the layout lock so the glimpse will
3343 * block up to the end of restore (getattr will block)
3345 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3346 rc = ll_glimpse_size(inode);
/*
 * ->getattr() handler: revalidate UPDATE|LOOKUP ibits, then fill *stat
 * from the (now fresh) inode fields. Inode numbers are squashed to 32
 * bits when the client requires a 32-bit API.
 */
3351 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3353 struct inode *inode = de->d_inode;
3354 struct ll_sb_info *sbi = ll_i2sbi(inode);
3355 struct ll_inode_info *lli = ll_i2info(inode);
3358 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3359 MDS_INODELOCK_LOOKUP);
3360 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3365 stat->dev = inode->i_sb->s_dev;
3366 if (ll_need_32bit_api(sbi))
3367 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3369 stat->ino = inode->i_ino;
3370 stat->mode = inode->i_mode;
3371 stat->uid = inode->i_uid;
3372 stat->gid = inode->i_gid;
3373 stat->rdev = inode->i_rdev;
3374 stat->atime = inode->i_atime;
3375 stat->mtime = inode->i_mtime;
3376 stat->ctime = inode->i_ctime;
3377 stat->blksize = 1 << inode->i_blkbits;
3379 stat->nlink = inode->i_nlink;
3380 stat->size = i_size_read(inode);
3381 stat->blocks = inode->i_blocks;
/*
 * ->fiemap() handler: marshal fieinfo into a ll_user_fiemap buffer sized
 * for fi_extents_max extents, run ll_do_fiemap(), and copy the mapped
 * extents back to the caller's extent array.
 */
3386 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3387 __u64 start, __u64 len)
3391 struct ll_user_fiemap *fiemap;
3392 unsigned int extent_count = fieinfo->fi_extents_max;
3394 num_bytes = sizeof(*fiemap) + (extent_count *
3395 sizeof(struct ll_fiemap_extent));
3396 OBD_ALLOC_LARGE(fiemap, num_bytes);
3401 fiemap->fm_flags = fieinfo->fi_flags;
3402 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3403 fiemap->fm_start = start;
3404 fiemap->fm_length = len;
/* Seed only the first extent from the caller (continuation support). */
3405 if (extent_count > 0)
3406 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3407 sizeof(struct ll_fiemap_extent));
3409 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3411 fieinfo->fi_flags = fiemap->fm_flags;
3412 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3413 if (extent_count > 0)
3414 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3415 fiemap->fm_mapped_extents *
3416 sizeof(struct ll_fiemap_extent));
3418 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode.
 * The duplicate is taken under lli_lock; the VFS releases the reference.
 */
3422 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3424 struct ll_inode_info *lli = ll_i2info(inode);
3425 struct posix_acl *acl = NULL;
3428 spin_lock(&lli->lli_lock);
3429 /* VFS' acl_permission_check->check_acl will release the refcount */
3430 acl = posix_acl_dup(lli->lli_posix_acl);
3431 spin_unlock(&lli->lli_lock);
/*
 * check_acl callback for generic_permission() on kernels without the
 * 2-arg variant. Evaluates the cached ACL against @mask; bails out under
 * RCU walk (IPERM_FLAG_RCU) since ll_get_acl() may sleep.
 */
3436 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3438 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3439 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3441 ll_check_acl(struct inode *inode, int mask)
3444 # ifdef CONFIG_FS_POSIX_ACL
3445 struct posix_acl *acl;
3449 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3450 if (flags & IPERM_FLAG_RCU)
3453 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3458 rc = posix_acl_permission(inode, acl, mask);
3459 posix_acl_release(acl);
3462 # else /* !CONFIG_FS_POSIX_ACL */
3464 # endif /* CONFIG_FS_POSIX_ACL */
3466 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission() handler (three kernel API variants via #ifdef).
 * Revalidates the root inode on first access, applies root squashing by
 * temporarily overriding the process credentials, and then delegates to
 * remote permission checking or generic_permission()+ACLs.
 */
3468 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3469 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3471 # ifdef HAVE_INODE_PERMISION_2ARGS
3472 int ll_inode_permission(struct inode *inode, int mask)
3474 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3479 struct ll_sb_info *sbi;
3480 struct root_squash_info *squash;
3481 struct cred *cred = NULL;
3482 const struct cred *old_cred = NULL;
3484 bool squash_id = false;
/* This handler may issue RPCs: refuse to run under RCU-walk. */
3487 #ifdef MAY_NOT_BLOCK
3488 if (mask & MAY_NOT_BLOCK)
3490 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3491 if (flags & IPERM_FLAG_RCU)
3495 /* as root inode are NOT getting validated in lookup operation,
3496 * need to do it before permission check. */
3498 if (inode == inode->i_sb->s_root->d_inode) {
3499 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3500 MDS_INODELOCK_LOOKUP);
3505 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3506 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3508 /* squash fsuid/fsgid if needed */
3509 sbi = ll_i2sbi(inode);
3510 squash = &sbi->ll_squash;
3511 if (unlikely(squash->rsi_uid != 0 &&
3512 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3513 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3517 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3518 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3519 squash->rsi_uid, squash->rsi_gid);
3521 /* update current process's credentials
3522 * and FS capability */
3523 cred = prepare_creds();
3527 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3528 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop all filesystem-related capabilities while squashed. */
3529 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3530 if ((1 << cap) & CFS_CAP_FS_MASK)
3531 cap_lower(cred->cap_effective, cap);
3533 old_cred = override_creds(cred);
3536 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3538 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3539 rc = lustre_check_remote_perm(inode, mask);
3541 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3543 /* restore current process's credentials and FS capability */
3545 revert_creds(old_cred);
/* Default file operations (-o localflock): flock/posix locks are only
 * locally consistent — no .flock/.lock methods, so the VFS falls back to
 * its local implementations. */
3552 /* -o localflock - only provides locally consistent flock locks */
3553 struct file_operations ll_file_operations = {
3554 .read = ll_file_read,
3555 .aio_read = ll_file_aio_read,
3556 .write = ll_file_write,
3557 .aio_write = ll_file_aio_write,
3558 .unlocked_ioctl = ll_file_ioctl,
3559 .open = ll_file_open,
3560 .release = ll_file_release,
3561 .mmap = ll_file_mmap,
3562 .llseek = ll_file_seek,
3563 .splice_read = ll_file_splice_read,
/* File operations with cluster-coherent locking (-o flock): both flock()
 * and fcntl() locks go through ll_file_flock() to the MDT. */
3568 struct file_operations ll_file_operations_flock = {
3569 .read = ll_file_read,
3570 .aio_read = ll_file_aio_read,
3571 .write = ll_file_write,
3572 .aio_write = ll_file_aio_write,
3573 .unlocked_ioctl = ll_file_ioctl,
3574 .open = ll_file_open,
3575 .release = ll_file_release,
3576 .mmap = ll_file_mmap,
3577 .llseek = ll_file_seek,
3578 .splice_read = ll_file_splice_read,
3581 .flock = ll_file_flock,
3582 .lock = ll_file_flock
/* File operations for -o noflock: lock requests fail outright via
 * ll_file_noflock() instead of being handled locally or remotely. */
3585 /* These are for -o noflock - to return ENOSYS on flock calls */
3586 struct file_operations ll_file_operations_noflock = {
3587 .read = ll_file_read,
3588 .aio_read = ll_file_aio_read,
3589 .write = ll_file_write,
3590 .aio_write = ll_file_aio_write,
3591 .unlocked_ioctl = ll_file_ioctl,
3592 .open = ll_file_open,
3593 .release = ll_file_release,
3594 .mmap = ll_file_mmap,
3595 .llseek = ll_file_seek,
3596 .splice_read = ll_file_splice_read,
3599 .flock = ll_file_noflock,
3600 .lock = ll_file_noflock
/* Inode operations for regular files. */
3603 struct inode_operations ll_file_inode_operations = {
3604 .setattr = ll_setattr,
3605 .getattr = ll_getattr,
3606 .permission = ll_inode_permission,
3607 .setxattr = ll_setxattr,
3608 .getxattr = ll_getxattr,
3609 .listxattr = ll_listxattr,
3610 .removexattr = ll_removexattr,
3611 .fiemap = ll_fiemap,
3612 #ifdef HAVE_IOP_GET_ACL
3613 .get_acl = ll_get_acl,
/* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by a rw_semaphore (readers dispatch,
 * writers register/unregister). */
3618 static struct llioc_ctl_data {
3619 struct rw_semaphore ioc_sem;
3620 struct list_head ioc_head;
3622 __RWSEM_INITIALIZER(llioc.ioc_sem),
3623 LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback plus the ioctl command numbers it serves
 * (iocd_cmd is a flexible-style trailing array of iocd_count entries). */
3628 struct list_head iocd_list;
3629 unsigned int iocd_size;
3630 llioc_callback_t iocd_cb;
3631 unsigned int iocd_count;
3632 unsigned int iocd_cmd[0];
/*
 * Register callback @cb for @count ioctl commands in @cmd.
 * Returns an opaque handle (the llioc_data pointer) used as the "magic"
 * for ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure.
 */
3635 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3638 struct llioc_data *in_data = NULL;
3641 if (cb == NULL || cmd == NULL ||
3642 count > LLIOC_MAX_CMD || count < 0)
3645 size = sizeof(*in_data) + count * sizeof(unsigned int);
3646 OBD_ALLOC(in_data, size);
3647 if (in_data == NULL)
/* NOTE(review): OBD_ALLOC normally zeroes the buffer already, making
 * this memset redundant — confirm against the OBD_ALLOC definition. */
3650 memset(in_data, 0, sizeof(*in_data));
3651 in_data->iocd_size = size;
3652 in_data->iocd_cb = cb;
3653 in_data->iocd_count = count;
3654 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3656 down_write(&llioc.ioc_sem);
3657 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3658 up_write(&llioc.ioc_sem);
/*
 * Remove and free the registration identified by @magic (the handle
 * returned by ll_iocontrol_register()). Logs a warning if not found.
 */
3663 void ll_iocontrol_unregister(void *magic)
3665 struct llioc_data *tmp;
3670 down_write(&llioc.ioc_sem);
3671 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3673 unsigned int size = tmp->iocd_size;
/* Drop the semaphore before freeing; the entry is already unlinked. */
3675 list_del(&tmp->iocd_list);
3676 up_write(&llioc.ioc_sem);
3678 OBD_FREE(tmp, size);
3682 up_write(&llioc.ioc_sem);
3684 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3687 EXPORT_SYMBOL(ll_iocontrol_register);
3688 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch ioctl @cmd through the dynamic registry: call each registered
 * callback whose command list contains @cmd until one returns LLIOC_STOP.
 * The callback's result code is returned through *rcp.
 */
3690 static enum llioc_iter
3691 ll_iocontrol_call(struct inode *inode, struct file *file,
3692 unsigned int cmd, unsigned long arg, int *rcp)
3694 enum llioc_iter ret = LLIOC_CONT;
3695 struct llioc_data *data;
3696 int rc = -EINVAL, i;
3698 down_read(&llioc.ioc_sem);
3699 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3700 for (i = 0; i < data->iocd_count; i++) {
3701 if (cmd != data->iocd_cmd[i])
3704 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3708 if (ret == LLIOC_STOP)
3711 up_read(&llioc.ioc_sem);
/*
 * Apply a layout configuration to the inode's cl_object.
 * On OBJECT_CONF_SET with success, allow the layout lock to be matched
 * and record the new layout generation.
 */
3718 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3720 struct ll_inode_info *lli = ll_i2info(inode);
3721 struct cl_env_nest nest;
3726 if (lli->lli_clob == NULL)
3729 env = cl_env_nested_get(&nest);
3731 RETURN(PTR_ERR(env));
3733 result = cl_conf_set(env, lli->lli_clob, conf);
3734 cl_env_nested_put(&nest, env);
3736 if (conf->coc_opc == OBJECT_CONF_SET) {
3737 struct ldlm_lock *lock = conf->coc_lock;
3739 LASSERT(lock != NULL);
3740 LASSERT(ldlm_has_layout(lock));
3742 struct lustre_md *md = conf->u.coc_md;
3743 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3745 /* it can only be allowed to match after layout is
3746 * applied to inode otherwise false layout would be
3747 * seen. Applying layout should happen before dropping
3748 * the intent lock. */
3749 ldlm_lock_allow_match(lock);
3751 lli->lli_has_smd = lsm_has_objects(md->lsm);
3752 if (md->lsm != NULL)
3753 gen = md->lsm->lsm_layout_gen;
3756 DFID ": layout version change: %u -> %u\n",
3757 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3759 ll_layout_version_set(lli, gen);
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * When the layout lock was granted via completion AST the LVB reply
 * buffer was too small to carry the layout, so fetch the LOV EA with a
 * getxattr RPC and attach it to the lock as l_lvb_data.
 */
3766 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3769 struct ll_sb_info *sbi = ll_i2sbi(inode);
3770 struct obd_capa *oc;
3771 struct ptlrpc_request *req;
3772 struct mdt_body *body;
3779 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3780 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3781 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already attached to the lock: nothing to fetch. */
3783 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3786 /* if layout lock was granted right away, the layout is returned
3787 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3788 * blocked and then granted via completion ast, we have to fetch
3789 * layout here. Please note that we can't use the LVB buffer in
3790 * completion AST because it doesn't have a large enough buffer */
3791 oc = ll_mdscapa_get(inode);
3792 rc = ll_get_default_mdsize(sbi, &lmmsize);
3794 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3795 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3801 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3803 GOTO(out, rc = -EPROTO);
3805 lmmsize = body->mbo_eadatasize;
3806 if (lmmsize == 0) /* empty layout */
3809 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3811 GOTO(out, rc = -EFAULT);
3813 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3814 if (lvbdata == NULL)
3815 GOTO(out, rc = -ENOMEM);
/* Swap the fetched layout into the lock's LVB under the resource lock,
 * freeing any stale buffer that was attached before. */
3817 memcpy(lvbdata, lmm, lmmsize);
3818 lock_res_and_lock(lock);
3819 if (lock->l_lvb_data != NULL)
3820 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3822 lock->l_lvb_data = lvbdata;
3823 lock->l_lvb_len = lmmsize;
3824 unlock_res_and_lock(lock);
3829 ptlrpc_req_finished(req);
3834 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Unpack the layout carried in the lock's LVB, configure the inode's
 * cl_object with it, record the new layout generation in *gen, and drop
 * the lock reference. If reconfiguration races with in-flight IO
 * (-EBUSY), wait for the IO to drain via OBJECT_CONF_WAIT.
 */
3837 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3838 struct inode *inode, __u32 *gen, bool reconf)
3840 struct ll_inode_info *lli = ll_i2info(inode);
3841 struct ll_sb_info *sbi = ll_i2sbi(inode);
3842 struct ldlm_lock *lock;
3843 struct lustre_md md = { NULL };
3844 struct cl_object_conf conf;
3847 bool wait_layout = false;
3850 LASSERT(lustre_handle_is_used(lockh));
3852 lock = ldlm_handle2lock(lockh);
3853 LASSERT(lock != NULL);
3854 LASSERT(ldlm_has_layout(lock));
3856 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3857 PFID(&lli->lli_fid), inode, reconf);
3859 /* in case this is a caching lock and reinstate with new inode */
3860 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3862 lock_res_and_lock(lock);
3863 lvb_ready = ldlm_is_lvb_ready(lock);
3864 unlock_res_and_lock(lock);
3865 /* checking lvb_ready is racy but this is okay. The worst case is
3866 * that multi processes may configure the file on the same time. */
3868 if (lvb_ready || !reconf) {
3871 /* layout_gen must be valid if layout lock is not
3872 * cancelled and stripe has already set */
3873 *gen = ll_layout_version_get(lli);
/* Make sure the layout EA is attached to the lock's LVB. */
3879 rc = ll_layout_fetch(inode, lock);
3883 /* for layout lock, lmm is returned in lock's lvb.
3884 * lvb_data is immutable if the lock is held so it's safe to access it
3885 * without res lock. See the description in ldlm_lock_decref_internal()
3886 * for the condition to free lvb_data of layout lock */
3887 if (lock->l_lvb_data != NULL) {
3888 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3889 lock->l_lvb_data, lock->l_lvb_len);
3891 *gen = LL_LAYOUT_GEN_EMPTY;
3893 *gen = md.lsm->lsm_layout_gen;
3896 CERROR("%s: file "DFID" unpackmd error: %d\n",
3897 ll_get_fsname(inode->i_sb, NULL, 0),
3898 PFID(&lli->lli_fid), rc);
3904 /* set layout to file. Unlikely this will fail as old layout was
3905 * surely eliminated */
3906 memset(&conf, 0, sizeof conf);
3907 conf.coc_opc = OBJECT_CONF_SET;
3908 conf.coc_inode = inode;
3909 conf.coc_lock = lock;
3910 conf.u.coc_md = &md;
3911 rc = ll_layout_conf(inode, &conf);
3914 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3916 /* refresh layout failed, need to wait */
3917 wait_layout = rc == -EBUSY;
3921 LDLM_LOCK_PUT(lock);
3922 ldlm_lock_decref(lockh, mode);
3924 /* wait for IO to complete if it's still being used. */
3926 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3927 ll_get_fsname(inode->i_sb, NULL, 0),
3928 PFID(&lli->lli_fid), inode);
3930 memset(&conf, 0, sizeof conf);
3931 conf.coc_opc = OBJECT_CONF_WAIT;
3932 conf.coc_inode = inode;
3933 rc = ll_layout_conf(inode, &conf);
3937 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3938 ll_get_fsname(inode->i_sb, NULL, 0),
3939 PFID(&lli->lli_fid), rc);
3945 * This function checks if there exists a LAYOUT lock on the client side,
3946 * or enqueues it if it doesn't have one in cache.
3948 * This function will not hold layout lock so it may be revoked any time after
3949 * this function returns. Any operation that depends on the layout should be redone
3952 * This function should be called before lov_io_init() to get an uptodate
3953 * layout version, the caller should save the version number and after IO
3954 * is finished, this function should be called again to verify that layout
3955 * is not changed during IO time.
/*
 * Ensure the client holds an up-to-date layout for @inode and return its
 * generation in *gen. First tries to match a cached layout lock; if none,
 * enqueues an IT_LAYOUT intent under lli_layout_mutex (serializing
 * enqueuers) and applies the result via ll_layout_lock_set().
 */
3957 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3959 struct ll_inode_info *lli = ll_i2info(inode);
3960 struct ll_sb_info *sbi = ll_i2sbi(inode);
3961 struct md_op_data *op_data;
3962 struct lookup_intent it;
3963 struct lustre_handle lockh;
3965 struct ldlm_enqueue_info einfo = {
3966 .ei_type = LDLM_IBITS,
3968 .ei_cb_bl = &ll_md_blocking_ast,
3969 .ei_cb_cp = &ldlm_completion_ast,
/* Fast path: layout generation already known, or server lacks
 * layout-lock support. */
3974 *gen = ll_layout_version_get(lli);
3975 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3979 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3980 LASSERT(S_ISREG(inode->i_mode));
3982 /* take layout lock mutex to enqueue layout lock exclusively. */
3983 mutex_lock(&lli->lli_layout_mutex);
3986 /* mostly layout lock is caching on the local side, so try to match
3987 * it before grabbing layout lock mutex. */
3988 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3989 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3990 if (mode != 0) { /* hit cached lock */
3991 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3995 mutex_unlock(&lli->lli_layout_mutex);
3999 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4000 0, 0, LUSTRE_OPC_ANY, NULL);
4001 if (IS_ERR(op_data)) {
4002 mutex_unlock(&lli->lli_layout_mutex);
4003 RETURN(PTR_ERR(op_data));
4006 /* have to enqueue one */
4007 memset(&it, 0, sizeof(it));
4008 it.it_op = IT_LAYOUT;
4009 lockh.cookie = 0ULL;
4011 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4012 ll_get_fsname(inode->i_sb, NULL, 0),
4013 PFID(&lli->lli_fid), inode);
4015 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* The intent reply request is not needed once the lock is granted. */
4016 if (it.d.lustre.it_data != NULL)
4017 ptlrpc_req_finished(it.d.lustre.it_data);
4018 it.d.lustre.it_data = NULL;
4020 ll_finish_md_op_data(op_data);
/* Take over the lock reference from the intent before dropping it. */
4022 mode = it.d.lustre.it_lock_mode;
4023 it.d.lustre.it_lock_mode = 0;
4024 ll_intent_drop_lock(&it);
4027 /* set lock data in case this is a new lock */
4028 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4029 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4033 mutex_unlock(&lli->lli_layout_mutex);
4039 * This function sends a restore request to the MDT
4041 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4043 struct hsm_user_request *hur;
4047 len = sizeof(struct hsm_user_request) +
4048 sizeof(struct hsm_user_item);
4049 OBD_ALLOC(hur, len);
4053 hur->hur_request.hr_action = HUA_RESTORE;
4054 hur->hur_request.hr_archive_id = 0;
4055 hur->hur_request.hr_flags = 0;
4056 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4057 sizeof(hur->hur_user_item[0].hui_fid));
4058 hur->hur_user_item[0].hui_extent.offset = offset;
4059 hur->hur_user_item[0].hui_extent.length = length;
4060 hur->hur_request.hr_itemcount = 1;
4061 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,