4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
52 #include "cl_object.h"
55 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
57 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
60 static enum llioc_iter
61 ll_iocontrol_call(struct inode *inode, struct file *file,
62 unsigned int cmd, unsigned long arg, int *rcp);
64 static struct ll_file_data *ll_file_data_get(void)
/* Allocate a per-open ll_file_data from its dedicated slab cache.
 * GFP_NOFS: allocation happens on the open path, so reclaim must not
 * re-enter the filesystem. fd_write_failed starts clear; it is set by
 * the write path on error (see ll_file_io_generic).
 * NOTE(review): this view of the source is truncated; the NULL check
 * and return statement are not visible here. */
66 	struct ll_file_data *fd;
68 	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
72 	fd->fd_write_failed = false;
77 static void ll_file_data_put(struct ll_file_data *fd)
/* Return a ll_file_data previously obtained from ll_file_data_get()
 * to the slab cache. */
80 	OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
83 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
84 			  struct lustre_handle *fh)
/* Copy the inode's current attributes (mode, times, size, blocks, flags,
 * ioepoch) plus the open handle @fh and an MDS capability into @op_data,
 * ready for an MDS request. If local data modifications are pending
 * (LLIF_DATA_MODIFIED), request the MDS_DATA_MODIFIED bias so the server
 * learns the data changed. */
86 	op_data->op_fid1 = ll_i2info(inode)->lli_fid;
87 	op_data->op_attr.ia_mode = inode->i_mode;
88 	op_data->op_attr.ia_atime = inode->i_atime;
89 	op_data->op_attr.ia_mtime = inode->i_mtime;
90 	op_data->op_attr.ia_ctime = inode->i_ctime;
91 	op_data->op_attr.ia_size = i_size_read(inode);
92 	op_data->op_attr_blocks = inode->i_blocks;
93 	op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
94 	op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
96 	op_data->op_handle = *fh;
97 	op_data->op_capa1 = ll_mdscapa_get(inode);
99 	if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
100 		op_data->op_bias |= MDS_DATA_MODIFIED;
104  * Closes the IO epoch and packs all the attributes into @op_data for
107 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
108 			     struct obd_client_handle *och)
/* Prepare @op_data for an MDS close RPC for open handle @och:
 * - always send mode/atime/mtime/ctime;
 * - for write opens on regular files, size/blocks are only sent when the
 *   server does NOT do Size-on-MDS (otherwise SOM handles them);
 * - close the IO epoch and pack the inode attributes + handle. */
112 	op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
113 					ATTR_MTIME | ATTR_MTIME_SET |
114 					ATTR_CTIME | ATTR_CTIME_SET;
116 	if (!(och->och_flags & FMODE_WRITE))
119 	if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
120 		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
122 	ll_ioepoch_close(inode, op_data, &och, 0);
125 	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
126 	ll_prep_md_op_data(op_data, inode, NULL, NULL,
127 			   0, 0, LUSTRE_OPC_ANY, NULL);
131 static int ll_close_inode_openhandle(struct obd_export *md_exp,
133 				     struct obd_client_handle *och,
134 				     const __u64 *data_version)
/* Send a close RPC to the MDS for open handle @och and perform the
 * follow-up work: optional Size-on-MDS update, clearing of the local
 * DATA_MODIFIED flag, OST object destruction for the last unlink, and
 * HSM-release result checking. Passing a non-NULL @data_version turns
 * the close into an HSM release (MDS_HSM_RELEASE bias).
 * On exit the och cookie is poisoned with DEAD_HANDLE_MAGIC and replay
 * data is cleared; @och itself is freed unless a DONE_WRITING is still
 * pending (see comment at line 222).
 * NOTE(review): this view of the source is truncated — error branches
 * and several statements between the visible lines are missing. */
136 	struct obd_export *exp = ll_i2mdexp(inode);
137 	struct md_op_data *op_data;
138 	struct ptlrpc_request *req = NULL;
139 	struct obd_device *obd = class_exp2obd(exp);
146 	 * XXX: in case of LMV, is this correct to access
149 		CERROR("Invalid MDC connection handle "LPX64"\n",
150 		       ll_i2mdexp(inode)->exp_handle.h_cookie);
154 	OBD_ALLOC_PTR(op_data);
156 		GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
158 	ll_prepare_close(inode, op_data, och);
159 	if (data_version != NULL) {
160 		/* Pass in data_version implies release. */
161 		op_data->op_bias |= MDS_HSM_RELEASE;
162 		op_data->op_data_version = *data_version;
163 		op_data->op_lease_handle = och->och_lease_handle;
164 		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
166 	epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
167 	rc = md_close(md_exp, op_data, och->och_mod, &req);
169 		/* This close must have the epoch closed. */
170 		LASSERT(epoch_close);
171 		/* MDS has instructed us to obtain Size-on-MDS attribute from
172 		 * OSTs and send setattr to back to MDS. */
173 		rc = ll_som_update(inode, op_data);
175 			CERROR("%s: inode "DFID" mdc Size-on-MDS update"
176 			       " failed: rc = %d\n",
177 			       ll_i2mdexp(inode)->exp_obd->obd_name,
178 			       PFID(ll_inode2fid(inode)), rc);
182 		CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
183 		       ll_i2mdexp(inode)->exp_obd->obd_name,
184 		       PFID(ll_inode2fid(inode)), rc);
187 	/* DATA_MODIFIED flag was successfully sent on close, cancel data
188 	 * modification flag. */
189 	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
190 		struct ll_inode_info *lli = ll_i2info(inode);
		/* lli_flags is shared state; update under lli_lock. */
192 		spin_lock(&lli->lli_lock);
193 		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
194 		spin_unlock(&lli->lli_lock);
198 		rc = ll_objects_destroy(req, inode);
200 			CERROR("%s: inode "DFID
201 			       " ll_objects destroy: rc = %d\n",
202 			       ll_i2mdexp(inode)->exp_obd->obd_name,
203 			       PFID(ll_inode2fid(inode)), rc);
206 	if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
207 		struct mdt_body *body;
		/* Check the server actually released the file. */
208 		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
209 		if (!(body->mbo_valid & OBD_MD_FLRELEASED))
213 	ll_finish_md_op_data(op_data);
	/* SOM + open epoch still pending on a regular write open: queue a
	 * DONE_WRITING instead of freeing the handle now. */
217 	if (exp_connect_som(exp) && !epoch_close &&
218 	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
219 		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
221 		md_clear_open_replay_data(md_exp, och);
222 		/* Free @och if it is not waiting for DONE_WRITING. */
223 		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
226 	if (req) /* This is close request */
227 		ptlrpc_req_finished(req);
231 int ll_md_real_close(struct inode *inode, fmode_t fmode)
/* Close the per-inode MDS open handle matching @fmode (write, exec or
 * read — exactly one class per call; FMODE_READ is asserted as the
 * fallback). The handle is only torn down when its use count has gone
 * to zero; otherwise the call is a no-op for this opener. The och
 * pointer is snapshotted under lli_och_mutex, then the close RPC is
 * issued outside the mutex (hence the "race" comment at line 265).
 * NOTE(review): truncated view — unlock/snapshot statements between
 * the visible lines are missing. */
233 	struct ll_inode_info *lli = ll_i2info(inode);
234 	struct obd_client_handle **och_p;
235 	struct obd_client_handle *och;
240 	if (fmode & FMODE_WRITE) {
241 		och_p = &lli->lli_mds_write_och;
242 		och_usecount = &lli->lli_open_fd_write_count;
243 	} else if (fmode & FMODE_EXEC) {
244 		och_p = &lli->lli_mds_exec_och;
245 		och_usecount = &lli->lli_open_fd_exec_count;
247 		LASSERT(fmode & FMODE_READ);
248 		och_p = &lli->lli_mds_read_och;
249 		och_usecount = &lli->lli_open_fd_read_count;
252 	mutex_lock(&lli->lli_och_mutex);
253 	if (*och_usecount > 0) {
254 		/* There are still users of this handle, so skip
256 		mutex_unlock(&lli->lli_och_mutex);
262 	mutex_unlock(&lli->lli_och_mutex);
265 	/* There might be a race and this handle may already
267 		rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
274 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
/* Per-file-descriptor close: release group lock and lease if the
 * application left them behind, close any private och (fd_och), drop
 * this fd's reference on the shared MDS open handle, and — only if no
 * matching OPEN lock is cached (md_lock_match with LDLM_FL_TEST_LOCK)
 * — do the real MDS close via ll_md_real_close(). Finally detach and
 * free the ll_file_data and close capabilities.
 * NOTE(review): truncated view; some branches/returns are missing. */
277 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
278 	struct ll_inode_info *lli = ll_i2info(inode);
282 	/* clear group lock, if present */
283 	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
284 		ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
286 	if (fd->fd_lease_och != NULL) {
289 		/* Usually the lease is not released when the
290 		 * application crashed, we need to release here. */
291 		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
292 		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
293 		       PFID(&lli->lli_fid), rc, lease_broken);
295 		fd->fd_lease_och = NULL;
298 	if (fd->fd_och != NULL) {
299 		rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
304 	/* Let's see if we have good enough OPEN lock on the file and if
305 	   we can skip talking to MDS */
306 	if (file->f_dentry->d_inode) { /* Can this ever be false? */
308 		__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
309 		struct lustre_handle lockh;
310 		struct inode *inode = file->f_dentry->d_inode;
311 		ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
		/* Drop this fd's reference on the open-mode usecount under
		 * the och mutex; counts must not underflow. */
313 		mutex_lock(&lli->lli_och_mutex);
314 		if (fd->fd_omode & FMODE_WRITE) {
316 			LASSERT(lli->lli_open_fd_write_count);
317 			lli->lli_open_fd_write_count--;
318 		} else if (fd->fd_omode & FMODE_EXEC) {
320 			LASSERT(lli->lli_open_fd_exec_count);
321 			lli->lli_open_fd_exec_count--;
324 			LASSERT(lli->lli_open_fd_read_count);
325 			lli->lli_open_fd_read_count--;
327 		mutex_unlock(&lli->lli_och_mutex);
		/* No cached OPEN lock -> must do the real close RPC. */
329 		if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
330 				   LDLM_IBITS, &policy, lockmode,
332 			rc = ll_md_real_close(file->f_dentry->d_inode,
336 		CERROR("released file has negative dentry: file = %p, "
337 		       "dentry = %p, name = %s\n",
338 		       file, file->f_dentry, file->f_dentry->d_name.name);
342 	LUSTRE_FPRIVATE(file) = NULL;
343 	ll_file_data_put(fd);
344 	ll_capa_close(inode);
349 /* While this returns an error code, fput() the caller does not, so we need
350  * to make every effort to clean up all of our state here. Also, applications
351  * rarely check close errors and even if an error is returned they will not
352  * re-try the close call.
354 int ll_file_release(struct inode *inode, struct file *file)
/* VFS ->release() hook. Cleans up remote-ACL session state for the root
 * inode, deauthorizes statahead for directories, clears async write
 * errors on regular files, and finally calls ll_md_close(). The root
 * dentry is special-cased: it has no MDS open handle to close, so only
 * the ll_file_data is freed.
 * NOTE(review): truncated view; return paths are not visible. */
356 	struct ll_file_data *fd;
357 	struct ll_sb_info *sbi = ll_i2sbi(inode);
358 	struct ll_inode_info *lli = ll_i2info(inode);
362 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
363 	       PFID(ll_inode2fid(inode)), inode);
365 #ifdef CONFIG_FS_POSIX_ACL
	/* Remote-client ACL mode: tear down the per-process remote ACL
	 * tables when the root inode is released. */
366 	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
367 	    inode == inode->i_sb->s_root->d_inode) {
368 		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
371 		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
372 			fd->fd_flags &= ~LL_FILE_RMTACL;
373 			rct_del(&sbi->ll_rct, current_pid());
374 			et_search_free(&sbi->ll_et, current_pid());
379 	if (inode->i_sb->s_root != file->f_dentry)
380 		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
381 	fd = LUSTRE_FPRIVATE(file);
384 	/* The last ref on @file, maybe not the the owner pid of statahead,
385 	 * because parent and child process can share the same file handle. */
386 	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
387 		ll_deauthorize_statahead(inode, fd);
389 	if (inode->i_sb->s_root == file->f_dentry) {
390 		LUSTRE_FPRIVATE(file) = NULL;
391 		ll_file_data_put(fd);
	/* Propagate any deferred async write error to this close. */
395 	if (!S_ISDIR(inode->i_mode)) {
396 		if (lli->lli_clob != NULL)
397 			lov_read_and_clear_async_rc(lli->lli_clob);
398 		lli->lli_async_rc = 0;
401 	rc = ll_md_close(sbi->ll_md_exp, inode, file);
403 	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
404 		libcfs_debug_dumplog();
409 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
410 				struct lookup_intent *itp)
/* Issue an intent-OPEN enqueue to the MDS for @file, optionally packing
 * striping data (@lmm/@lmmsize). The dentry name is only packed when the
 * server lacks OBD_CONNECT_OPEN_BY_FID and the name is valid; the caller
 * must have set MDS_OPEN_BY_FID in itp->it_flags (asserted). On success
 * the inode is (re)initialized from the reply and lock data is attached.
 * -ESTALE-ish outcomes take a separate exit path to avoid log flooding.
 * NOTE(review): truncated view; error/exit labels are not all visible. */
412 	struct dentry *de = file->f_dentry;
413 	struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
414 	struct dentry *parent = de->d_parent;
415 	const char *name = NULL;
417 	struct md_op_data *op_data;
418 	struct ptlrpc_request *req = NULL;
422 	LASSERT(parent != NULL);
423 	LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
425 	/* if server supports open-by-fid, or file name is invalid, don't pack
426 	 * name in open request */
427 	if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
428 	    lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
429 		name = de->d_name.name;
430 		len = de->d_name.len;
433 	op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
434 				     name, len, 0, LUSTRE_OPC_ANY, NULL);
436 		RETURN(PTR_ERR(op_data));
437 	op_data->op_data = lmm;
438 	op_data->op_data_size = lmmsize;
440 	rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
441 			    &ll_md_blocking_ast, 0);
442 	ll_finish_md_op_data(op_data);
444 		/* reason for keep own exit path - don`t flood log
445 		 * with messages with -ESTALE errors.
447 		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
448 		    it_open_error(DISP_OPEN_OPEN, itp))
450 		ll_release_openhandle(de, itp);
454 	if (it_disposition(itp, DISP_LOOKUP_NEG))
455 		GOTO(out, rc = -ENOENT);
457 	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
458 		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
459 		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
463 	rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
464 	if (!rc && itp->d.lustre.it_lock_mode)
465 		ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
468 	ptlrpc_req_finished(req);
469 	ll_intent_drop_lock(itp);
475  * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
476  * not believe attributes if a few ioepoch holders exist. Attributes for
477  * previous ioepoch if new one is opened are also skipped by MDS.
479 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Record a newly granted IO epoch on the inode; a zero or unchanged
 * epoch is ignored. */
481 	if (ioepoch && lli->lli_ioepoch != ioepoch) {
482 		lli->lli_ioepoch = ioepoch;
483 		CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
484 		       ioepoch, PFID(&lli->lli_fid));
488 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
489 		       struct obd_client_handle *och)
/* Populate an obd_client_handle from an intent-open reply: server file
 * handle, fid, (potential) lease lock handle, magic and the open flags
 * from the intent; then register the open for replay on recovery.
 * Returns the result of md_set_open_replay_data(). */
491 	struct ptlrpc_request *req = it->d.lustre.it_data;
492 	struct mdt_body *body;
494 	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
495 	och->och_fh = body->mbo_handle;
496 	och->och_fid = body->mbo_fid1;
497 	och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
498 	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
499 	och->och_flags = it->it_flags;
501 	return md_set_open_replay_data(md_exp, och, it);
504 static int ll_local_open(struct file *file, struct lookup_intent *it,
505 			 struct ll_file_data *fd, struct obd_client_handle *och)
/* Attach client-side open state to @file: when a new MDS handle @och is
 * supplied, fill it from the intent reply and record the granted IO
 * epoch; then install @fd as the file's private data, init readahead
 * and the ll_cl_context bookkeeping, and remember the effective open
 * mode. Asserts the file has no private data yet.
 * NOTE(review): truncated view — the och!=NULL guard around the fill
 * block is implied but not visible. */
507 	struct inode *inode = file->f_dentry->d_inode;
508 	struct ll_inode_info *lli = ll_i2info(inode);
511 	LASSERT(!LUSTRE_FPRIVATE(file));
516 		struct ptlrpc_request *req = it->d.lustre.it_data;
517 		struct mdt_body *body;
520 		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
524 		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
525 		ll_ioepoch_open(lli, body->mbo_ioepoch);
528 	LUSTRE_FPRIVATE(file) = fd;
529 	ll_readahead_init(inode, &fd->fd_ras);
530 	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
532 	/* ll_cl_context initialize */
533 	rwlock_init(&fd->fd_lock);
534 	INIT_LIST_HEAD(&fd->fd_lccs);
539 /* Open a file, and (for the very first open) create objects on the OSTs at
540  * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
541  * creation or open until ll_lov_setstripe() ioctl is called.
543  * If we already have the stripe MD locally then we don't request it in
544  * md_open(), by passing a lmm_size = 0.
546  * It is up to the application to ensure no other processes open this file
547  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
548  * used. We might be able to avoid races of that sort by getting lli_open_sem
549  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
550  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
552 int ll_file_open(struct inode *inode, struct file *file)
/* VFS ->open() hook. Either reuses an intent stashed in
 * file->private_data by the lookup path, or builds a fresh IT_OPEN
 * intent (oit) from f_flags. Shares one MDS open handle per inode per
 * open class (read/write/exec) guarded by lli_och_mutex; only the first
 * opener of a class does the MDS RPC (ll_intent_file_open). Also
 * handles statahead authorization for directories, the root-dentry
 * fast path, and O_LOV_DELAY_CREATE (see header comment above).
 * NOTE(review): truncated view — error labels (out_openerr/out_och_free)
 * and several returns/braces are not visible here. */
554 	struct ll_inode_info *lli = ll_i2info(inode);
555 	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
556 					  .it_flags = file->f_flags };
557 	struct obd_client_handle **och_p = NULL;
558 	__u64 *och_usecount = NULL;
559 	struct ll_file_data *fd;
563 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
564 	       PFID(ll_inode2fid(inode)), inode, file->f_flags);
566 	it = file->private_data; /* XXX: compat macro */
567 	file->private_data = NULL; /* prevent ll_local_open assertion */
569 	fd = ll_file_data_get();
571 		GOTO(out_openerr, rc = -ENOMEM);
574 	if (S_ISDIR(inode->i_mode))
575 		ll_authorize_statahead(inode, fd);
	/* Root dentry: no MDS open handle needed, just attach fd. */
577 	if (inode->i_sb->s_root == file->f_dentry) {
578 		LUSTRE_FPRIVATE(file) = fd;
	/* No usable intent from lookup -> synthesize one from f_flags. */
582 	if (!it || !it->d.lustre.it_disposition) {
583 		/* Convert f_flags into access mode. We cannot use file->f_mode,
584 		 * because everything but O_ACCMODE mask was stripped from
586 		if ((oit.it_flags + 1) & O_ACCMODE)
588 		if (file->f_flags & O_TRUNC)
589 			oit.it_flags |= FMODE_WRITE;
591 		/* kernel only call f_op->open in dentry_open. filp_open calls
592 		 * dentry_open after call to open_namei that checks permissions.
593 		 * Only nfsd_open call dentry_open directly without checking
594 		 * permissions and because of that this code below is safe. */
595 		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
596 			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
598 		/* We do not want O_EXCL here, presumably we opened the file
599 		 * already? XXX - NFS implications? */
600 		oit.it_flags &= ~O_EXCL;
602 		/* bug20584, if "it_flags" contains O_CREAT, the file will be
603 		 * created if necessary, then "IT_CREAT" should be set to keep
604 		 * consistent with it */
605 		if (oit.it_flags & O_CREAT)
606 			oit.it_op |= IT_CREAT;
612 	/* Let's see if we have file open on MDS already. */
613 	if (it->it_flags & FMODE_WRITE) {
614 		och_p = &lli->lli_mds_write_och;
615 		och_usecount = &lli->lli_open_fd_write_count;
616 	} else if (it->it_flags & FMODE_EXEC) {
617 		och_p = &lli->lli_mds_exec_och;
618 		och_usecount = &lli->lli_open_fd_exec_count;
620 		och_p = &lli->lli_mds_read_och;
621 		och_usecount = &lli->lli_open_fd_read_count;
624 	mutex_lock(&lli->lli_och_mutex);
625 	if (*och_p) { /* Open handle is present */
626 		if (it_disposition(it, DISP_OPEN_OPEN)) {
627 			/* Well, there's extra open request that we do not need,
628 			   let's close it somehow. This will decref request. */
629 			rc = it_open_error(DISP_OPEN_OPEN, it);
631 				mutex_unlock(&lli->lli_och_mutex);
632 				GOTO(out_openerr, rc);
635 			ll_release_openhandle(file->f_dentry, it);
639 		rc = ll_local_open(file, it, fd, NULL);
642 			mutex_unlock(&lli->lli_och_mutex);
643 			GOTO(out_openerr, rc);
646 		LASSERT(*och_usecount == 0);
647 		if (!it->d.lustre.it_disposition) {
648 			/* We cannot just request lock handle now, new ELC code
649 			   means that one of other OPEN locks for this file
650 			   could be cancelled, and since blocking ast handler
651 			   would attempt to grab och_mutex as well, that would
652 			   result in a deadlock */
653 			mutex_unlock(&lli->lli_och_mutex);
655 			 * Normally called under two situations:
657 			 * 2. A race/condition on MDS resulting in no open
658 			 *    handle to be returned from LOOKUP|OPEN request,
659 			 *    for example if the target entry was a symlink.
661 			 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
663 			 * Always specify MDS_OPEN_BY_FID because we don't want
664 			 * to get file with different fid.
666 			it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
667 			rc = ll_intent_file_open(file, NULL, 0, it);
669 				GOTO(out_openerr, rc);
673 		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
675 			GOTO(out_och_free, rc = -ENOMEM);
679 		/* md_intent_lock() didn't get a request ref if there was an
680 		 * open error, so don't do cleanup on the request here
682 		/* XXX (green): Should not we bail out on any error here, not
683 		 * just open error? */
684 		rc = it_open_error(DISP_OPEN_OPEN, it);
686 			GOTO(out_och_free, rc);
688 		LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
689 			 "inode %p: disposition %x, status %d\n", inode,
690 			 it_disposition(it, ~0), it->d.lustre.it_status);
692 		rc = ll_local_open(file, it, fd, *och_p);
694 			GOTO(out_och_free, rc);
696 	mutex_unlock(&lli->lli_och_mutex);
699 	/* Must do this outside lli_och_mutex lock to prevent deadlock where
700 	   different kind of OPEN lock for this same inode gets cancelled
701 	   by ldlm_cancel_lru */
702 	if (!S_ISREG(inode->i_mode))
703 		GOTO(out_och_free, rc);
	/* No stripe MD yet: delay OST object creation for delay-create or
	 * read-only opens; otherwise clear the delay flag and proceed. */
707 	if (!lli->lli_has_smd &&
708 	    (cl_is_lov_delay_create(file->f_flags) ||
709 	     (file->f_mode & FMODE_WRITE) == 0)) {
710 		CDEBUG(D_INODE, "object creation was delayed\n");
711 		GOTO(out_och_free, rc);
713 	cl_lov_delay_create_clear(&file->f_flags);
714 	GOTO(out_och_free, rc);
	/* Error/common cleanup paths follow (labels not visible here). */
718 		if (och_p && *och_p) {
719 			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
720 			*och_p = NULL; /* OBD_FREE writes some magic there */
723 		mutex_unlock(&lli->lli_och_mutex);
726 		if (lli->lli_opendir_key == fd)
727 			ll_deauthorize_statahead(inode, fd);
729 			ll_file_data_put(fd);
731 		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
734 	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
735 		ptlrpc_req_finished(it->d.lustre.it_data);
736 		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
742 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
743 					struct ldlm_lock_desc *desc, void *data, int flag)
/* LDLM blocking AST for lease locks: on LDLM_CB_BLOCKING, cancel the
 * lease lock asynchronously (the lease is thereby "broken"); the
 * LDLM_CB_CANCELING arm is visible below but its body is outside this
 * truncated view. Unlike ll_md_blocking_ast this handler does not deal
 * with open handles (see comment at ll_lease_open). */
746 	struct lustre_handle lockh;
750 	case LDLM_CB_BLOCKING:
751 		ldlm_lock2handle(lock, &lockh);
752 		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
754 			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
758 	case LDLM_CB_CANCELING:
766  * Acquire a lease and open the file.
768 static struct obd_client_handle *
769 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
/* Open @inode with a lease (MDS_OPEN_LEASE) for exactly FMODE_READ or
 * FMODE_WRITE. If @file is given, the lease must match an existing sole
 * open of compatible mode; the existing open handle is passed to the MDT
 * (op_handle = old_handle) to prove same-owner. The lease lock is kept
 * out of the LRU (LDLM_FL_NO_LRU) and marked LDLM_FL_EXCL so normal
 * opens can't match it. Returns the och holding the lease, or ERR_PTR.
 * On failure after open, the open lock is cancelled and the handle
 * closed (out_close path).
 * NOTE(review): truncated view; some returns and label lines missing. */
772 	struct lookup_intent it = { .it_op = IT_OPEN };
773 	struct ll_sb_info *sbi = ll_i2sbi(inode);
774 	struct md_op_data *op_data;
775 	struct ptlrpc_request *req = NULL;
776 	struct lustre_handle old_handle = { 0 };
777 	struct obd_client_handle *och = NULL;
782 	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
783 		RETURN(ERR_PTR(-EINVAL));
786 		struct ll_inode_info *lli = ll_i2info(inode);
787 		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
788 		struct obd_client_handle **och_p;
		/* Requested mode must be covered by the file's open mode,
		 * and exec opens cannot take leases. */
791 		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
792 			RETURN(ERR_PTR(-EPERM));
794 		/* Get the openhandle of the file */
796 		mutex_lock(&lli->lli_och_mutex);
797 		if (fd->fd_lease_och != NULL) {
798 			mutex_unlock(&lli->lli_och_mutex);
802 		if (fd->fd_och == NULL) {
803 			if (file->f_mode & FMODE_WRITE) {
804 				LASSERT(lli->lli_mds_write_och != NULL);
805 				och_p = &lli->lli_mds_write_och;
806 				och_usecount = &lli->lli_open_fd_write_count;
808 				LASSERT(lli->lli_mds_read_och != NULL);
809 				och_p = &lli->lli_mds_read_och;
810 				och_usecount = &lli->lli_open_fd_read_count;
			/* Lease requires being the sole opener of this mode. */
812 			if (*och_usecount == 1) {
819 		mutex_unlock(&lli->lli_och_mutex);
820 		if (rc < 0) /* more than 1 opener */
823 		LASSERT(fd->fd_och != NULL);
824 		old_handle = fd->fd_och->och_fh;
829 		RETURN(ERR_PTR(-ENOMEM));
831 	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
832 					LUSTRE_OPC_ANY, NULL);
834 		GOTO(out, rc = PTR_ERR(op_data));
836 	/* To tell the MDT this openhandle is from the same owner */
837 	op_data->op_handle = old_handle;
839 	it.it_flags = fmode | open_flags;
840 	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
841 	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
842 				&ll_md_blocking_lease_ast,
843 	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
844 	 * it can be cancelled which may mislead applications that the lease is
846 	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
847 	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
848 	 * doesn't deal with openhandle, so normal openhandle will be leaked. */
849 				LDLM_FL_NO_LRU | LDLM_FL_EXCL);
850 	ll_finish_md_op_data(op_data);
851 	ptlrpc_req_finished(req);
853 		GOTO(out_release_it, rc);
855 	if (it_disposition(&it, DISP_LOOKUP_NEG))
856 		GOTO(out_release_it, rc = -ENOENT);
858 	rc = it_open_error(DISP_OPEN_OPEN, &it);
860 		GOTO(out_release_it, rc);
862 	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
863 	ll_och_fill(sbi->ll_md_exp, &it, och);
865 	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
866 		GOTO(out_close, rc = -EOPNOTSUPP);
868 	/* already get lease, handle lease lock */
869 	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
870 	if (it.d.lustre.it_lock_mode == 0 ||
871 	    it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
872 		/* open lock must return for lease */
873 		CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
874 			PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
875 			it.d.lustre.it_lock_bits);
876 		GOTO(out_close, rc = -EPROTO);
879 	ll_intent_release(&it);
	/* --- failure path: undo the open we just performed --- */
883 	/* Cancel open lock */
884 	if (it.d.lustre.it_lock_mode != 0) {
885 		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
886 						it.d.lustre.it_lock_mode);
887 		it.d.lustre.it_lock_mode = 0;
888 		och->och_lease_handle.cookie = 0ULL;
890 	rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
892 		CERROR("%s: error closing file "DFID": %d\n",
893 			ll_get_fsname(inode->i_sb, NULL, 0),
894 			PFID(&ll_i2info(inode)->lli_fid), rc2);
895 	och = NULL; /* och has been freed in ll_close_inode_openhandle() */
897 	ll_intent_release(&it);
905  * Release lease and close the file.
906  * It will check if the lease has ever broken.
908 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Drop the lease held in @och and close the underlying open handle.
 * Whether the lease was already broken is read from the lock's CANCEL
 * flag under the resource lock and reported through @lease_broken
 * (may be NULL). If not yet cancelled, the lease lock is cancelled
 * synchronously here before the close RPC. */
911 	struct ldlm_lock *lock;
912 	bool cancelled = true;
916 	lock = ldlm_handle2lock(&och->och_lease_handle);
918 		lock_res_and_lock(lock);
919 		cancelled = ldlm_is_cancel(lock);
920 		unlock_res_and_lock(lock);
924 	CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
925 		PFID(&ll_i2info(inode)->lli_fid), cancelled);
928 		ldlm_cli_cancel(&och->och_lease_handle, 0);
929 	if (lease_broken != NULL)
930 		*lease_broken = cancelled;
932 	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
937 /* Fills the obdo with the attributes for the lsm */
938 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
939 			  struct obd_capa *capa, struct obdo *obdo,
940 			  __u64 ioepoch, int dv_flags)
/* Fetch object attributes (size, blocks, times, data version, ...)
 * for all stripes of @lsm from the OSTs via an async getattr set, then
 * wait for completion. @dv_flags (LL_DV_RD_FLUSH/LL_DV_WR_FLUSH)
 * requests server-side lock + cache flush so the returned data version
 * is stable; with WR_FLUSH the reply must confirm OBD_FL_FLUSH or the
 * call is treated as failed (check at line 984).
 * NOTE(review): truncated view; oinfo.oi_oa setup/alloc lines and
 * returns are partly missing. */
942 	struct ptlrpc_request_set *set;
943 	struct obd_info oinfo = { { { 0 } } };
948 	LASSERT(lsm != NULL);
952 	oinfo.oi_oa->o_oi = lsm->lsm_oi;
953 	oinfo.oi_oa->o_mode = S_IFREG;
954 	oinfo.oi_oa->o_ioepoch = ioepoch;
955 	oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
956 			       OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
957 			       OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
958 			       OBD_MD_FLMTIME | OBD_MD_FLCTIME |
959 			       OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
960 			       OBD_MD_FLDATAVERSION;
961 	oinfo.oi_capa = capa;
962 	if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
963 		oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
964 		oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
965 		if (dv_flags & LL_DV_WR_FLUSH)
966 			oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
969 	set = ptlrpc_prep_set();
971 		CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
974 		rc = obd_getattr_async(exp, &oinfo, set);
976 			rc = ptlrpc_set_wait(set);
977 		ptlrpc_set_destroy(set);
	/* Mask o_valid down to the attributes this caller consumes. */
980 		oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
981 					 OBD_MD_FLATIME | OBD_MD_FLMTIME |
982 					 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
983 					 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
984 		if (dv_flags & LL_DV_WR_FLUSH &&
985 		    !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
986 		      oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
993  * Performs the getattr on the inode and updates its fields.
994  * If @sync != 0, perform the getattr under the server-side lock.
996 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
997 		     __u64 ioepoch, int sync)
/* Wrapper around ll_lsm_getattr() for this inode's stripe MD: fetches
 * OST attributes into @obdo (LL_DV_RD_FLUSH when @sync) and refreshes
 * the inode fields from the returned o_valid bits. Releases the lsm
 * and capability references it takes. */
999 	struct obd_capa *capa = ll_mdscapa_get(inode);
1000 	struct lov_stripe_md *lsm;
1004 	lsm = ccc_inode_lsm_get(inode);
1005 	rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1006 			    capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1009 		struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
1011 		obdo_refresh_inode(inode, obdo, obdo->o_valid);
1012 		CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1013 		       " blksize %lu\n", POSTID(oi), i_size_read(inode),
1014 		       (unsigned long long)inode->i_blocks,
1015 		       1UL << inode->i_blkbits);
1017 	ccc_inode_lsm_put(inode, lsm);
1021 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
/* Merge MDS-provided timestamps (cached in lli_lvb) with the attributes
 * currently known by the cl_object layer (from the OSTs), keeping the
 * newest of each timestamp, and publish the resulting size/blocks/times
 * back into the VFS inode. All updates happen under the inode size
 * lock; the cl attributes are read under the object attr lock. */
1023 	struct ll_inode_info *lli = ll_i2info(inode);
1024 	struct cl_object *obj = lli->lli_clob;
1025 	struct cl_attr *attr = ccc_env_thread_attr(env);
1031 	ll_inode_size_lock(inode);
1032 	/* merge timestamps the most recently obtained from mds with
1033 	   timestamps obtained from osts */
1034 	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1035 	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1036 	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1038 	lvb.lvb_size = i_size_read(inode);
1039 	lvb.lvb_blocks = inode->i_blocks;
1040 	lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1041 	lvb.lvb_atime = LTIME_S(inode->i_atime);
1042 	lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1044 	cl_object_attr_lock(obj);
1045 	rc = cl_object_attr_get(env, obj, attr);
1046 	cl_object_attr_unlock(obj);
	/* Keep the most recent of MDS vs OST timestamps. */
1049 		if (lvb.lvb_atime < attr->cat_atime)
1050 			lvb.lvb_atime = attr->cat_atime;
1051 		if (lvb.lvb_ctime < attr->cat_ctime)
1052 			lvb.lvb_ctime = attr->cat_ctime;
1053 		if (lvb.lvb_mtime < attr->cat_mtime)
1054 			lvb.lvb_mtime = attr->cat_mtime;
1056 		CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1057 		       PFID(&lli->lli_fid), attr->cat_size);
1058 		cl_isize_write_nolock(inode, attr->cat_size);
1060 		inode->i_blocks = attr->cat_blocks;
1062 		LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1063 		LTIME_S(inode->i_atime) = lvb.lvb_atime;
1064 		LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1066 	ll_inode_size_unlock(inode);
1071 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
/* Glimpse helper for ioctls: fetch OST attributes for @lsm (no capa,
 * no flush flags) and copy size/blocks/times into the caller's stat
 * structure. */
1074 	struct obdo obdo = { 0 };
1077 	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1079 		st->st_size = obdo.o_size;
1080 		st->st_blocks = obdo.o_blocks;
1081 		st->st_mtime = obdo.o_mtime;
1082 		st->st_atime = obdo.o_atime;
1083 		st->st_ctime = obdo.o_ctime;
1088 static bool file_is_noatime(const struct file *file)
/* Decide whether atime updates should be suppressed for this file,
 * mirroring the kernel's file_accessed()/touch_atime() checks: O_NOATIME
 * on the file, S_NOATIME on the inode, noatime/readonly mount flags,
 * and nodiratime (mount or sb) for directories.
 * NOTE(review): truncated view — the individual "return true" lines
 * after each check are not visible. */
1090 	const struct vfsmount *mnt = file->f_path.mnt;
1091 	const struct inode *inode = file->f_path.dentry->d_inode;
1093 	/* Adapted from file_accessed() and touch_atime().*/
1094 	if (file->f_flags & O_NOATIME)
1097 	if (inode->i_flags & S_NOATIME)
1100 	if (IS_NOATIME(inode))
1103 	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1106 	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1109 	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1115 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
/* Initialize a cl_io for a read (@write == 0) or write on @file:
 * propagate O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT into the io flags,
 * attach the file's cl object, choose the lock requirement policy
 * (never for nolock files, mandatory for O_APPEND, otherwise "maybe"),
 * and record the noatime decision. */
1117 	struct inode *inode = file->f_dentry->d_inode;
1119 	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1121 		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1122 		io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1123 				      file->f_flags & O_DIRECT ||
1126 	io->ci_obj = ll_i2info(inode)->lli_clob;
1127 	io->ci_lockreq = CILR_MAYBE;
1128 	if (ll_file_nolock(file)) {
1129 		io->ci_lockreq = CILR_NEVER;
1130 		io->ci_no_srvlock = 1;
1131 	} else if (file->f_flags & O_APPEND) {
1132 		io->ci_lockreq = CILR_MANDATORY;
1135 	io->ci_noatime = file_is_noatime(file);
1139 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1140 		   struct file *file, enum cl_io_type iot,
1141 		   loff_t *ppos, size_t count)
/* Core read/write engine shared by the aio/splice entry points. Sets up
 * a cl_io, takes the per-file range lock for non-group-locked normal
 * writes (whole-file range for O_APPEND), runs cl_io_loop() under
 * lli_trunc_sem, and updates *ppos from the io on completion. A zero/
 * ENODATA result with ci_need_restart set loops the whole io again
 * (short read/write is returned instead once any bytes moved — see
 * comment at line 1220). Byte counters are tallied into per-sb stats
 * and fd_write_failed tracks write outcome for close-time reporting.
 * NOTE(review): truncated view — loop construct, error gotos and the
 * final return are not all visible. */
1143 	struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1144 	struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
1147 	struct range_lock range;
1150 	CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1151 		file->f_dentry->d_name.name, iot, *ppos, count);
1154 	io = ccc_env_thread_io(env);
1155 	ll_io_init(io, file, iot == CIT_WRITE);
1157 	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1158 		struct vvp_io *vio = vvp_env_io(env);
1159 		struct ccc_io *cio = ccc_env_io(env);
1160 		bool range_locked = false;
		/* O_APPEND writes land at EOF, so lock the whole file. */
1162 			if (file->f_flags & O_APPEND)
1163 				range_lock_init(&range, 0, LUSTRE_EOF);
1165 				range_lock_init(&range, *ppos, *ppos + count - 1);
1166 		cio->cui_fd  = LUSTRE_FPRIVATE(file);
1167 		vio->cui_io_subtype = args->via_io_subtype;
1169 		switch (vio->cui_io_subtype) {
1171 			cio->cui_iov = args->u.normal.via_iov;
1172 			cio->cui_nrsegs = args->u.normal.via_nrsegs;
1173 			cio->cui_tot_nrsegs = cio->cui_nrsegs;
1174 			cio->cui_iocb = args->u.normal.via_iocb;
			/* Group-locked files skip the range lock. */
1175 			if ((iot == CIT_WRITE) &&
1176 			    !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1177 				CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1179 				result = range_lock(&lli->lli_write_tree,
1184 				range_locked = true;
1186 			down_read(&lli->lli_trunc_sem);
1189 			vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1190 			vio->u.splice.cui_flags = args->u.splice.via_flags;
1193 			CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1197 		ll_cl_add(file, env, io);
1198 		result = cl_io_loop(env, io);
1199 		ll_cl_remove(file, env);
1201 		if (args->via_io_subtype == IO_NORMAL)
1202 			up_read(&lli->lli_trunc_sem);
1204 			CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1206 			range_unlock(&lli->lli_write_tree, &range);
1209 		/* cl_io_rw_init() handled IO */
1210 		result = io->ci_result;
1213 	if (io->ci_nob > 0) {
1214 		result = io->ci_nob;
1215 		*ppos = io->u.ci_wr.wr.crw_pos;
1219 	cl_io_fini(env, io);
1220 	/* If any bit been read/written (result != 0), we just return
1221 	 * short read/write instead of restart io. */
1222 	if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1223 		CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1224 		       iot == CIT_READ ? "read" : "write",
1225 		       file->f_dentry->d_name.name, *ppos, count);
1226 		LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1230 	if (iot == CIT_READ) {
1232 			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1233 					   LPROC_LL_READ_BYTES, result);
1234 	} else if (iot == CIT_WRITE) {
1236 			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1237 					   LPROC_LL_WRITE_BYTES, result);
1238 			fd->fd_write_failed = false;
1239 		} else if (result != -ERESTARTSYS) {
1240 			fd->fd_write_failed = true;
1243 	CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
/* Validate a user-supplied iovec array and total up the byte count for a
 * read/write.  Rejects negative segment lengths and cumulative wrap-around.
 * NOTE(review): source appears elided here (comment delimiters, cnt/seg
 * declarations and some return paths are missing) -- verify upstream. */
1250 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1252 static int ll_file_get_iov_count(const struct iovec *iov,
1253 unsigned long *nr_segs, size_t *count)
/* Walk each segment, accumulating into the running count. */
1258 for (seg = 0; seg < *nr_segs; seg++) {
1259 const struct iovec *iv = &iov[seg];
1262 * If any segment has a negative length, or the cumulative
1263 * length ever wraps negative then return -EINVAL.
1266 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* access_ok() verifies the user buffer is readable before it is used. */
1268 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1273 cnt -= iv->iov_len; /* This segment is no good */
/* Vectored (aio) read entry point: validate the iovec, package it into
 * vvp_io_args and dispatch through ll_file_io_generic() as CIT_READ.
 * Returns bytes read or a negative errno. */
1280 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1281 unsigned long nr_segs, loff_t pos)
1284 struct vvp_io_args *args;
/* Sanity-check the iovec and compute the total transfer size. */
1290 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1294 env = cl_env_get(&refcheck);
1296 RETURN(PTR_ERR(env));
1298 args = vvp_env_args(env, IO_NORMAL);
1299 args->u.normal.via_iov = (struct iovec *)iov;
1300 args->u.normal.via_nrsegs = nr_segs;
1301 args->u.normal.via_iocb = iocb;
/* The generic path updates iocb->ki_pos as the IO progresses. */
1303 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1304 &iocb->ki_pos, count);
1305 cl_env_put(env, &refcheck);
/* Classic read(2) entry point: wrap the flat user buffer into a single
 * iovec plus a synchronous kiocb and delegate to ll_file_aio_read().
 * *ppos is updated from the kiocb position on return. */
1309 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1313 struct iovec *local_iov;
1314 struct kiocb *kiocb;
1319 env = cl_env_get(&refcheck);
1321 RETURN(PTR_ERR(env));
/* Per-thread scratch iovec/kiocb live in the cl environment info. */
1323 local_iov = &vvp_env_info(env)->vti_local_iov;
1324 kiocb = &vvp_env_info(env)->vti_kiocb;
1325 local_iov->iov_base = (void __user *)buf;
1326 local_iov->iov_len = count;
1327 init_sync_kiocb(kiocb, file);
1328 kiocb->ki_pos = *ppos;
/* Kernel-version compat: the residual-count field was renamed. */
1329 #ifdef HAVE_KIOCB_KI_LEFT
1330 kiocb->ki_left = count;
1332 kiocb->ki_nbytes = count;
1335 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1336 *ppos = kiocb->ki_pos;
1338 cl_env_put(env, &refcheck);
/* Vectored (aio) write entry point: validate the iovec, package it into
 * vvp_io_args and dispatch through ll_file_io_generic() as CIT_WRITE.
 * Mirrors ll_file_aio_read(). */
1343 * Write to a file (through the page cache).
1346 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1347 unsigned long nr_segs, loff_t pos)
1350 struct vvp_io_args *args;
1356 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1360 env = cl_env_get(&refcheck);
1362 RETURN(PTR_ERR(env));
1364 args = vvp_env_args(env, IO_NORMAL);
1365 args->u.normal.via_iov = (struct iovec *)iov;
1366 args->u.normal.via_nrsegs = nr_segs;
1367 args->u.normal.via_iocb = iocb;
/* The generic path advances iocb->ki_pos as data is written. */
1369 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1370 &iocb->ki_pos, count);
1371 cl_env_put(env, &refcheck);
/* Classic write(2) entry point: wrap the flat user buffer into a single
 * iovec plus a synchronous kiocb and delegate to ll_file_aio_write().
 * *ppos is updated from the kiocb position on return. */
1375 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1376 size_t count, loff_t *ppos)
1379 struct iovec *local_iov;
1380 struct kiocb *kiocb;
1385 env = cl_env_get(&refcheck);
1387 RETURN(PTR_ERR(env));
/* Per-thread scratch iovec/kiocb live in the cl environment info. */
1389 local_iov = &vvp_env_info(env)->vti_local_iov;
1390 kiocb = &vvp_env_info(env)->vti_kiocb;
1391 local_iov->iov_base = (void __user *)buf;
1392 local_iov->iov_len = count;
1393 init_sync_kiocb(kiocb, file);
1394 kiocb->ki_pos = *ppos;
/* Kernel-version compat: the residual-count field was renamed. */
1395 #ifdef HAVE_KIOCB_KI_LEFT
1396 kiocb->ki_left = count;
1398 kiocb->ki_nbytes = count;
1401 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1402 *ppos = kiocb->ki_pos;
1404 cl_env_put(env, &refcheck);
/* splice_read file operation: stream page-cache content into a pipe by
 * running the generic IO path with IO_SPLICE sub-type and CIT_READ. */
1409 * Send file content (through pagecache) somewhere with helper
1411 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1412 struct pipe_inode_info *pipe, size_t count,
1416 struct vvp_io_args *args;
1421 env = cl_env_get(&refcheck);
1423 RETURN(PTR_ERR(env));
/* Splice args carry the target pipe and splice flags instead of an iovec. */
1425 args = vvp_env_args(env, IO_SPLICE);
1426 args->u.splice.via_pipe = pipe;
1427 args->u.splice.via_flags = flags;
1429 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1430 cl_env_put(env, &refcheck);
/* Set striping (LOV EA) on a file by re-opening it with an IT_OPEN intent
 * carrying the user-supplied lov_user_md.  Fails with -EEXIST if a layout
 * already exists.  Runs under the inode size lock.
 * NOTE(review): several error-path lines appear elided -- verify upstream. */
1434 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1435 __u64 flags, struct lov_user_md *lum,
1438 struct lov_stripe_md *lsm = NULL;
1439 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* A layout can only be set once: bail out if stripes already exist. */
1443 lsm = ccc_inode_lsm_get(inode);
1445 ccc_inode_lsm_put(inode, lsm);
1446 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1447 PFID(ll_inode2fid(inode)));
1448 GOTO(out, rc = -EEXIST);
1451 ll_inode_size_lock(inode);
1452 oit.it_flags |= MDS_OPEN_BY_FID;
/* The MDS creates the layout as a side effect of this intent open. */
1453 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1455 GOTO(out_unlock, rc);
1456 rc = oit.d.lustre.it_status;
1458 GOTO(out_req_free, rc);
/* Close the transient open handle obtained from the intent. */
1460 ll_release_openhandle(file->f_dentry, &oit);
1463 ll_inode_size_unlock(inode);
1464 ll_intent_release(&oit);
1465 ccc_inode_lsm_put(inode, lsm);
1467 cl_lov_delay_create_clear(&file->f_flags);
1470 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* Fetch the LOV EA (striping info) for @filename from the MDS via
 * md_getattr_name(), validate its magic, and byte-swap it to host endian
 * for userspace on big-endian hosts.  On success *lmmp/*lmm_size point
 * into the reply buffer; caller owns *request and must release it. */
1474 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1475 struct lov_mds_md **lmmp, int *lmm_size,
1476 struct ptlrpc_request **request)
1478 struct ll_sb_info *sbi = ll_i2sbi(inode);
1479 struct mdt_body *body;
1480 struct lov_mds_md *lmm = NULL;
1481 struct ptlrpc_request *req = NULL;
1482 struct md_op_data *op_data;
/* Size the getattr reply buffer to the MDS default EA size. */
1485 rc = ll_get_default_mdsize(sbi, &lmmsize);
1489 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1490 strlen(filename), lmmsize,
1491 LUSTRE_OPC_ANY, NULL);
1492 if (IS_ERR(op_data))
1493 RETURN(PTR_ERR(op_data));
1495 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1496 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1497 ll_finish_md_op_data(op_data);
1499 CDEBUG(D_INFO, "md_getattr_name failed "
1500 "on %s: rc %d\n", filename, rc);
1504 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1505 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1507 lmmsize = body->mbo_eadatasize;
/* No EA data in the reply means the file simply has no striping. */
1509 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1511 GOTO(out, rc = -ENODATA);
1514 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1515 LASSERT(lmm != NULL);
/* Only plain V1/V3 layouts are understood here. */
1517 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1518 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1519 GOTO(out, rc = -EPROTO);
1523 * This is coming from the MDS, so is probably in
1524 * little endian. We convert it to host endian before
1525 * passing it to userspace.
/* Swab only needed when host byte order differs from wire (LE) order. */
1527 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1530 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1531 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1534 /* if function called for directory - we should
1535 * avoid swab not existent lsm objects */
1536 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1537 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1538 if (S_ISREG(body->mbo_mode))
1539 lustre_swab_lov_user_md_objects(
1540 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1542 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1543 lustre_swab_lov_user_md_v3(
1544 (struct lov_user_md_v3 *)lmm);
1545 if (S_ISREG(body->mbo_mode))
1546 lustre_swab_lov_user_md_objects(
1547 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1554 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST entry) from
 * userspace and apply it with MDS_OPEN_HAS_OBJS.  Requires CAP_SYS_ADMIN
 * since it specifies pre-existing objects. */
1559 static int ll_lov_setea(struct inode *inode, struct file *file,
1562 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1563 struct lov_user_md *lump;
1564 int lum_size = sizeof(struct lov_user_md) +
1565 sizeof(struct lov_user_ost_data);
/* Privileged operation: only root may hand-place objects. */
1569 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1572 OBD_ALLOC_LARGE(lump, lum_size);
1576 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1577 OBD_FREE_LARGE(lump, lum_size);
1581 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1583 OBD_FREE_LARGE(lump, lum_size);
/* LL_IOC_LOV_SETSTRIPE handler: copy the user layout request in, set the
 * stripe EA, then echo the resulting layout back to userspace via a
 * GETSTRIPE iocontrol so the caller sees what was actually created. */
1590 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1591 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1591 struct lov_user_md *klum;
1593 __u64 flags = FMODE_WRITE;
1596 rc = ll_copy_user_md(lum, &klum);
1601 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1603 struct lov_stripe_md *lsm;
/* Zero stripe_count first so GETSTRIPE won't over-copy objects. */
1606 put_user(0, &lum->lmm_stripe_count);
1608 ll_layout_refresh(inode, &gen);
1609 lsm = ccc_inode_lsm_get(inode);
1610 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1612 ccc_inode_lsm_put(inode, lsm);
1615 OBD_FREE(klum, lum_size);
/* LL_IOC_LOV_GETSTRIPE handler: copy the inode's current layout out to the
 * user buffer through the data-device iocontrol. */
1619 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1621 struct lov_stripe_md *lsm;
1625 lsm = ccc_inode_lsm_get(inode);
1627 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1628 lsm, (void __user *)arg);
/* Drop the lsm reference taken above regardless of the iocontrol result. */
1629 ccc_inode_lsm_put(inode, lsm);
/* LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with gid @arg
 * and record it in the per-open-file data.  Only one group lock may be
 * held per file descriptor; a lost race after the (unlocked) enqueue is
 * detected and the freshly acquired lock is dropped. */
1634 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1636 struct ll_inode_info *lli = ll_i2info(inode);
1637 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1638 struct ccc_grouplock grouplock;
1642 if (ll_file_nolock(file))
1643 RETURN(-EOPNOTSUPP);
/* First check under lli_lock: refuse a second group lock on this fd. */
1645 spin_lock(&lli->lli_lock);
1646 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1647 CWARN("group lock already existed with gid %lu\n",
1648 fd->fd_grouplock.cg_gid);
1649 spin_unlock(&lli->lli_lock);
1652 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1653 spin_unlock(&lli->lli_lock);
/* Enqueue outside the spinlock; may block unless O_NONBLOCK is set. */
1655 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1656 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the lock: another thread may have won meanwhile. */
1660 spin_lock(&lli->lli_lock);
1661 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1662 spin_unlock(&lli->lli_lock);
1663 CERROR("another thread just won the race\n");
1664 cl_put_grouplock(&grouplock);
1668 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1669 fd->fd_grouplock = grouplock;
1670 spin_unlock(&lli->lli_lock);
1672 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* LL_IOC_GROUP_UNLOCK handler: release the group lock recorded on this
 * file descriptor, verifying the caller-supplied gid matches the one
 * actually held.  The fd state is cleared under lli_lock, then the lock
 * itself is released outside the spinlock. */
1676 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1678 struct ll_inode_info *lli = ll_i2info(inode);
1679 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1680 struct ccc_grouplock grouplock;
1683 spin_lock(&lli->lli_lock);
1684 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1685 spin_unlock(&lli->lli_lock);
1686 CWARN("no group lock held\n");
1689 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* The gid must match the lock this fd actually holds. */
1691 if (fd->fd_grouplock.cg_gid != arg) {
1692 CWARN("group lock %lu doesn't match current id %lu\n",
1693 arg, fd->fd_grouplock.cg_gid);
1694 spin_unlock(&lli->lli_lock);
/* Detach the lock from the fd before releasing it without the spinlock. */
1698 grouplock = fd->fd_grouplock;
1699 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1700 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1701 spin_unlock(&lli->lli_lock);
1703 cl_put_grouplock(&grouplock);
1704 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/* Close the MDS open handle carried by an intent (e.g. after an intent
 * open that the caller does not intend to keep), and drop the enqueue
 * open reference if one was taken. */
1709 * Close inode open handle
1711 * \param dentry [in] dentry which contains the inode
1712 * \param it [in,out] intent which contains open info and result
1715 * \retval <0 failure
1717 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1719 struct inode *inode = dentry->d_inode;
1720 struct obd_client_handle *och;
1726 /* Root ? Do nothing. */
1727 if (dentry->d_inode->i_sb->s_root == dentry)
1730 /* No open handle to close? Move away */
1731 if (!it_disposition(it, DISP_OPEN_OPEN))
1734 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1736 OBD_ALLOC(och, sizeof(*och));
1738 GOTO(out, rc = -ENOMEM);
/* Populate the client handle from the intent, then close it on the MDS. */
1740 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1742 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1745 /* this one is in place of ll_file_open */
1746 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1747 ptlrpc_req_finished(it->d.lustre.it_data);
1748 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Core FIEMAP implementation: validate flags, optionally flush dirty data
 * (FIEMAP_FLAG_SYNC), then ask the data export for the extent mapping via
 * obd_get_info(KEY_FIEMAP).  A zero-size file short-circuits with no
 * extents. */
1754 * Get size for inode for which FIEMAP mapping is requested.
1755 * Make the FIEMAP get_info call and returns the result.
1757 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1760 struct obd_export *exp = ll_i2dtexp(inode);
1761 struct lov_stripe_md *lsm = NULL;
1762 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1763 __u32 vallen = num_bytes;
1767 /* Checks for fiemap flags */
1768 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags we do not support, per FIEMAP convention. */
1769 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1773 /* Check for FIEMAP_FLAG_SYNC */
1774 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1775 rc = filemap_fdatawrite(inode->i_mapping);
1780 lsm = ccc_inode_lsm_get(inode);
1784 /* If the stripe_count > 1 and the application does not understand
1785 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1787 if (lsm->lsm_stripe_count > 1 &&
1788 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1789 GOTO(out, rc = -EOPNOTSUPP);
1791 fm_key.oa.o_oi = lsm->lsm_oi;
1792 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1794 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1795 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1796 /* If filesize is 0, then there would be no objects for mapping */
1797 if (fm_key.oa.o_size == 0) {
1798 fiemap->fm_mapped_extents = 0;
1802 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1804 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1807 CERROR("obd_get_info failed: rc = %d\n", rc);
1810 ccc_inode_lsm_put(inode, lsm);
/* OBD_IOC_FID2PATH handler: resolve a FID to a pathname through the MDC.
 * Requires CAP_DAC_READ_SEARCH unless the fs allows user fid2path.  The
 * output buffer is sized from the user-supplied gf_pathlen. */
1814 int ll_fid2path(struct inode *inode, void __user *arg)
1816 struct obd_export *exp = ll_i2mdexp(inode);
1817 const struct getinfo_fid2path __user *gfin = arg;
1819 struct getinfo_fid2path *gfout;
1825 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1826 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1829 /* Only need to get the buflen */
1830 if (get_user(pathlen, &gfin->gf_pathlen))
/* Bound the allocation by the user-requested path length. */
1833 if (pathlen > PATH_MAX)
1836 outsize = sizeof(*gfout) + pathlen;
1837 OBD_ALLOC(gfout, outsize);
1841 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1842 GOTO(gf_free, rc = -EFAULT);
1844 /* Call mdc_iocontrol */
1845 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1849 if (copy_to_user(arg, gfout, outsize))
1853 OBD_FREE(gfout, outsize);
/* FSFILT_IOC_FIEMAP handler: size a kernel fiemap buffer from the user's
 * fm_extent_count (overflow-checked), copy the request (and, when extents
 * are requested, the first extent used for continuation) in, run
 * ll_do_fiemap(), and copy header plus mapped extents back out. */
1857 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1859 struct ll_user_fiemap *fiemap_s;
1860 size_t num_bytes, ret_bytes;
1861 unsigned int extent_count;
1864 /* Get the extent count so we can calculate the size of
1865 * required fiemap buffer */
1866 if (get_user(extent_count,
1867 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* Guard the multiplication below against size_t overflow. */
1871 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1873 num_bytes = sizeof(*fiemap_s) + (extent_count *
1874 sizeof(struct ll_fiemap_extent));
1876 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1877 if (fiemap_s == NULL)
1880 /* get the fiemap value */
1881 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1883 GOTO(error, rc = -EFAULT);
1885 /* If fm_extent_count is non-zero, read the first extent since
1886 * it is used to calculate end_offset and device from previous
1889 if (copy_from_user(&fiemap_s->fm_extents[0],
1890 (char __user *)arg + sizeof(*fiemap_s),
1891 sizeof(struct ll_fiemap_extent)))
1892 GOTO(error, rc = -EFAULT);
1895 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy the header back always; extents only if the caller asked. */
1899 ret_bytes = sizeof(struct ll_user_fiemap);
1901 if (extent_count != 0)
1902 ret_bytes += (fiemap_s->fm_mapped_extents *
1903 sizeof(struct ll_fiemap_extent));
1905 if (copy_to_user((void __user *)arg, fiemap_s, ret_bytes))
1909 OBD_FREE_LARGE(fiemap_s, num_bytes);
/* Retrieve the inode's data version (derived from OST object versions).
 * A stripeless file reports version 0.  @flags selects the OST-side sync
 * behavior (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH). */
1914 * Read the data_version for inode.
1916 * This value is computed using stripe object version on OST.
1917 * Version is computed using server side locking.
1919 * @param sync if do sync on the OST side;
1921 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1922 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1924 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1926 struct lov_stripe_md *lsm = NULL;
1927 struct ll_sb_info *sbi = ll_i2sbi(inode);
1928 struct obdo *obdo = NULL;
1932 /* If no stripe, we consider version is 0. */
1933 lsm = ccc_inode_lsm_get(inode);
1934 if (!lsm_has_objects(lsm)) {
1936 CDEBUG(D_INODE, "No object for inode\n");
1940 OBD_ALLOC_PTR(obdo);
1942 GOTO(out, rc = -ENOMEM);
/* Ask the OSTs; the version is only valid if the server set the flag. */
1944 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
1946 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1949 *data_version = obdo->o_data_version;
1955 ccc_inode_lsm_put(inode, lsm);
/* HSM release: take a write lease, flush and capture the data version,
 * merge attributes, then close the file with MDS_HSM_RELEASE so the MDT
 * can free the OST objects.  The lease handle is consumed by the close
 * (see comment below); any remaining lease is closed on the error path. */
1960 * Trigger a HSM release request for the provided inode.
1962 int ll_hsm_release(struct inode *inode)
1964 struct cl_env_nest nest;
1966 struct obd_client_handle *och = NULL;
1967 __u64 data_version = 0;
1971 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1972 ll_get_fsname(inode->i_sb, NULL, 0),
1973 PFID(&ll_i2info(inode)->lli_fid));
/* An exclusive write lease guarantees no concurrent modification. */
1975 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1977 GOTO(out, rc = PTR_ERR(och));
1979 /* Grab latest data_version and [am]time values */
1980 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1984 env = cl_env_nested_get(&nest);
1986 GOTO(out, rc = PTR_ERR(env));
1988 ll_merge_lvb(env, inode);
1989 cl_env_nested_put(&nest, env);
1991 /* Release the file.
1992 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1993 * we still need it to pack l_remote_handle to MDT. */
1994 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2000 if (och != NULL && !IS_ERR(och)) /* close the file */
2001 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): saved [am]times for later restore,
 * both inodes ordered by FID, and per-side data-version check flags.
 * NOTE(review): dv1/dv2 fields appear elided from this view -- verify. */
2006 struct ll_swap_stack {
2007 struct iattr ia1, ia2;
2009 struct inode *inode1, *inode2;
2010 bool check_dv1, check_dv2;
/* LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the layouts
 * of two regular files on the same filesystem.  Inodes are ordered by FID
 * to avoid deadlock, optional group locks flush dirty cache, optional
 * data-version checks detect concurrent modification, and [am]times can
 * be preserved across the swap on request. */
2013 static int ll_swap_layouts(struct file *file1, struct file *file2,
2014 struct lustre_swap_layouts *lsl)
2016 struct mdc_swap_layouts msl;
2017 struct md_op_data *op_data;
2020 struct ll_swap_stack *llss = NULL;
2023 OBD_ALLOC_PTR(llss);
2027 llss->inode1 = file1->f_dentry->d_inode;
2028 llss->inode2 = file2->f_dentry->d_inode;
2030 if (!S_ISREG(llss->inode2->i_mode))
2031 GOTO(free, rc = -EINVAL);
/* Caller must be able to write both files. */
2033 if (inode_permission(llss->inode1, MAY_WRITE) ||
2034 inode_permission(llss->inode2, MAY_WRITE))
2035 GOTO(free, rc = -EPERM);
2037 if (llss->inode2->i_sb != llss->inode1->i_sb)
2038 GOTO(free, rc = -EXDEV);
2040 /* we use 2 bool because it is easier to swap than 2 bits */
2041 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2042 llss->check_dv1 = true;
2044 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2045 llss->check_dv2 = true;
2047 /* we cannot use lsl->sl_dvX directly because we may swap them */
2048 llss->dv1 = lsl->sl_dv1;
2049 llss->dv2 = lsl->sl_dv2;
2051 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2052 if (rc == 0) /* same file, done! */
/* Order the pair by FID so lock acquisition order is globally consistent. */
2055 if (rc < 0) { /* sequentialize it */
2056 swap(llss->inode1, llss->inode2);
2058 swap(llss->dv1, llss->dv2);
2059 swap(llss->check_dv1, llss->check_dv2);
2063 if (gid != 0) { /* application asks to flush dirty cache */
2064 rc = ll_get_grouplock(llss->inode1, file1, gid);
2068 rc = ll_get_grouplock(llss->inode2, file2, gid);
2070 ll_put_grouplock(llss->inode1, file1, gid);
2075 /* to be able to restore mtime and atime after swap
2076 * we need to first save them */
2078 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2079 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2080 llss->ia1.ia_atime = llss->inode1->i_atime;
2081 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2082 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2083 llss->ia2.ia_atime = llss->inode2->i_atime;
2084 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2087 /* ultimate check, before swaping the layouts we check if
2088 * dataversion has changed (if requested) */
2089 if (llss->check_dv1) {
2090 rc = ll_data_version(llss->inode1, &dv, 0);
2093 if (dv != llss->dv1)
2094 GOTO(putgl, rc = -EAGAIN);
2097 if (llss->check_dv2) {
2098 rc = ll_data_version(llss->inode2, &dv, 0);
2101 if (dv != llss->dv2)
2102 GOTO(putgl, rc = -EAGAIN);
2105 /* struct md_op_data is used to send the swap args to the mdt
2106 * only flags is missing, so we use struct mdc_swap_layouts
2107 * through the md_op_data->op_data */
2108 /* flags from user space have to be converted before they are send to
2109 * server, no flag is sent today, they are only used on the client */
2112 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2113 0, LUSTRE_OPC_ANY, &msl);
2114 if (IS_ERR(op_data))
2115 GOTO(free, rc = PTR_ERR(op_data));
2117 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2118 sizeof(*op_data), op_data, NULL);
2119 ll_finish_md_op_data(op_data);
/* Release group locks in reverse acquisition order. */
2123 ll_put_grouplock(llss->inode2, file2, gid);
2124 ll_put_grouplock(llss->inode1, file1, gid);
2127 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2131 /* clear useless flags */
2132 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2133 llss->ia1.ia_valid &= ~ATTR_MTIME;
2134 llss->ia2.ia_valid &= ~ATTR_MTIME;
2137 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2138 llss->ia1.ia_valid &= ~ATTR_ATIME;
2139 llss->ia2.ia_valid &= ~ATTR_ATIME;
/* After the swap the saved times cross over: ia2 applies to file1. */
2142 /* update time if requested */
2144 if (llss->ia2.ia_valid != 0) {
2145 mutex_lock(&llss->inode1->i_mutex);
2146 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2147 mutex_unlock(&llss->inode1->i_mutex);
2150 if (llss->ia1.ia_valid != 0) {
2153 mutex_lock(&llss->inode2->i_mutex);
2154 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2155 mutex_unlock(&llss->inode2->i_mutex);
/* LL_IOC_HSM_STATE_SET backend: forward an HSM state-set request to the
 * MDT.  Unprivileged users may only touch flags within HSM_USER_MASK. */
2167 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2169 struct md_op_data *op_data;
2172 /* Non-root users are forbidden to set or clear flags which are
2173 * NOT defined in HSM_USER_MASK. */
2174 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2175 !cfs_capable(CFS_CAP_SYS_ADMIN))
2178 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2179 LUSTRE_OPC_ANY, hss);
2180 if (IS_ERR(op_data))
2181 RETURN(PTR_ERR(op_data));
/* The hss travels to the MDT inside op_data->op_data. */
2183 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2184 sizeof(*op_data), op_data, NULL);
2186 ll_finish_md_op_data(op_data);
/* LL_IOC_HSM_IMPORT backend: register a pre-existing archived copy as
 * this file's content.  Marks the file ARCHIVED|EXISTS|RELEASED, then
 * restores the mode/owner/size/times supplied by the import request via
 * a forced setattr under the inode mutex. */
2191 static int ll_hsm_import(struct inode *inode, struct file *file,
2192 struct hsm_user_import *hui)
2194 struct hsm_state_set *hss = NULL;
2195 struct iattr *attr = NULL;
2199 if (!S_ISREG(inode->i_mode))
2205 GOTO(out, rc = -ENOMEM);
/* Mark the file as released with content living in the given archive. */
2207 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2208 hss->hss_archive_id = hui->hui_archive_id;
2209 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2210 rc = ll_hsm_state_set(inode, hss);
2214 OBD_ALLOC_PTR(attr);
2216 GOTO(out, rc = -ENOMEM);
/* Rebuild the regular-file attributes from the import description. */
2218 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2219 attr->ia_mode |= S_IFREG;
2220 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2221 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2222 attr->ia_size = hui->hui_size;
2223 attr->ia_mtime.tv_sec = hui->hui_mtime;
2224 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2225 attr->ia_atime.tv_sec = hui->hui_atime;
2226 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2228 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2229 ATTR_UID | ATTR_GID |
2230 ATTR_MTIME | ATTR_MTIME_SET |
2231 ATTR_ATIME | ATTR_ATIME_SET;
2233 mutex_lock(&inode->i_mutex);
/* 'true' = import path: skip the permission checks a user setattr gets. */
2235 rc = ll_setattr_raw(file->f_dentry, attr, true);
2239 mutex_unlock(&inode->i_mutex);
2251 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2253 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2254 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/* unlocked_ioctl entry point for regular files: dispatches every llite
 * file ioctl (striping, group locks, FIEMAP, fid2path, data version,
 * HSM, leases, layout swap, ...).  Unknown commands fall through to the
 * registered iocontrol hooks and finally to the data export. */
2258 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2260 struct inode *inode = file->f_dentry->d_inode;
2261 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2265 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2266 PFID(ll_inode2fid(inode)), inode, cmd);
2267 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2269 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2270 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2274 case LL_IOC_GETFLAGS:
2275 /* Get the current value of the file flags */
2276 return put_user(fd->fd_flags, (int __user *)arg);
2277 case LL_IOC_SETFLAGS:
2278 case LL_IOC_CLRFLAGS:
2279 /* Set or clear specific file flags */
2280 /* XXX This probably needs checks to ensure the flags are
2281 * not abused, and to handle any flag side effects.
2283 if (get_user(flags, (int __user *) arg))
2286 if (cmd == LL_IOC_SETFLAGS) {
/* Disabling locking is only sane when caching is bypassed (O_DIRECT). */
2287 if ((flags & LL_FILE_IGNORE_LOCK) &&
2288 !(file->f_flags & O_DIRECT)) {
2289 CERROR("%s: unable to disable locking on "
2290 "non-O_DIRECT file\n", current->comm);
2294 fd->fd_flags |= flags;
2296 fd->fd_flags &= ~flags;
2299 case LL_IOC_LOV_SETSTRIPE:
2300 RETURN(ll_lov_setstripe(inode, file, arg));
2301 case LL_IOC_LOV_SETEA:
2302 RETURN(ll_lov_setea(inode, file, arg));
2303 case LL_IOC_LOV_SWAP_LAYOUTS: {
2305 struct lustre_swap_layouts lsl;
2307 if (copy_from_user(&lsl, (char __user *)arg,
2308 sizeof(struct lustre_swap_layouts)))
/* Both fds must be writable for a layout swap. */
2311 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2314 file2 = fget(lsl.sl_fd);
2319 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2320 rc = ll_swap_layouts(file, file2, &lsl);
2324 case LL_IOC_LOV_GETSTRIPE:
2325 RETURN(ll_lov_getstripe(inode, arg));
2326 case FSFILT_IOC_FIEMAP:
2327 RETURN(ll_ioctl_fiemap(inode, arg));
2328 case FSFILT_IOC_GETFLAGS:
2329 case FSFILT_IOC_SETFLAGS:
2330 RETURN(ll_iocontrol(inode, file, cmd, arg));
2331 case FSFILT_IOC_GETVERSION_OLD:
2332 case FSFILT_IOC_GETVERSION:
2333 RETURN(put_user(inode->i_generation, (int __user *)arg));
2334 case LL_IOC_GROUP_LOCK:
2335 RETURN(ll_get_grouplock(inode, file, arg));
2336 case LL_IOC_GROUP_UNLOCK:
2337 RETURN(ll_put_grouplock(inode, file, arg));
2338 case IOC_OBD_STATFS:
2339 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2341 /* We need to special case any other ioctls we want to handle,
2342 * to send them to the MDS/OST as appropriate and to properly
2343 * network encode the arg field.
2344 case FSFILT_IOC_SETVERSION_OLD:
2345 case FSFILT_IOC_SETVERSION:
2347 case LL_IOC_FLUSHCTX:
2348 RETURN(ll_flush_ctx(inode));
2349 case LL_IOC_PATH2FID: {
2350 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2351 sizeof(struct lu_fid)))
2356 case LL_IOC_GETPARENT:
2357 RETURN(ll_getparent(file, (void __user *)arg));
2359 case OBD_IOC_FID2PATH:
2360 RETURN(ll_fid2path(inode, (void __user *)arg));
2361 case LL_IOC_DATA_VERSION: {
2362 struct ioc_data_version idv;
2365 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the defined flush flags may come from userspace. */
2368 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2369 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2372 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2378 case LL_IOC_GET_MDTIDX: {
2381 mdtidx = ll_get_mdt_idx(inode);
2385 if (put_user((int)mdtidx, (int __user *)arg))
2390 case OBD_IOC_GETDTNAME:
2391 case OBD_IOC_GETMDNAME:
2392 RETURN(ll_get_obd_name(inode, cmd, arg));
2393 case LL_IOC_HSM_STATE_GET: {
2394 struct md_op_data *op_data;
2395 struct hsm_user_state *hus;
2402 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2403 LUSTRE_OPC_ANY, hus);
2404 if (IS_ERR(op_data)) {
2406 RETURN(PTR_ERR(op_data));
2409 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2412 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2415 ll_finish_md_op_data(op_data);
2419 case LL_IOC_HSM_STATE_SET: {
2420 struct hsm_state_set *hss;
2427 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2432 rc = ll_hsm_state_set(inode, hss);
2437 case LL_IOC_HSM_ACTION: {
2438 struct md_op_data *op_data;
2439 struct hsm_current_action *hca;
2446 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2447 LUSTRE_OPC_ANY, hca);
2448 if (IS_ERR(op_data)) {
2450 RETURN(PTR_ERR(op_data));
2453 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2456 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2459 ll_finish_md_op_data(op_data);
2463 case LL_IOC_SET_LEASE: {
2464 struct ll_inode_info *lli = ll_i2info(inode);
2465 struct obd_client_handle *och = NULL;
/* Requested lease mode must be compatible with the fd's open mode. */
2470 case LL_LEASE_WRLCK:
2471 if (!(file->f_mode & FMODE_WRITE))
2473 fmode = FMODE_WRITE;
2475 case LL_LEASE_RDLCK:
2476 if (!(file->f_mode & FMODE_READ))
2480 case LL_LEASE_UNLCK:
2481 mutex_lock(&lli->lli_och_mutex);
2482 if (fd->fd_lease_och != NULL) {
2483 och = fd->fd_lease_och;
2484 fd->fd_lease_och = NULL;
2486 mutex_unlock(&lli->lli_och_mutex);
2491 fmode = och->och_flags;
2492 rc = ll_lease_close(och, inode, &lease_broken);
2499 RETURN(ll_lease_type_from_fmode(fmode));
2504 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2506 /* apply for lease */
2507 och = ll_lease_open(inode, file, fmode, 0);
2509 RETURN(PTR_ERR(och));
/* Publish the lease on the fd unless another thread beat us to it. */
2512 mutex_lock(&lli->lli_och_mutex);
2513 if (fd->fd_lease_och == NULL) {
2514 fd->fd_lease_och = och;
2517 mutex_unlock(&lli->lli_och_mutex);
2519 /* impossible now that only excl is supported for now */
2520 ll_lease_close(och, inode, &lease_broken);
2525 case LL_IOC_GET_LEASE: {
2526 struct ll_inode_info *lli = ll_i2info(inode);
2527 struct ldlm_lock *lock = NULL;
2530 mutex_lock(&lli->lli_och_mutex);
2531 if (fd->fd_lease_och != NULL) {
2532 struct obd_client_handle *och = fd->fd_lease_och;
2534 lock = ldlm_handle2lock(&och->och_lease_handle);
2536 lock_res_and_lock(lock);
/* A cancelled lease lock means the lease is no longer valid. */
2537 if (!ldlm_is_cancel(lock))
2538 fmode = och->och_flags;
2540 unlock_res_and_lock(lock);
2541 LDLM_LOCK_PUT(lock);
2544 mutex_unlock(&lli->lli_och_mutex);
2546 RETURN(ll_lease_type_from_fmode(fmode));
2548 case LL_IOC_HSM_IMPORT: {
2549 struct hsm_user_import *hui;
2555 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2560 rc = ll_hsm_import(inode, file, hui);
/* Not handled above: try registered hooks, then the data export. */
2570 ll_iocontrol_call(inode, file, cmd, arg, &err))
2573 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2574 (void __user *)arg));
/* Compat helper (older kernels without generic_file_llseek_size): clamp
 * and apply a new file position, resetting f_version when it changes. */
2579 #ifndef HAVE_FILE_LLSEEK_SIZE
2580 static inline loff_t
2581 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
/* Negative offsets are only legal with FMODE_UNSIGNED_OFFSET. */
2583 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2585 if (offset > maxsize)
2588 if (offset != file->f_pos) {
2589 file->f_pos = offset;
2590 file->f_version = 0;
/* Compat copy of the kernel's generic llseek with explicit maxsize/eof:
 * handles SEEK_END (relative to @eof), lockless SEEK_CUR position query,
 * and SEEK_DATA/SEEK_HOLE emulation for fully-data files. */
2596 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2597 loff_t maxsize, loff_t eof)
2599 struct inode *inode = file->f_dentry->d_inode;
2607 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2608 * position-querying operation. Avoid rewriting the "same"
2609 * f_pos value back to the file because a concurrent read(),
2610 * write() or lseek() might have altered it
2615 * f_lock protects against read/modify/write race with other
2616 * SEEK_CURs. Note that parallel writes and reads behave
/* SEEK_CUR update is serialized under the inode mutex. */
2619 mutex_lock(&inode->i_mutex);
2620 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2621 mutex_unlock(&inode->i_mutex);
2625 * In the generic case the entire file is data, so as long as
2626 * offset isn't at the end of the file then the offset is data.
2633 * There is a virtual hole at the end of the file, so as long as
2634 * offset isn't i_size or larger, return i_size.
2642 return llseek_execute(file, offset, maxsize);
/* llseek file operation: for origins that depend on file size (SEEK_END,
 * SEEK_HOLE, SEEK_DATA) glimpse the current size from the OSTs first,
 * then delegate to the generic llseek bounded by the fs maximum. */
2646 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2648 struct inode *inode = file->f_dentry->d_inode;
2649 loff_t retval, eof = 0;
2652 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2653 (origin == SEEK_CUR) ? file->f_pos : 0);
2654 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2655 PFID(ll_inode2fid(inode)), inode, retval, retval,
2657 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* Size-relative seeks need an up-to-date size from the servers. */
2659 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2660 retval = ll_glimpse_size(inode);
2663 eof = i_size_read(inode);
2666 retval = ll_generic_file_llseek_size(file, offset, origin,
2667 ll_file_maxbytes(inode), eof);
/* flush file operation (called on close(2)): surface any async writeback
 * errors recorded against the inode/object as -EIO, unless the failure
 * was already reported to the application through a failed write. */
2671 static int ll_flush(struct file *file, fl_owner_t id)
2673 struct inode *inode = file->f_dentry->d_inode;
2674 struct ll_inode_info *lli = ll_i2info(inode);
2675 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2678 LASSERT(!S_ISDIR(inode->i_mode));
2680 /* catch async errors that were recorded back when async writeback
2681 * failed for pages in this mapping. */
/* Consume the sticky error so it is reported exactly once. */
2682 rc = lli->lli_async_rc;
2683 lli->lli_async_rc = 0;
2684 if (lli->lli_clob != NULL) {
2685 err = lov_read_and_clear_async_rc(lli->lli_clob);
2690 /* The application has been told write failure already.
2691 * Do not report failure again. */
2692 if (fd->fd_write_failed)
2694 return rc ? -EIO : 0;
2698 * Called to make sure a portion of file has been written out.
2699 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2701 * Return how many pages have been written.
/* Builds a CIT_FSYNC cl_io over [start, end] and runs it; on success the
 * return value is fio->fi_nr_written (pages written), otherwise the io
 * result.  NOTE(review): listing is line-sampled; the fi_end assignment
 * and capa release are elided here. */
2703 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2704 enum cl_fsync_mode mode, int ignore_layout)
2706 struct cl_env_nest nest;
2709 struct obd_capa *capa = NULL;
2710 struct cl_fsync_io *fio;
/* Reject any mode outside the known CL_FSYNC_* set. */
2714 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2715 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2718 env = cl_env_nested_get(&nest);
2720 RETURN(PTR_ERR(env));
2722 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2724 io = ccc_env_thread_io(env);
2725 io->ci_obj = cl_i2info(inode)->lli_clob;
2726 io->ci_ignore_layout = ignore_layout;
2728 /* initialize parameters for sync */
2729 fio = &io->u.ci_fsync;
2730 fio->fi_capa = capa;
2731 fio->fi_start = start;
2733 fio->fi_fid = ll_inode2fid(inode);
2734 fio->fi_mode = mode;
2735 fio->fi_nr_written = 0;
2737 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2738 result = cl_io_loop(env, io);
2740 result = io->ci_result;
2742 result = fio->fi_nr_written;
2743 cl_io_fini(env, io);
2744 cl_env_nested_put(&nest, env);
2752 * When dentry is provided (the 'else' case), *file->f_dentry may be
2753 * null and dentry must be used directly rather than pulled from
2754 * *file->f_dentry as is done otherwise.
/* fsync() entry point.  Three kernel-version signatures are compiled in
 * via HAVE_FILE_FSYNC_* config macros; all flush dirty pages, clear any
 * recorded async writeback error, sync the MDT (md_fsync) and, for
 * regular files, sync OST data via cl_sync_file_range(CL_FSYNC_ALL).
 * fd_write_failed tracks whether the application has been told of a
 * write failure.  NOTE(review): listing is line-sampled. */
2757 #ifdef HAVE_FILE_FSYNC_4ARGS
2758 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2760 struct dentry *dentry = file->f_dentry;
2761 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2762 int ll_fsync(struct file *file, int datasync)
2764 struct dentry *dentry = file->f_dentry;
2766 loff_t end = LLONG_MAX;
2768 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2771 loff_t end = LLONG_MAX;
2773 struct inode *inode = dentry->d_inode;
2774 struct ll_inode_info *lli = ll_i2info(inode);
2775 struct ptlrpc_request *req;
2776 struct obd_capa *oc;
2780 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2781 PFID(ll_inode2fid(inode)), inode);
2782 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2784 #ifdef HAVE_FILE_FSYNC_4ARGS
/* On >= 4-arg kernels the caller no longer holds i_mutex, so write out
 * the range and take the mutex here. */
2785 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2786 mutex_lock(&inode->i_mutex);
2788 /* fsync's caller has already called _fdata{sync,write}, we want
2789 * that IO to finish before calling the osc and mdc sync methods */
2790 rc = filemap_fdatawait(inode->i_mapping);
2793 /* catch async errors that were recorded back when async writeback
2794 * failed for pages in this mapping. */
2795 if (!S_ISDIR(inode->i_mode)) {
2796 err = lli->lli_async_rc;
2797 lli->lli_async_rc = 0;
2800 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDT. */
2805 oc = ll_mdscapa_get(inode);
2806 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2812 ptlrpc_req_finished(req);
/* Sync file data on the OSTs and update the per-fd failure flag. */
2814 if (S_ISREG(inode->i_mode)) {
2815 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2817 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2818 if (rc == 0 && err < 0)
2821 fd->fd_write_failed = true;
2823 fd->fd_write_failed = false;
2826 #ifdef HAVE_FILE_FSYNC_4ARGS
2827 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() advisory-lock entry point.  Translates the kernel
 * file_lock into an LDLM_FLOCK enqueue on the MDT, then mirrors the
 * result into the local lock tables (flock_lock_file_wait /
 * posix_lock_file_wait) so the VFS sees a consistent state.  If the
 * local step fails, the server lock is dropped again via an LCK_NL
 * enqueue.  NOTE(review): listing is line-sampled; several switch labels
 * and GOTO paths are elided.
 */
2833 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2835 struct inode *inode = file->f_dentry->d_inode;
2836 struct ll_sb_info *sbi = ll_i2sbi(inode);
2837 struct ldlm_enqueue_info einfo = {
2838 .ei_type = LDLM_FLOCK,
2839 .ei_cb_cp = ldlm_flock_completion_ast,
2840 .ei_cbdata = file_lock,
2842 struct md_op_data *op_data;
2843 struct lustre_handle lockh = {0};
2844 ldlm_policy_data_t flock = {{0}};
2845 int fl_type = file_lock->fl_type;
2851 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2852 PFID(ll_inode2fid(inode)), file_lock);
2854 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* Build the flock policy: BSD flock vs POSIX record lock. */
2856 if (file_lock->fl_flags & FL_FLOCK) {
2857 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2858 /* flocks are whole-file locks */
2859 flock.l_flock.end = OFFSET_MAX;
2860 /* For flocks owner is determined by the local file desctiptor*/
2861 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2862 } else if (file_lock->fl_flags & FL_POSIX) {
2863 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2864 flock.l_flock.start = file_lock->fl_start;
2865 flock.l_flock.end = file_lock->fl_end;
2869 flock.l_flock.pid = file_lock->fl_pid;
2871 /* Somewhat ugly workaround for svc lockd.
2872 * lockd installs custom fl_lmops->lm_compare_owner that checks
2873 * for the fl_owner to be the same (which it always is on local node
2874 * I guess between lockd processes) and then compares pid.
2875 * As such we assign pid to the owner field to make it all work,
2876 * conflict with normal locks is unlikely since pid space and
2877 * pointer space for current->files are not intersecting */
2878 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2879 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map fl_type to an LDLM mode: RDLCK->PR, UNLCK->NL, WRLCK->PW. */
2883 einfo.ei_mode = LCK_PR;
2886 /* An unlock request may or may not have any relation to
2887 * existing locks so we may not be able to pass a lock handle
2888 * via a normal ldlm_lock_cancel() request. The request may even
2889 * unlock a byte range in the middle of an existing lock. In
2890 * order to process an unlock request we need all of the same
2891 * information that is given with a normal read or write record
2892 * lock request. To avoid creating another ldlm unlock (cancel)
2893 * message we'll treat a LCK_NL flock request as an unlock. */
2894 einfo.ei_mode = LCK_NL;
2897 einfo.ei_mode = LCK_PW;
2900 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map cmd to enqueue flags: non-blocking vs test-only variants. */
2915 flags = LDLM_FL_BLOCK_NOWAIT;
2921 flags = LDLM_FL_TEST_LOCK;
2924 CERROR("unknown fcntl lock command: %d\n", cmd);
2928 /* Save the old mode so that if the mode in the lock changes we
2929 * can decrement the appropriate reader or writer refcount. */
2930 file_lock->fl_type = einfo.ei_mode;
2932 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2933 LUSTRE_OPC_ANY, NULL);
2934 if (IS_ERR(op_data))
2935 RETURN(PTR_ERR(op_data));
2937 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2938 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2939 flock.l_flock.pid, flags, einfo.ei_mode,
2940 flock.l_flock.start, flock.l_flock.end);
2942 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2945 /* Restore the file lock type if not TEST lock. */
2946 if (!(flags & LDLM_FL_TEST_LOCK))
2947 file_lock->fl_type = fl_type;
/* Mirror the server result into the kernel's local lock tables. */
2949 if ((file_lock->fl_flags & FL_FLOCK) &&
2950 (rc == 0 || file_lock->fl_type == F_UNLCK))
2951 rc2 = flock_lock_file_wait(file, file_lock);
2952 if ((file_lock->fl_flags & FL_POSIX) &&
2953 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2954 !(flags & LDLM_FL_TEST_LOCK))
2955 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed: release the server-side lock again. */
2957 if (rc2 && file_lock->fl_type != F_UNLCK) {
2958 einfo.ei_mode = LCK_NL;
2959 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2964 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name (length @namelen) under directory @parent via
 * an MDS getattr-by-name RPC.  On success *fid is filled from the reply
 * body.  NOTE(review): listing is line-sampled; the early-return on RPC
 * failure between lines 2985 and 2989 is elided.
 */
2969 int ll_get_fid_by_name(struct inode *parent, const char *name,
2970 int namelen, struct lu_fid *fid)
2972 struct md_op_data *op_data = NULL;
2973 struct mdt_body *body;
2974 struct ptlrpc_request *req;
2978 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2979 LUSTRE_OPC_ANY, NULL);
2980 if (IS_ERR(op_data))
2981 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the getattr reply. */
2983 op_data->op_valid = OBD_MD_FLID;
2984 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2985 ll_finish_md_op_data(op_data);
2989 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2991 GOTO(out_req, rc = -EFAULT);
2993 *fid = body->mbo_fid1;
2995 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx using a
 * rename RPC with CLI_MIGRATE set.  The child FID is taken from the
 * dcache when a matching dentry exists (its aliases are invalidated),
 * otherwise fetched via ll_get_fid_by_name().  A no-op if the object is
 * already on the target MDT.  NOTE(review): listing is line-sampled;
 * qstr setup, several error branches and the iput of child_inode are
 * elided.
 */
2999 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3000 const char *name, int namelen)
3002 struct dentry *dchild = NULL;
3003 struct inode *child_inode = NULL;
3004 struct md_op_data *op_data;
3005 struct ptlrpc_request *request = NULL;
3010 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3011 name, PFID(ll_inode2fid(parent)), mdtidx);
3013 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3014 0, LUSTRE_OPC_ANY, NULL);
3015 if (IS_ERR(op_data))
3016 RETURN(PTR_ERR(op_data));
3018 /* Get child FID first */
3019 qstr.hash = full_name_hash(name, namelen);
/* Prefer the cached dentry; fall back to an MDS lookup below. */
3022 dchild = d_lookup(file->f_dentry, &qstr);
3023 if (dchild != NULL && dchild->d_inode != NULL) {
3024 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3025 if (dchild->d_inode != NULL) {
3026 child_inode = igrab(dchild->d_inode);
3027 ll_invalidate_aliases(child_inode);
3031 rc = ll_get_fid_by_name(parent, name, namelen,
3037 if (!fid_is_sane(&op_data->op_fid3)) {
3038 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3039 ll_get_fsname(parent->i_sb, NULL, 0), name,
3040 PFID(&op_data->op_fid3));
3041 GOTO(out_free, rc = -EINVAL);
/* Skip the rename when the object already lives on the target MDT. */
3044 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3049 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3050 PFID(&op_data->op_fid3), mdtidx);
3051 GOTO(out_free, rc = 0);
3054 op_data->op_mds = mdtidx;
3055 op_data->op_cli_flags = CLI_MIGRATE;
/* Migration is implemented as a same-name rename with CLI_MIGRATE. */
3056 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3057 namelen, name, namelen, &request);
3059 ll_update_times(request, parent);
3061 ptlrpc_req_finished(request);
/* Drop the stale local inode; the object now lives on another MDT. */
3066 if (child_inode != NULL) {
3067 clear_nlink(child_inode);
3071 ll_finish_md_op_data(op_data);
/* Lock handler installed for -o noflock mounts; presumably returns
 * -ENOSYS for every request (body elided in this listing — confirm). */
3076 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3084 * test if some locks matching bits and l_req_mode are acquired
3085 * - bits can be in different locks
3086 * - if found clear the common lock bits in *bits
3087 * - the bits not found, are kept in *bits
3089 * \param bits [IN] searched lock bits [IN]
3090 * \param l_req_mode [IN] searched lock mode
3091 * \retval boolean, true iff all bits are found
3093 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3095 struct lustre_handle lockh;
3096 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four standard modes. */
3097 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3098 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3107 fid = &ll_i2info(inode)->lli_fid;
3108 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3109 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the found lock. */
3111 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3112 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3113 policy.l_inodebits.bits = *bits & (1 << i);
3114 if (policy.l_inodebits.bits == 0)
3117 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3118 &policy, mode, &lockh)) {
3119 struct ldlm_lock *lock;
3121 lock = ldlm_handle2lock(&lockh);
/* Clear every bit the matched lock covers, not just the probed one. */
3124 ~(lock->l_policy_data.l_inodebits.bits);
3125 LDLM_LOCK_PUT(lock);
3127 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a cached MDS inodebits lock
 * covering @bits.  Returns the matched LDLM mode (0 if none); on success
 * @lockh holds the lock handle which the caller must later decref.
 */
3134 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3135 struct lustre_handle *lockh, __u64 flags,
3138 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3143 fid = &ll_i2info(inode)->lli_fid;
3144 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3146 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3147 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process a revalidation RPC result.  -ENOENT on a non-regular,
 * non-directory inode is treated as a benign "already unlinked" case;
 * other errors are logged (quietly for -EACCES/-EIDRM) and propagated.
 */
3152 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3154 /* Already unlinked. Just update nlink and return success */
3155 if (rc == -ENOENT) {
3157 /* This path cannot be hit for regular files unless in
3158 * case of obscure races, so no need to to validate
3160 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3162 } else if (rc != 0) {
3163 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3164 "%s: revalidate FID "DFID" error: rc = %d\n",
3165 ll_get_fsname(inode->i_sb, NULL, 0),
3166 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes against the MDS for the lock
 * bits in @ibits.  Two paths: with OBD_CONNECT_ATTRFID an intent
 * getattr/lookup by FID is used; otherwise, if no matching MD lock is
 * cached, a plain md_getattr refreshes the inode.
 * NOTE(review): listing is line-sampled; several error GOTOs and the
 * RETURN are elided.
 */
3172 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3174 struct inode *inode = dentry->d_inode;
3175 struct ptlrpc_request *req = NULL;
3176 struct obd_export *exp;
3180 LASSERT(inode != NULL);
3182 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3183 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3185 exp = ll_i2mdexp(inode);
3187 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3188 * But under CMD case, it caused some lock issues, should be fixed
3189 * with new CMD ibits lock. See bug 12718 */
3190 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3191 struct lookup_intent oit = { .it_op = IT_GETATTR };
3192 struct md_op_data *op_data;
3194 if (ibits == MDS_INODELOCK_LOOKUP)
3195 oit.it_op = IT_LOOKUP;
3197 /* Call getattr by fid, so do not provide name at all. */
3198 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3199 dentry->d_inode, NULL, 0, 0,
3200 LUSTRE_OPC_ANY, NULL);
3201 if (IS_ERR(op_data))
3202 RETURN(PTR_ERR(op_data));
3204 rc = md_intent_lock(exp, op_data, &oit, &req,
3205 &ll_md_blocking_ast, 0);
3206 ll_finish_md_op_data(op_data);
3208 rc = ll_inode_revalidate_fini(inode, rc);
3212 rc = ll_revalidate_it_finish(req, &oit, dentry);
3214 ll_intent_release(&oit);
3218 /* Unlinked? Unhash dentry, so it is not picked up later by
3219 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3220 here to preserve get_cwd functionality on 2.6.
3222 if (!dentry->d_inode->i_nlink)
3223 d_lustre_invalidate(dentry, 0);
3225 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID support: only issue getattr when nothing is cached. */
3226 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3227 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3228 obd_valid valid = OBD_MD_FLGETATTR;
3229 struct md_op_data *op_data;
/* Regular files also need striping EA sized to the default mdsize. */
3232 if (S_ISREG(inode->i_mode)) {
3233 rc = ll_get_default_mdsize(sbi, &ealen);
3236 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3239 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3240 0, ealen, LUSTRE_OPC_ANY,
3242 if (IS_ERR(op_data))
3243 RETURN(PTR_ERR(op_data));
3245 op_data->op_valid = valid;
3246 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3247 * capa for this inode. Because we only keep capas of dirs
3249 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3250 ll_finish_md_op_data(op_data);
3252 rc = ll_inode_revalidate_fini(inode, rc);
3256 rc = ll_prep_inode(&inode, req, NULL, NULL);
3259 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge per-stripe attributes from all MDTs
 * (size, nlink, a/m/ctime) into the inode's ll_inode_info cache.
 */
3263 static int ll_merge_md_attr(struct inode *inode)
3265 struct cl_attr attr = { 0 };
3268 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3269 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3274 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3275 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3277 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3278 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3279 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MD attributes via __ll_inode_revalidate(),
 * then for regular files refresh the size from the OSTs with a glimpse
 * (skipped while an HSM restore is running, since the MDT already sent
 * the correct size and holds the layout lock).  Striped directories get
 * their merged attributes instead.
 */
3285 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3287 struct inode *inode = dentry->d_inode;
3291 rc = __ll_inode_revalidate(dentry, ibits);
3295 /* if object isn't regular file, don't validate size */
3296 if (!S_ISREG(inode->i_mode)) {
3297 if (S_ISDIR(inode->i_mode) &&
3298 ll_i2info(inode)->lli_lsm_md != NULL) {
3299 rc = ll_merge_md_attr(inode);
/* Propagate the cached LVB timestamps into the VFS inode. */
3304 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3305 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3306 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3308 /* In case of restore, the MDT has the right size and has
3309 * already send it back without granting the layout lock,
3310 * inode is up-to-date so glimpse is useless.
3311 * Also to glimpse we need the layout, in case of a running
3312 * restore the MDT holds the layout lock so the glimpse will
3313 * block up to the end of restore (getattr will block)
3315 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3316 rc = ll_glimpse_size(inode);
/*
 * getattr() inode operation: revalidate UPDATE|LOOKUP bits, then fill
 * *stat from the (now fresh) inode.  Striped directories report merged
 * nlink/size; 32-bit API clients get an ino squashed from the FID.
 */
3321 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3323 struct inode *inode = de->d_inode;
3324 struct ll_sb_info *sbi = ll_i2sbi(inode);
3325 struct ll_inode_info *lli = ll_i2info(inode);
3328 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3329 MDS_INODELOCK_LOOKUP);
3330 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3335 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace cannot hold a 64-bit ino; build one from the FID. */
3336 if (ll_need_32bit_api(sbi))
3337 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3339 stat->ino = inode->i_ino;
3340 stat->mode = inode->i_mode;
3341 stat->uid = inode->i_uid;
3342 stat->gid = inode->i_gid;
3343 stat->rdev = inode->i_rdev;
3344 stat->atime = inode->i_atime;
3345 stat->mtime = inode->i_mtime;
3346 stat->ctime = inode->i_ctime;
3347 stat->blksize = 1 << inode->i_blkbits;
3348 stat->blocks = inode->i_blocks;
/* Striped dirs: use the attributes merged across all MDT stripes. */
3350 if (S_ISDIR(inode->i_mode) &&
3351 ll_i2info(inode)->lli_lsm_md != NULL) {
3352 stat->nlink = lli->lli_stripe_dir_nlink;
3353 stat->size = lli->lli_stripe_dir_size;
3355 stat->nlink = inode->i_nlink;
3356 stat->size = i_size_read(inode);
/*
 * fiemap() inode operation: marshal the kernel fiemap_extent_info into a
 * Lustre ll_user_fiemap buffer, run ll_do_fiemap(), and copy the mapped
 * extents back.  NOTE(review): listing is line-sampled; the allocation-
 * failure return and the GOTO on ll_do_fiemap error are elided.
 */
3362 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3363 __u64 start, __u64 len)
3367 struct ll_user_fiemap *fiemap;
3368 unsigned int extent_count = fieinfo->fi_extents_max;
/* One header plus extent_count extent slots. */
3370 num_bytes = sizeof(*fiemap) + (extent_count *
3371 sizeof(struct ll_fiemap_extent));
3372 OBD_ALLOC_LARGE(fiemap, num_bytes);
3377 fiemap->fm_flags = fieinfo->fi_flags;
3378 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3379 fiemap->fm_start = start;
3380 fiemap->fm_length = len;
/* Seed only the first extent from userspace (FIEMAP continuation). */
3381 if (extent_count > 0)
3382 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3383 sizeof(struct ll_fiemap_extent));
3385 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3387 fieinfo->fi_flags = fiemap->fm_flags;
3388 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3389 if (extent_count > 0)
3390 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3391 fiemap->fm_mapped_extents *
3392 sizeof(struct ll_fiemap_extent));
3394 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode.
 * The duplicate's refcount is released by the VFS permission code.
 */
3398 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3400 struct ll_inode_info *lli = ll_i2info(inode);
3401 struct posix_acl *acl = NULL;
/* lli_lock guards lli_posix_acl against concurrent replacement. */
3404 spin_lock(&lli->lli_lock);
3405 /* VFS' acl_permission_check->check_acl will release the refcount */
3406 acl = posix_acl_dup(lli->lli_posix_acl);
3407 spin_unlock(&lli->lli_lock);
/*
 * ACL check callback for generic_permission() on kernels without the
 * 2-arg variant.  Signature differs per kernel (4-arg passes flags for
 * RCU walk, which this code cannot service and must decline).
 * Without CONFIG_FS_POSIX_ACL this compiles to a stub.
 */
3412 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3414 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3415 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3417 ll_check_acl(struct inode *inode, int mask)
3420 # ifdef CONFIG_FS_POSIX_ACL
3421 struct posix_acl *acl;
3425 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* RCU-walk cannot block to fetch the ACL; tell VFS to retry ref-walk. */
3426 if (flags & IPERM_FLAG_RCU)
3429 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3434 rc = posix_acl_permission(inode, acl, mask);
3435 posix_acl_release(acl);
3438 # else /* !CONFIG_FS_POSIX_ACL */
3440 # endif /* CONFIG_FS_POSIX_ACL */
3442 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission() inode operation (signature varies per kernel).  Steps:
 * revalidate the root inode on first touch, apply root-squash by
 * overriding the task credentials (fsuid/fsgid squashed, FS capabilities
 * lowered), then delegate to remote-perm check or generic_permission().
 * NOTE(review): listing is line-sampled; credential-failure handling and
 * the squash_id assignment are elided.
 */
3444 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3445 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3447 # ifdef HAVE_INODE_PERMISION_2ARGS
3448 int ll_inode_permission(struct inode *inode, int mask)
3450 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3455 struct ll_sb_info *sbi;
3456 struct root_squash_info *squash;
3457 struct cred *cred = NULL;
3458 const struct cred *old_cred = NULL;
3460 bool squash_id = false;
/* RCU path walk cannot block on RPCs; bail out to ref-walk. */
3463 #ifdef MAY_NOT_BLOCK
3464 if (mask & MAY_NOT_BLOCK)
3466 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3467 if (flags & IPERM_FLAG_RCU)
3471 /* as root inode are NOT getting validated in lookup operation,
3472 * need to do it before permission check. */
3474 if (inode == inode->i_sb->s_root->d_inode) {
3475 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3476 MDS_INODELOCK_LOOKUP);
3481 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3482 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3484 /* squash fsuid/fsgid if needed */
3485 sbi = ll_i2sbi(inode);
3486 squash = &sbi->ll_squash;
3487 if (unlikely(squash->rsi_uid != 0 &&
3488 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3489 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3493 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3494 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3495 squash->rsi_uid, squash->rsi_gid);
3497 /* update current process's credentials
3498 * and FS capability */
3499 cred = prepare_creds();
3503 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3504 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability from the squashed creds. */
3505 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3506 if ((1 << cap) & CFS_CAP_FS_MASK)
3507 cap_lower(cred->cap_effective, cap);
3509 old_cred = override_creds(cred);
3512 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3514 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3515 rc = lustre_check_remote_perm(inode, mask);
3517 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3519 /* restore current process's credentials and FS capability */
3521 revert_creds(old_cred);
/* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock handlers, so flock()
 * falls back to the kernel's local-only implementation.
 * NOTE(review): listing is line-sampled; .fsync/.flush initializers are
 * elided here. */
3529 struct file_operations ll_file_operations = {
3530 .read = ll_file_read,
3531 .aio_read = ll_file_aio_read,
3532 .write = ll_file_write,
3533 .aio_write = ll_file_aio_write,
3534 .unlocked_ioctl = ll_file_ioctl,
3535 .open = ll_file_open,
3536 .release = ll_file_release,
3537 .mmap = ll_file_mmap,
3538 .llseek = ll_file_seek,
3539 .splice_read = ll_file_splice_read,
/* -o flock variant: cluster-coherent flock/POSIX locks, both routed to
 * ll_file_flock() which enqueues LDLM_FLOCK locks on the MDT. */
3544 struct file_operations ll_file_operations_flock = {
3545 .read = ll_file_read,
3546 .aio_read = ll_file_aio_read,
3547 .write = ll_file_write,
3548 .aio_write = ll_file_aio_write,
3549 .unlocked_ioctl = ll_file_ioctl,
3550 .open = ll_file_open,
3551 .release = ll_file_release,
3552 .mmap = ll_file_mmap,
3553 .llseek = ll_file_seek,
3554 .splice_read = ll_file_splice_read,
3557 .flock = ll_file_flock,
3558 .lock = ll_file_flock
3561 /* These are for -o noflock - to return ENOSYS on flock calls */
3562 struct file_operations ll_file_operations_noflock = {
3563 .read = ll_file_read,
3564 .aio_read = ll_file_aio_read,
3565 .write = ll_file_write,
3566 .aio_write = ll_file_aio_write,
3567 .unlocked_ioctl = ll_file_ioctl,
3568 .open = ll_file_open,
3569 .release = ll_file_release,
3570 .mmap = ll_file_mmap,
3571 .llseek = ll_file_seek,
3572 .splice_read = ll_file_splice_read,
/* Both lock entry points reject the request via ll_file_noflock. */
3575 .flock = ll_file_noflock,
3576 .lock = ll_file_noflock
/* inode_operations for regular files; .get_acl is only wired up on
 * kernels that have the iop (HAVE_IOP_GET_ACL). */
3579 struct inode_operations ll_file_inode_operations = {
3580 .setattr = ll_setattr,
3581 .getattr = ll_getattr,
3582 .permission = ll_inode_permission,
3583 .setxattr = ll_setxattr,
3584 .getxattr = ll_getxattr,
3585 .listxattr = ll_listxattr,
3586 .removexattr = ll_removexattr,
3587 .fiemap = ll_fiemap,
3588 #ifdef HAVE_IOP_GET_ACL
3589 .get_acl = ll_get_acl,
3593 /* dynamic ioctl number support routins */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by an rw_semaphore (readers = ioctl
 * dispatch, writers = register/unregister). */
3594 static struct llioc_ctl_data {
3595 struct rw_semaphore ioc_sem;
3596 struct list_head ioc_head;
3598 __RWSEM_INITIALIZER(llioc.ioc_sem),
3599 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl command numbers it
 * serves.  iocd_cmd[0] is the pre-C99 trailing-array idiom; a flexible
 * array member (iocd_cmd[]) would be the modern spelling. */
3604 struct list_head iocd_list;
3605 unsigned int iocd_size;
3606 llioc_callback_t iocd_cb;
3607 unsigned int iocd_count;
3608 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler @cb for @count command numbers in
 * @cmd.  Returns an opaque cookie (the allocation) used later by
 * ll_iocontrol_unregister(), or NULL on bad args / allocation failure.
 */
3611 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3614 struct llioc_data *in_data = NULL;
3617 if (cb == NULL || cmd == NULL ||
3618 count > LLIOC_MAX_CMD || count < 0)
/* Header plus a trailing array of @count command numbers. */
3621 size = sizeof(*in_data) + count * sizeof(unsigned int);
3622 OBD_ALLOC(in_data, size);
3623 if (in_data == NULL)
3626 memset(in_data, 0, sizeof(*in_data));
3627 in_data->iocd_size = size;
3628 in_data->iocd_cb = cb;
3629 in_data->iocd_count = count;
3630 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the write lock so dispatchers never see a partial entry. */
3632 down_write(&llioc.ioc_sem);
3633 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3634 up_write(&llioc.ioc_sem);
/*
 * Remove and free the handler previously returned by
 * ll_iocontrol_register().  An unknown @magic only produces a warning.
 * NOTE(review): listing is line-sampled; the magic==tmp comparison and
 * the early return after freeing are elided.
 */
3639 void ll_iocontrol_unregister(void *magic)
3641 struct llioc_data *tmp;
3646 down_write(&llioc.ioc_sem);
3647 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Size is cached before the entry is freed below. */
3649 unsigned int size = tmp->iocd_size;
3651 list_del(&tmp->iocd_list);
3652 up_write(&llioc.ioc_sem);
3654 OBD_FREE(tmp, size);
3658 up_write(&llioc.ioc_sem);
3660 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3663 EXPORT_SYMBOL(ll_iocontrol_register);
3664 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an ioctl to the dynamically registered handlers.  Scans every
 * registered command list under the read lock; the first callback that
 * returns LLIOC_STOP terminates the scan and its rc is stored in *rcp.
 */
3666 static enum llioc_iter
3667 ll_iocontrol_call(struct inode *inode, struct file *file,
3668 unsigned int cmd, unsigned long arg, int *rcp)
3670 enum llioc_iter ret = LLIOC_CONT;
3671 struct llioc_data *data;
3672 int rc = -EINVAL, i;
3674 down_read(&llioc.ioc_sem);
3675 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3676 for (i = 0; i < data->iocd_count; i++) {
3677 if (cmd != data->iocd_cmd[i])
3680 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3684 if (ret == LLIOC_STOP)
3687 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack for @inode.
 * For OBJECT_CONF_SET the layout lock is made matchable only AFTER the
 * layout has been applied (otherwise a stale layout could be observed),
 * and the cached layout generation is updated from the new LSM.
 * NOTE(review): listing is line-sampled.
 */
3694 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3696 struct ll_inode_info *lli = ll_i2info(inode);
3697 struct cl_env_nest nest;
3702 if (lli->lli_clob == NULL)
3705 env = cl_env_nested_get(&nest);
3707 RETURN(PTR_ERR(env));
3709 result = cl_conf_set(env, lli->lli_clob, conf);
3710 cl_env_nested_put(&nest, env);
3712 if (conf->coc_opc == OBJECT_CONF_SET) {
3713 struct ldlm_lock *lock = conf->coc_lock;
3715 LASSERT(lock != NULL);
3716 LASSERT(ldlm_has_layout(lock));
3718 struct lustre_md *md = conf->u.coc_md;
3719 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3721 /* it can only be allowed to match after layout is
3722 * applied to inode otherwise false layout would be
3723 * seen. Applying layout shoud happen before dropping
3724 * the intent lock. */
3725 ldlm_lock_allow_match(lock);
3727 lli->lli_has_smd = lsm_has_objects(md->lsm);
3728 if (md->lsm != NULL)
3729 gen = md->lsm->lsm_layout_gen;
3732 DFID ": layout version change: %u -> %u\n",
3733 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3735 ll_layout_version_set(lli, gen);
3741 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/* If the layout lock's LVB is already populated this is a no-op.
 * Otherwise fetch the LOV EA via md_getxattr and install a private copy
 * as the lock's l_lvb_data (freeing any previous buffer) under the
 * resource lock.  NOTE(review): listing is line-sampled; rc checks after
 * the RPC and the final RETURN are elided. */
3742 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3745 struct ll_sb_info *sbi = ll_i2sbi(inode);
3746 struct obd_capa *oc;
3747 struct ptlrpc_request *req;
3748 struct mdt_body *body;
3755 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3756 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3757 lock->l_lvb_data, lock->l_lvb_len);
3759 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3762 /* if layout lock was granted right away, the layout is returned
3763 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3764 * blocked and then granted via completion ast, we have to fetch
3765 * layout here. Please note that we can't use the LVB buffer in
3766 * completion AST because it doesn't have a large enough buffer */
3767 oc = ll_mdscapa_get(inode);
3768 rc = ll_get_default_mdsize(sbi, &lmmsize);
3770 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3771 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3777 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3779 GOTO(out, rc = -EPROTO);
3781 lmmsize = body->mbo_eadatasize;
3782 if (lmmsize == 0) /* empty layout */
3785 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3787 GOTO(out, rc = -EFAULT);
/* Copy out of the reply buffer: the lock outlives the request. */
3789 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3790 if (lvbdata == NULL)
3791 GOTO(out, rc = -ENOMEM);
3793 memcpy(lvbdata, lmm, lmmsize);
3794 lock_res_and_lock(lock);
3795 if (lock->l_lvb_data != NULL)
3796 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3798 lock->l_lvb_data = lvbdata;
3799 lock->l_lvb_len = lmmsize;
3800 unlock_res_and_lock(lock);
3805 ptlrpc_req_finished(req);
3810 * Apply the layout to the inode. Layout lock is held and will be released
/* Takes the layout lock referenced by @lockh (mode @mode), fetches the
 * layout if the LVB is not ready, unpacks it into an LSM, applies it to
 * the inode via ll_layout_conf(OBJECT_CONF_SET), and stores the layout
 * generation in *gen.  The lock reference is always dropped before
 * return.  If applying fails with -EBUSY an OBJECT_CONF_WAIT pass waits
 * for in-flight IO before the caller retries.
 * NOTE(review): listing is line-sampled; GOTO labels and retry plumbing
 * are elided. */
3813 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3814 struct inode *inode, __u32 *gen, bool reconf)
3816 struct ll_inode_info *lli = ll_i2info(inode);
3817 struct ll_sb_info *sbi = ll_i2sbi(inode);
3818 struct ldlm_lock *lock;
3819 struct lustre_md md = { NULL };
3820 struct cl_object_conf conf;
3823 bool wait_layout = false;
3826 LASSERT(lustre_handle_is_used(lockh));
3828 lock = ldlm_handle2lock(lockh);
3829 LASSERT(lock != NULL);
3830 LASSERT(ldlm_has_layout(lock));
3832 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3833 PFID(&lli->lli_fid), inode, reconf);
3835 /* in case this is a caching lock and reinstate with new inode */
3836 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3838 lock_res_and_lock(lock);
3839 lvb_ready = ldlm_is_lvb_ready(lock);
3840 unlock_res_and_lock(lock);
3841 /* checking lvb_ready is racy but this is okay. The worst case is
3842 * that multi processes may configure the file on the same time. */
3844 if (lvb_ready || !reconf) {
3847 /* layout_gen must be valid if layout lock is not
3848 * cancelled and stripe has already set */
3849 *gen = ll_layout_version_get(lli);
3855 rc = ll_layout_fetch(inode, lock);
3859 /* for layout lock, lmm is returned in lock's lvb.
3860 * lvb_data is immutable if the lock is held so it's safe to access it
3861 * without res lock. See the description in ldlm_lock_decref_internal()
3862 * for the condition to free lvb_data of layout lock */
3863 if (lock->l_lvb_data != NULL) {
3864 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3865 lock->l_lvb_data, lock->l_lvb_len);
3867 *gen = LL_LAYOUT_GEN_EMPTY;
3869 *gen = md.lsm->lsm_layout_gen;
3872 CERROR("%s: file "DFID" unpackmd error: %d\n",
3873 ll_get_fsname(inode->i_sb, NULL, 0),
3874 PFID(&lli->lli_fid), rc);
3880 /* set layout to file. Unlikely this will fail as old layout was
3881 * surely eliminated */
3882 memset(&conf, 0, sizeof conf);
3883 conf.coc_opc = OBJECT_CONF_SET;
3884 conf.coc_inode = inode;
3885 conf.coc_lock = lock;
3886 conf.u.coc_md = &md;
3887 rc = ll_layout_conf(inode, &conf);
3890 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3892 /* refresh layout failed, need to wait */
3893 wait_layout = rc == -EBUSY;
/* Drop the reference taken by ldlm_handle2lock() and the enqueue ref. */
3897 LDLM_LOCK_PUT(lock);
3898 ldlm_lock_decref(lockh, mode);
3900 /* wait for IO to complete if it's still being used. */
3902 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3903 ll_get_fsname(inode->i_sb, NULL, 0),
3904 PFID(&lli->lli_fid), inode);
3906 memset(&conf, 0, sizeof conf);
3907 conf.coc_opc = OBJECT_CONF_WAIT;
3908 conf.coc_inode = inode;
3909 rc = ll_layout_conf(inode, &conf);
3913 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3914 ll_get_fsname(inode->i_sb, NULL, 0),
3915 PFID(&lli->lli_fid), rc);
3921 * This function checks if there exists a LAYOUT lock on the client side,
3922 * or enqueues it if it doesn't have one in cache.
3924 * This function will not hold layout lock so it may be revoked any time after
3925 * this function returns. Any operations depend on layout should be redone
3928 * This function should be called before lov_io_init() to get an uptodate
3929 * layout version, the caller should save the version number and after IO
3930 * is finished, this function should be called again to verify that layout
3931 * is not changed during IO time.
/* NOTE(review): listing is line-sampled; the enqueue-retry loop and rc
 * propagation between lines are elided. */
3933 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3935 struct ll_inode_info *lli = ll_i2info(inode);
3936 struct ll_sb_info *sbi = ll_i2sbi(inode);
3937 struct md_op_data *op_data;
3938 struct lookup_intent it;
3939 struct lustre_handle lockh;
3941 struct ldlm_enqueue_info einfo = {
3942 .ei_type = LDLM_IBITS,
3944 .ei_cb_bl = &ll_md_blocking_ast,
3945 .ei_cb_cp = &ldlm_completion_ast,
/* Fast path: layout lock feature off, or generation already valid. */
3950 *gen = ll_layout_version_get(lli);
3951 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3955 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3956 LASSERT(S_ISREG(inode->i_mode));
3958 /* take layout lock mutex to enqueue layout lock exclusively. */
3959 mutex_lock(&lli->lli_layout_mutex);
3962 /* mostly layout lock is caching on the local side, so try to match
3963 * it before grabbing layout lock mutex. */
3964 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3965 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3966 if (mode != 0) { /* hit cached lock */
3967 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3971 mutex_unlock(&lli->lli_layout_mutex);
3975 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3976 0, 0, LUSTRE_OPC_ANY, NULL);
3977 if (IS_ERR(op_data)) {
3978 mutex_unlock(&lli->lli_layout_mutex);
3979 RETURN(PTR_ERR(op_data));
3982 /* have to enqueue one */
3983 memset(&it, 0, sizeof(it));
3984 it.it_op = IT_LAYOUT;
3985 lockh.cookie = 0ULL;
3987 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3988 ll_get_fsname(inode->i_sb, NULL, 0),
3989 PFID(&lli->lli_fid), inode);
3991 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* The intent's reply request is not needed once the lock is granted. */
3992 if (it.d.lustre.it_data != NULL)
3993 ptlrpc_req_finished(it.d.lustre.it_data);
3994 it.d.lustre.it_data = NULL;
3996 ll_finish_md_op_data(op_data);
/* Transfer lock ownership from the intent to this function. */
3998 mode = it.d.lustre.it_lock_mode;
3999 it.d.lustre.it_lock_mode = 0;
4000 ll_intent_drop_lock(&it);
4003 /* set lock data in case this is a new lock */
4004 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4005 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4009 mutex_unlock(&lli->lli_layout_mutex);
4015 * This function send a restore request to the MDT
4017 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4019 struct hsm_user_request *hur;
4023 len = sizeof(struct hsm_user_request) +
4024 sizeof(struct hsm_user_item);
4025 OBD_ALLOC(hur, len);
4029 hur->hur_request.hr_action = HUA_RESTORE;
4030 hur->hur_request.hr_archive_id = 0;
4031 hur->hur_request.hr_flags = 0;
4032 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4033 sizeof(hur->hur_user_item[0].hui_fid));
4034 hur->hur_user_item[0].hui_extent.offset = offset;
4035 hur->hur_user_item[0].hui_extent.length = length;
4036 hur->hur_request.hr_itemcount = 1;
4037 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,