4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
52 #include "cl_object.h"
/*
 * Forward declarations of helpers.  ll_lease_close() is called from
 * ll_md_close() ahead of its definition further down; ll_put_grouplock() is
 * called from ll_md_close() as well, and neither it nor ll_iocontrol_call()
 * is defined in this part of the file.
 * NOTE(review): the embedded listing numbers jump here, so parts of these
 * prototypes (storage class, return type, trailing parameters) are not
 * visible - confirm against the complete source.
 */
55 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
57 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
60 static enum llioc_iter
61 ll_iocontrol_call(struct inode *inode, struct file *file,
62 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a struct ll_file_data from ll_file_data_slab using GFP_NOFS and
 * reset its fd_write_failed flag.
 * NOTE(review): embedded lines 69-71 and 73-75 are absent from this listing;
 * the allocation-failure check and the return statement are presumably among
 * them - confirm against the complete source.
 */
64 static struct ll_file_data *ll_file_data_get(void)
66 struct ll_file_data *fd;
68 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
72 fd->fd_write_failed = false;
/*
 * Give @fd back to ll_file_data_slab; counterpart of ll_file_data_get().
 * NOTE(review): embedded lines 78-79 are not shown, so any guard that
 * precedes the free (for example a NULL check) cannot be seen here.
 */
77 static void ll_file_data_put(struct ll_file_data *fd)
80 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the current attributes of @inode into @op_data.
 *
 * Fields filled in: the inode fid (op_fid1), mode, atime/mtime/ctime, size
 * (read with i_size_read()), block count, inode flags converted with
 * ll_inode_to_ext_flags(), and the IO epoch cached in the ll_inode_info.
 * The open handle *@fh is copied into op_handle and a capability obtained
 * from ll_mdscapa_get() is stored in op_capa1.  When the inode carries
 * LLIF_DATA_MODIFIED, MDS_DATA_MODIFIED is added to op_bias.
 *
 * NOTE(review): embedded line 95, directly before the *fh dereference, is
 * missing from this listing; it may be a guard for fh == NULL - confirm.
 */
83 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
84 struct lustre_handle *fh)
86 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
87 op_data->op_attr.ia_mode = inode->i_mode;
88 op_data->op_attr.ia_atime = inode->i_atime;
89 op_data->op_attr.ia_mtime = inode->i_mtime;
90 op_data->op_attr.ia_ctime = inode->i_ctime;
91 op_data->op_attr.ia_size = i_size_read(inode);
92 op_data->op_attr_blocks = inode->i_blocks;
93 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
94 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
96 op_data->op_handle = *fh;
97 op_data->op_capa1 = ll_mdscapa_get(inode);
99 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
100 op_data->op_bias |= MDS_DATA_MODIFIED;
104 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Fill @op_data for a close of the open handle @och on @inode.
 *
 * The attribute mask always carries mode and the three timestamps (with
 * their *_SET variants).  Size and blocks are added when the MDS export does
 * not support Size-on-MDS or the inode is not a regular file; the other
 * visible path calls ll_ioepoch_close().  Finally the inode attributes and
 * the handle och->och_fh are packed with ll_pack_inode2opdata() and the
 * remaining op_data fields are prepared with ll_prep_md_op_data().
 *
 * NOTE(review): the statement controlled by the FMODE_WRITE test and the
 * branch keyword in front of ll_ioepoch_close() fall on listing lines that
 * are not shown (117-118, 121, 123-124) - confirm the exact control flow.
 */
107 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
108 struct obd_client_handle *och)
112 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
113 ATTR_MTIME | ATTR_MTIME_SET |
114 ATTR_CTIME | ATTR_CTIME_SET;
116 if (!(och->och_flags & FMODE_WRITE))
119 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
120 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
122 ll_ioepoch_close(inode, op_data, &och, 0);
125 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
126 ll_prep_md_op_data(op_data, inode, NULL, NULL,
127 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send a close for the MDS open handle @och of @inode through @md_exp.
 *
 * Visible steps, in order:
 *  - report an invalid MDC connection handle with CERROR;
 *  - allocate op_data (on failure jump to "out" with -ENOMEM; the existing
 *    XXX remark says the open handle and request are leaked on that path);
 *  - fill op_data with ll_prepare_close();
 *  - if @data_version is non-NULL, turn the close into an HSM release:
 *    set MDS_HSM_RELEASE, pass the data version and the lease handle, and
 *    request size and blocks;
 *  - remember whether the epoch is being closed (MF_EPOCH_CLOSE) and call
 *    md_close();
 *  - on the Size-on-MDS path, assert the epoch was closed and call
 *    ll_som_update(); failures of either step are logged;
 *  - after a successful close that carried MDS_DATA_MODIFIED, clear
 *    LLIF_DATA_MODIFIED under lli_lock;
 *  - call ll_objects_destroy() and log a failure;
 *  - for a successful HSM release, inspect the reply body for
 *    OBD_MD_FLRELEASED;
 *  - release op_data, then either queue DONE_WRITING (SOM export, epoch not
 *    closed, regular file opened for write) or clear the open replay data
 *    and poison the handle cookie with DEAD_HANDLE_MAGIC;
 *  - drop the close request if one was allocated.
 *
 * A comment in ll_lease_open() states that @och is freed by this function;
 * the freeing statement itself is not visible in this listing.
 *
 * NOTE(review): several conditions, labels and assignments sit on listing
 * lines that were dropped (for example 140-145, 147-148, 151-153, 155, 165,
 * 168, 174, 179-181, 195-197, 199, 210-212, 214-216, 220, 224-225, 227-229),
 * so the branch structure described above is partly inferred from ordering -
 * confirm against the complete source.
 */
131 static int ll_close_inode_openhandle(struct obd_export *md_exp,
133 struct obd_client_handle *och,
134 const __u64 *data_version)
136 struct obd_export *exp = ll_i2mdexp(inode);
137 struct md_op_data *op_data;
138 struct ptlrpc_request *req = NULL;
139 struct obd_device *obd = class_exp2obd(exp);
146 * XXX: in case of LMV, is this correct to access
149 CERROR("Invalid MDC connection handle "LPX64"\n",
150 ll_i2mdexp(inode)->exp_handle.h_cookie);
154 OBD_ALLOC_PTR(op_data);
156 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
158 ll_prepare_close(inode, op_data, och);
159 if (data_version != NULL) {
160 /* Pass in data_version implies release. */
161 op_data->op_bias |= MDS_HSM_RELEASE;
162 op_data->op_data_version = *data_version;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
166 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
167 rc = md_close(md_exp, op_data, och->och_mod, &req);
169 /* This close must have the epoch closed. */
170 LASSERT(epoch_close);
171 /* MDS has instructed us to obtain Size-on-MDS attribute from
172 * OSTs and send setattr to back to MDS. */
173 rc = ll_som_update(inode, op_data);
175 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
176 " failed: rc = %d\n",
177 ll_i2mdexp(inode)->exp_obd->obd_name,
178 PFID(ll_inode2fid(inode)), rc);
182 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
183 ll_i2mdexp(inode)->exp_obd->obd_name,
184 PFID(ll_inode2fid(inode)), rc);
187 /* DATA_MODIFIED flag was successfully sent on close, cancel data
188 * modification flag. */
189 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
190 struct ll_inode_info *lli = ll_i2info(inode);
192 spin_lock(&lli->lli_lock);
193 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
194 spin_unlock(&lli->lli_lock);
198 rc = ll_objects_destroy(req, inode);
200 CERROR("%s: inode "DFID
201 " ll_objects destroy: rc = %d\n",
202 ll_i2mdexp(inode)->exp_obd->obd_name,
203 PFID(ll_inode2fid(inode)), rc);
206 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
207 struct mdt_body *body;
208 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
209 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
213 ll_finish_md_op_data(op_data);
217 if (exp_connect_som(exp) && !epoch_close &&
218 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
219 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
221 md_clear_open_replay_data(md_exp, och);
222 /* Free @och if it is not waiting for DONE_WRITING. */
223 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
226 if (req) /* This is close request */
227 ptlrpc_req_finished(req);
/*
 * Close the per-inode MDS open handle that matches @fmode, unless it is
 * still in use.
 *
 * The handle slot and its use counter are chosen by mode: write first, then
 * exec, otherwise read (asserted).  Under lli_och_mutex the use counter is
 * examined; when it is still positive the mutex is dropped without closing.
 * Otherwise the mutex is dropped and ll_close_inode_openhandle() is called
 * on the MD export of the superblock.
 *
 * NOTE(review): the lines that take the handle out of *och_p, the return
 * statements and the trailing arguments of the close call are not part of
 * this listing (embedded lines 255, 257-261, 263-264, 266, 268-272) -
 * confirm against the complete source.
 */
231 int ll_md_real_close(struct inode *inode, fmode_t fmode)
233 struct ll_inode_info *lli = ll_i2info(inode);
234 struct obd_client_handle **och_p;
235 struct obd_client_handle *och;
240 if (fmode & FMODE_WRITE) {
241 och_p = &lli->lli_mds_write_och;
242 och_usecount = &lli->lli_open_fd_write_count;
243 } else if (fmode & FMODE_EXEC) {
244 och_p = &lli->lli_mds_exec_och;
245 och_usecount = &lli->lli_open_fd_exec_count;
247 LASSERT(fmode & FMODE_READ);
248 och_p = &lli->lli_mds_read_och;
249 och_usecount = &lli->lli_open_fd_read_count;
252 mutex_lock(&lli->lli_och_mutex);
253 if (*och_usecount > 0) {
254 /* There are still users of this handle, so skip
256 mutex_unlock(&lli->lli_och_mutex);
262 mutex_unlock(&lli->lli_och_mutex);
265 /* There might be a race and this handle may already
267 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: release everything attached to the
 * ll_file_data of @file and decide whether the MDS must be contacted.
 *
 * Visible steps:
 *  - drop a group lock if LL_FILE_GROUP_LOCKED is set on the descriptor;
 *  - if a lease handle is attached, close it with ll_lease_close(), log the
 *    outcome and detach it;
 *  - if the descriptor owns a private open handle (fd_och), close it with
 *    ll_close_inode_openhandle();
 *  - when the dentry has an inode, decrement the open counter for the
 *    descriptor's mode (write, exec, otherwise read) under lli_och_mutex,
 *    then test with md_lock_match() (LDLM_FL_TEST_LOCK, OPEN inodebits) for
 *    a cached OPEN lock; ll_md_real_close() is called when none matches;
 *  - otherwise log "released file has negative dentry";
 *  - clear the file's private pointer, free the ll_file_data and call
 *    ll_capa_close().
 *
 * The inner declaration "struct inode *inode = file->f_dentry->d_inode"
 * shadows the @inode parameter inside the dentry branch.
 *
 * NOTE(review): the lockmode selection per open mode, the jump that follows
 * the fd_och close and the final return are on listing lines that are not
 * shown - confirm against the complete source.
 */
274 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
277 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
278 struct ll_inode_info *lli = ll_i2info(inode);
282 /* clear group lock, if present */
283 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
284 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
286 if (fd->fd_lease_och != NULL) {
289 /* Usually the lease is not released when the
290 * application crashed, we need to release here. */
291 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
292 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
293 PFID(&lli->lli_fid), rc, lease_broken);
295 fd->fd_lease_och = NULL;
298 if (fd->fd_och != NULL) {
299 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
304 /* Let's see if we have good enough OPEN lock on the file and if
305 we can skip talking to MDS */
306 if (file->f_dentry->d_inode) { /* Can this ever be false? */
308 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
309 struct lustre_handle lockh;
310 struct inode *inode = file->f_dentry->d_inode;
311 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
313 mutex_lock(&lli->lli_och_mutex);
314 if (fd->fd_omode & FMODE_WRITE) {
316 LASSERT(lli->lli_open_fd_write_count);
317 lli->lli_open_fd_write_count--;
318 } else if (fd->fd_omode & FMODE_EXEC) {
320 LASSERT(lli->lli_open_fd_exec_count);
321 lli->lli_open_fd_exec_count--;
324 LASSERT(lli->lli_open_fd_read_count);
325 lli->lli_open_fd_read_count--;
327 mutex_unlock(&lli->lli_och_mutex);
329 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
330 LDLM_IBITS, &policy, lockmode,
332 rc = ll_md_real_close(file->f_dentry->d_inode,
336 CERROR("released file has negative dentry: file = %p, "
337 "dentry = %p, name = %s\n",
338 file, file->f_dentry, file->f_dentry->d_name.name);
342 LUSTRE_FPRIVATE(file) = NULL;
343 ll_file_data_put(fd);
344 ll_capa_close(inode);
349 /* While this returns an error code, its caller fput() does not, so we need
350 * to make every effort to clean up all of our state here. Also, applications
351 * rarely check close errors and even if an error is returned they will not
352 * re-try the close call.
/*
 * release() handler for a Lustre file.
 *
 * Visible steps:
 *  - trace the call;
 *  - with CONFIG_FS_POSIX_ACL, for a remote client releasing the filesystem
 *    root: if the descriptor has LL_FILE_RMTACL set, clear it and drop the
 *    per-pid entries from ll_rct and ll_et;
 *  - count the release in the stats unless @file is the root dentry;
 *  - for a directory whose lli_opendir_key is this descriptor, call
 *    ll_deauthorize_statahead();
 *  - for the root dentry only detach and free the ll_file_data;
 *  - for non-directories, read-and-clear the asynchronous error of the
 *    cl_object (if any) and reset lli_async_rc;
 *  - close on the MDS side with ll_md_close(), and dump the debug log when
 *    the OBD_FAIL_PTLRPC_DUMP_LOG fail point fires.
 *
 * NOTE(review): the #endif, the early return of the root-dentry branch, the
 * use made of the value returned by lov_read_and_clear_async_rc() and the
 * final return are on listing lines that are not shown - confirm.
 */
354 int ll_file_release(struct inode *inode, struct file *file)
356 struct ll_file_data *fd;
357 struct ll_sb_info *sbi = ll_i2sbi(inode);
358 struct ll_inode_info *lli = ll_i2info(inode);
362 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
363 PFID(ll_inode2fid(inode)), inode);
365 #ifdef CONFIG_FS_POSIX_ACL
366 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
367 inode == inode->i_sb->s_root->d_inode) {
368 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
371 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
372 fd->fd_flags &= ~LL_FILE_RMTACL;
373 rct_del(&sbi->ll_rct, current_pid());
374 et_search_free(&sbi->ll_et, current_pid());
379 if (inode->i_sb->s_root != file->f_dentry)
380 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
381 fd = LUSTRE_FPRIVATE(file);
384 /* The last ref on @file may not belong to the owner pid of statahead,
385 * because parent and child process can share the same file handle. */
386 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
387 ll_deauthorize_statahead(inode, fd);
389 if (inode->i_sb->s_root == file->f_dentry) {
390 LUSTRE_FPRIVATE(file) = NULL;
391 ll_file_data_put(fd);
395 if (!S_ISDIR(inode->i_mode)) {
396 if (lli->lli_clob != NULL)
397 lov_read_and_clear_async_rc(lli->lli_clob);
398 lli->lli_async_rc = 0;
401 rc = ll_md_close(sbi->ll_md_exp, inode, file);
403 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
404 libcfs_debug_dumplog();
/*
 * Issue an open intent for an already instantiated dentry.
 *
 * @lmm / @lmmsize are attached to the request as op_data payload and @itp
 * is the intent to execute; the caller must have set MDS_OPEN_BY_FID in it
 * (asserted).  The file name is packed only when the server lacks
 * OBD_CONNECT_OPEN_BY_FID and the name passes lu_name_is_valid_2().
 *
 * After md_intent_lock() the visible code:
 *  - releases the open handle via ll_release_openhandle() on one early-exit
 *    path (the existing comment mentions -ESTALE);
 *  - returns -ENOENT for a negative lookup;
 *  - logs and bails out on an enqueue or open error;
 *  - refreshes the inode with ll_prep_inode() and, if a lock was granted,
 *    attaches lock data with ll_set_lock_data();
 *  - finishes the request and drops the intent lock.
 *
 * NOTE(review): the tests guarding the early-exit path, the "out" label and
 * the return statement are on listing lines that are not shown - confirm.
 */
409 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
410 struct lookup_intent *itp)
412 struct dentry *de = file->f_dentry;
413 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
414 struct dentry *parent = de->d_parent;
415 const char *name = NULL;
417 struct md_op_data *op_data;
418 struct ptlrpc_request *req = NULL;
422 LASSERT(parent != NULL);
423 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
425 /* if server supports open-by-fid, or file name is invalid, don't pack
426 * name in open request */
427 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
428 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
429 name = de->d_name.name;
430 len = de->d_name.len;
433 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
434 name, len, 0, LUSTRE_OPC_ANY, NULL);
436 RETURN(PTR_ERR(op_data));
437 op_data->op_data = lmm;
438 op_data->op_data_size = lmmsize;
440 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
441 &ll_md_blocking_ast, 0);
442 ll_finish_md_op_data(op_data);
444 /* reason for keep own exit path - don`t flood log
445 * with messages with -ESTALE errors.
447 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
448 it_open_error(DISP_OPEN_OPEN, itp))
450 ll_release_openhandle(de, itp);
454 if (it_disposition(itp, DISP_LOOKUP_NEG))
455 GOTO(out, rc = -ENOENT);
457 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
458 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
459 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
463 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
464 if (!rc && itp->d.lustre.it_lock_mode)
465 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
468 ptlrpc_req_finished(req);
469 ll_intent_drop_lock(itp);
475 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
476 * not believe attributes if a few ioepoch holders exist. Attributes for
477 * previous ioepoch if new one is opened are also skipped by MDS.
/*
 * Record @ioepoch in @lli when it is non-zero and differs from the epoch
 * currently cached there, and log the change at D_INODE level.  A zero or
 * unchanged epoch leaves @lli untouched.
 */
479 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
481 if (ioepoch && lli->lli_ioepoch != ioepoch) {
482 lli->lli_ioepoch = ioepoch;
483 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
484 ioepoch, PFID(&lli->lli_fid));
/*
 * Initialise the client open handle @och from the reply of the open intent
 * @it.
 *
 * The file handle and fid come from the MDT body of the reply request
 * stored in the intent; the lease handle cookie is the intent's lock
 * handle, the magic is OBD_CLIENT_HANDLE_MAGIC and the flags are the
 * intent's it_flags.
 *
 * Returns whatever md_set_open_replay_data() returns for @md_exp.
 */
488 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
489 struct obd_client_handle *och)
491 struct ptlrpc_request *req = it->d.lustre.it_data;
492 struct mdt_body *body;
494 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
495 och->och_fh = body->mbo_handle;
496 och->och_fid = body->mbo_fid1;
497 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
498 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
499 och->och_flags = it->it_flags;
501 return md_set_open_replay_data(md_exp, och, it);
/*
 * Client-side part of an open: bind the descriptor @fd to @file.
 *
 * The file must not already have private data (asserted).  In the branch
 * that handles an open handle, @och is filled from the intent reply with
 * ll_och_fill() and the IO epoch from the reply body is recorded with
 * ll_ioepoch_open().  Then @fd becomes the file's private data, readahead
 * state is initialised, the open mode (read/write/exec bits of it_flags) is
 * stored in fd_omode, and the descriptor's lock and lccs list are set up.
 *
 * NOTE(review): the condition that opens the och branch (callers pass NULL
 * for @och when an existing handle is reused), the error check after
 * ll_och_fill() and the return statement are on listing lines that are not
 * shown - confirm against the complete source.
 */
504 static int ll_local_open(struct file *file, struct lookup_intent *it,
505 struct ll_file_data *fd, struct obd_client_handle *och)
507 struct inode *inode = file->f_dentry->d_inode;
508 struct ll_inode_info *lli = ll_i2info(inode);
511 LASSERT(!LUSTRE_FPRIVATE(file));
516 struct ptlrpc_request *req = it->d.lustre.it_data;
517 struct mdt_body *body;
520 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
524 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
525 ll_ioepoch_open(lli, body->mbo_ioepoch);
528 LUSTRE_FPRIVATE(file) = fd;
529 ll_readahead_init(inode, &fd->fd_ras);
530 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
532 /* ll_cl_context initialize */
533 rwlock_init(&fd->fd_lock);
534 INIT_LIST_HEAD(&fd->fd_lccs);
539 /* Open a file, and (for the very first open) create objects on the OSTs at
540 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
541 * creation or open until ll_lov_setstripe() ioctl is called.
543 * If we already have the stripe MD locally then we don't request it in
544 * md_open(), by passing a lmm_size = 0.
546 * It is up to the application to ensure no other processes open this file
547 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
548 * used. We might be able to avoid races of that sort by getting lli_open_sem
549 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
550 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * open() handler for a Lustre file.
 *
 * Outline of the visible code:
 *  - take the intent, if any, that was parked in file->private_data and
 *    clear that pointer (ll_local_open() asserts it is empty);
 *  - allocate the per-descriptor ll_file_data (-ENOMEM on failure);
 *  - for directories call ll_authorize_statahead(); the filesystem root
 *    only gets the descriptor attached;
 *  - without a usable intent, build one in the local "oit": derive the
 *    access mode from f_flags, force FMODE_WRITE for O_TRUNC, add
 *    MDS_OPEN_OWNEROVERRIDE for read/write opens, strip O_EXCL and add
 *    IT_CREAT when O_CREAT is set;
 *  - choose the per-inode MDS handle slot and use counter by mode (write,
 *    exec, otherwise read) and take lli_och_mutex;
 *  - if a handle already exists: a redundant open reply is released with
 *    ll_release_openhandle() (an open error ends the call), and the
 *    descriptor is bound with ll_local_open(..., NULL);
 *  - otherwise the use counter must be zero; if the intent carries no
 *    disposition the mutex is dropped and a fresh open is requested with
 *    ll_intent_file_open() using MDS_OPEN_LOCK | MDS_OPEN_BY_FID; a new
 *    obd_client_handle is allocated, the open error is checked, and the
 *    descriptor is bound with ll_local_open(..., *och_p);
 *  - after dropping the mutex, non-regular files are finished; for regular
 *    files without striping that were opened with delayed object creation
 *    or without write access, object creation is reported as delayed;
 *    otherwise the delay-create flag is cleared from f_flags;
 *  - cleanup: free and reset *och_p, unlock the mutex, undo statahead
 *    authorisation, free the descriptor, count the open in the stats, and
 *    drop the request reference held by the intent (DISP_ENQ_OPEN_REF).
 *
 * NOTE(review): many control lines (labels such as out_och_free and
 * out_openerr, "else" keywords, error tests, the restart jump after
 * ll_intent_file_open(), counter increments and returns) are on listing
 * lines that are not shown, so which cleanup statements run on success and
 * which only on error cannot be determined from this view - confirm against
 * the complete source.
 */
552 int ll_file_open(struct inode *inode, struct file *file)
554 struct ll_inode_info *lli = ll_i2info(inode);
555 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
556 .it_flags = file->f_flags };
557 struct obd_client_handle **och_p = NULL;
558 __u64 *och_usecount = NULL;
559 struct ll_file_data *fd;
563 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
564 PFID(ll_inode2fid(inode)), inode, file->f_flags);
566 it = file->private_data; /* XXX: compat macro */
567 file->private_data = NULL; /* prevent ll_local_open assertion */
569 fd = ll_file_data_get();
571 GOTO(out_openerr, rc = -ENOMEM);
574 if (S_ISDIR(inode->i_mode))
575 ll_authorize_statahead(inode, fd);
577 if (inode->i_sb->s_root == file->f_dentry) {
578 LUSTRE_FPRIVATE(file) = fd;
582 if (!it || !it->d.lustre.it_disposition) {
583 /* Convert f_flags into access mode. We cannot use file->f_mode,
584 * because everything but O_ACCMODE mask was stripped from
586 if ((oit.it_flags + 1) & O_ACCMODE)
588 if (file->f_flags & O_TRUNC)
589 oit.it_flags |= FMODE_WRITE;
591 /* kernel only call f_op->open in dentry_open. filp_open calls
592 * dentry_open after call to open_namei that checks permissions.
593 * Only nfsd_open call dentry_open directly without checking
594 * permissions and because of that this code below is safe. */
595 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
596 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
598 /* We do not want O_EXCL here, presumably we opened the file
599 * already? XXX - NFS implications? */
600 oit.it_flags &= ~O_EXCL;
602 /* bug20584, if "it_flags" contains O_CREAT, the file will be
603 * created if necessary, then "IT_CREAT" should be set to keep
604 * consistent with it */
605 if (oit.it_flags & O_CREAT)
606 oit.it_op |= IT_CREAT;
612 /* Let's see if we have file open on MDS already. */
613 if (it->it_flags & FMODE_WRITE) {
614 och_p = &lli->lli_mds_write_och;
615 och_usecount = &lli->lli_open_fd_write_count;
616 } else if (it->it_flags & FMODE_EXEC) {
617 och_p = &lli->lli_mds_exec_och;
618 och_usecount = &lli->lli_open_fd_exec_count;
620 och_p = &lli->lli_mds_read_och;
621 och_usecount = &lli->lli_open_fd_read_count;
624 mutex_lock(&lli->lli_och_mutex);
625 if (*och_p) { /* Open handle is present */
626 if (it_disposition(it, DISP_OPEN_OPEN)) {
627 /* Well, there's extra open request that we do not need,
628 let's close it somehow. This will decref request. */
629 rc = it_open_error(DISP_OPEN_OPEN, it);
631 mutex_unlock(&lli->lli_och_mutex);
632 GOTO(out_openerr, rc);
635 ll_release_openhandle(file->f_dentry, it);
639 rc = ll_local_open(file, it, fd, NULL);
642 mutex_unlock(&lli->lli_och_mutex);
643 GOTO(out_openerr, rc);
646 LASSERT(*och_usecount == 0);
647 if (!it->d.lustre.it_disposition) {
648 /* We cannot just request lock handle now, new ELC code
649 means that one of other OPEN locks for this file
650 could be cancelled, and since blocking ast handler
651 would attempt to grab och_mutex as well, that would
652 result in a deadlock */
653 mutex_unlock(&lli->lli_och_mutex);
655 * Normally called under two situations:
657 * 2. A race/condition on MDS resulting in no open
658 * handle to be returned from LOOKUP|OPEN request,
659 * for example if the target entry was a symlink.
661 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
663 * Always specify MDS_OPEN_BY_FID because we don't want
664 * to get file with different fid.
666 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
667 rc = ll_intent_file_open(file, NULL, 0, it);
669 GOTO(out_openerr, rc);
673 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
675 GOTO(out_och_free, rc = -ENOMEM);
679 /* md_intent_lock() didn't get a request ref if there was an
680 * open error, so don't do cleanup on the request here
682 /* XXX (green): Should not we bail out on any error here, not
683 * just open error? */
684 rc = it_open_error(DISP_OPEN_OPEN, it);
686 GOTO(out_och_free, rc);
688 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
689 "inode %p: disposition %x, status %d\n", inode,
690 it_disposition(it, ~0), it->d.lustre.it_status);
692 rc = ll_local_open(file, it, fd, *och_p);
694 GOTO(out_och_free, rc);
696 mutex_unlock(&lli->lli_och_mutex);
699 /* Must do this outside lli_och_mutex lock to prevent deadlock where
700 different kind of OPEN lock for this same inode gets cancelled
701 by ldlm_cancel_lru */
702 if (!S_ISREG(inode->i_mode))
703 GOTO(out_och_free, rc);
707 if (!lli->lli_has_smd &&
708 (cl_is_lov_delay_create(file->f_flags) ||
709 (file->f_mode & FMODE_WRITE) == 0)) {
710 CDEBUG(D_INODE, "object creation was delayed\n");
711 GOTO(out_och_free, rc);
713 cl_lov_delay_create_clear(&file->f_flags);
714 GOTO(out_och_free, rc);
718 if (och_p && *och_p) {
719 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
720 *och_p = NULL; /* OBD_FREE writes some magic there */
723 mutex_unlock(&lli->lli_och_mutex);
726 if (lli->lli_opendir_key == fd)
727 ll_deauthorize_statahead(inode, fd);
729 ll_file_data_put(fd);
731 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
734 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
735 ptlrpc_req_finished(it->d.lustre.it_data);
736 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking/cancel callback installed on lease locks by ll_lease_open().
 *
 * For LDLM_CB_BLOCKING the lock is converted to a handle and cancelled
 * asynchronously (LCF_ASYNC); the result of the cancel is logged at D_INODE
 * level.  LDLM_CB_CANCELING is a recognised case whose body, if any, is not
 * visible here.
 *
 * NOTE(review): the switch header, the test around the CDEBUG, the break
 * statements and the return are on listing lines that are not shown.
 */
742 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
743 struct ldlm_lock_desc *desc, void *data, int flag)
746 struct lustre_handle lockh;
750 case LDLM_CB_BLOCKING:
751 ldlm_lock2handle(lock, &lockh);
752 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
754 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
758 case LDLM_CB_CANCELING:
766 * Acquire a lease and open the file.
/*
 * Open @inode with a lease and return the resulting open handle, or an
 * ERR_PTR() on failure.
 *
 * @fmode must be exactly FMODE_WRITE or FMODE_READ (-EINVAL otherwise).
 * In the part that deals with @file, the requested mode must be contained
 * in file->f_mode and the file must not be open for exec (-EPERM).  Under
 * lli_och_mutex the code refuses a descriptor that already has a lease
 * handle, and if the descriptor has no private handle it looks at the
 * shared per-inode write or read handle and its use counter (the case of a
 * single opener is singled out).  The file handle of fd->fd_och is then
 * remembered in old_handle and sent as op_handle so the MDT can tell that
 * the new open comes from the same owner.
 *
 * The open itself is an IT_OPEN intent with
 * MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE, enqueued with
 * ll_md_blocking_lease_ast and LDLM_FL_NO_LRU | LDLM_FL_EXCL (reasons are
 * given in the comment at the call).  Afterwards:
 *  - a negative lookup yields -ENOENT and an open error is propagated;
 *  - the handle is filled with ll_och_fill();
 *  - a reply without DISP_OPEN_LEASE yields -EOPNOTSUPP;
 *  - a reply without an OPEN-bit lock yields -EPROTO;
 *  - on success the intent is released.
 * The error tail cancels the open lock if one was granted, closes the
 * handle with ll_close_inode_openhandle() and releases the intent.
 *
 * NOTE(review): the last parameter of the signature, the allocation of
 * @och, the labels, the statements that move the shared handle into the
 * descriptor and the return statements are on listing lines that are not
 * shown - confirm against the complete source.
 */
768 static struct obd_client_handle *
769 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
772 struct lookup_intent it = { .it_op = IT_OPEN };
773 struct ll_sb_info *sbi = ll_i2sbi(inode);
774 struct md_op_data *op_data;
775 struct ptlrpc_request *req = NULL;
776 struct lustre_handle old_handle = { 0 };
777 struct obd_client_handle *och = NULL;
782 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
783 RETURN(ERR_PTR(-EINVAL));
786 struct ll_inode_info *lli = ll_i2info(inode);
787 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
788 struct obd_client_handle **och_p;
791 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
792 RETURN(ERR_PTR(-EPERM));
794 /* Get the openhandle of the file */
796 mutex_lock(&lli->lli_och_mutex);
797 if (fd->fd_lease_och != NULL) {
798 mutex_unlock(&lli->lli_och_mutex);
802 if (fd->fd_och == NULL) {
803 if (file->f_mode & FMODE_WRITE) {
804 LASSERT(lli->lli_mds_write_och != NULL);
805 och_p = &lli->lli_mds_write_och;
806 och_usecount = &lli->lli_open_fd_write_count;
808 LASSERT(lli->lli_mds_read_och != NULL);
809 och_p = &lli->lli_mds_read_och;
810 och_usecount = &lli->lli_open_fd_read_count;
812 if (*och_usecount == 1) {
819 mutex_unlock(&lli->lli_och_mutex);
820 if (rc < 0) /* more than 1 opener */
823 LASSERT(fd->fd_och != NULL);
824 old_handle = fd->fd_och->och_fh;
829 RETURN(ERR_PTR(-ENOMEM));
831 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
832 LUSTRE_OPC_ANY, NULL);
834 GOTO(out, rc = PTR_ERR(op_data));
836 /* To tell the MDT this openhandle is from the same owner */
837 op_data->op_handle = old_handle;
839 it.it_flags = fmode | open_flags;
840 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
841 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
842 &ll_md_blocking_lease_ast,
843 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
844 * it can be cancelled which may mislead applications that the lease is
846 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
847 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
848 * doesn't deal with openhandle, so normal openhandle will be leaked. */
849 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
850 ll_finish_md_op_data(op_data);
851 ptlrpc_req_finished(req);
853 GOTO(out_release_it, rc);
855 if (it_disposition(&it, DISP_LOOKUP_NEG))
856 GOTO(out_release_it, rc = -ENOENT);
858 rc = it_open_error(DISP_OPEN_OPEN, &it);
860 GOTO(out_release_it, rc);
862 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
863 ll_och_fill(sbi->ll_md_exp, &it, och);
865 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
866 GOTO(out_close, rc = -EOPNOTSUPP);
868 /* already get lease, handle lease lock */
869 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
870 if (it.d.lustre.it_lock_mode == 0 ||
871 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
872 /* open lock must return for lease */
873 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
874 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
875 it.d.lustre.it_lock_bits);
876 GOTO(out_close, rc = -EPROTO);
879 ll_intent_release(&it);
883 /* Cancel open lock */
884 if (it.d.lustre.it_lock_mode != 0) {
885 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
886 it.d.lustre.it_lock_mode);
887 it.d.lustre.it_lock_mode = 0;
888 och->och_lease_handle.cookie = 0ULL;
890 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
892 CERROR("%s: error closing file "DFID": %d\n",
893 ll_get_fsname(inode->i_sb, NULL, 0),
894 PFID(&ll_i2info(inode)->lli_fid), rc2);
895 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
897 ll_intent_release(&it);
905 * Release lease and close the file.
906 * It also checks whether the lease has ever been broken.
/*
 * Give up the lease held through @och on @inode and close the handle.
 *
 * "cancelled" starts out true and is replaced by ldlm_is_cancel() read
 * under the resource lock once the lease lock has been looked up from
 * och_lease_handle.  The state is logged, the lease lock is cancelled with
 * ldlm_cli_cancel(), the caller is told through *@lease_broken (when that
 * pointer is non-NULL) whether the lease had been cancelled, and the open
 * handle is closed with ll_close_inode_openhandle().
 *
 * NOTE(review): the last parameter of the signature, the NULL test on the
 * looked-up lock, the condition in front of ldlm_cli_cancel(), the final
 * argument of the close call and the return are on listing lines that are
 * not shown - confirm against the complete source.
 */
908 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
911 struct ldlm_lock *lock;
912 bool cancelled = true;
916 lock = ldlm_handle2lock(&och->och_lease_handle);
918 lock_res_and_lock(lock);
919 cancelled = ldlm_is_cancel(lock);
920 unlock_res_and_lock(lock);
924 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
925 PFID(&ll_i2info(inode)->lli_fid), cancelled);
928 ldlm_cli_cancel(&och->och_lease_handle, 0);
929 if (lease_broken != NULL)
930 *lease_broken = cancelled;
932 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
937 /* Fills the obdo with the attributes for the lsm */
/*
 * Fetch object attributes for the striping @lsm through @exp into @obdo.
 *
 * @lsm must not be NULL (asserted).  The request identifies the object by
 * lsm->lsm_oi, marks it as a regular file, carries @ioepoch and asks for
 * id, type, size, blocks, block size, the three timestamps, group, epoch
 * and data version.  @capa is passed along as oi_capa.  When @dv_flags
 * contains LL_DV_WR_FLUSH or LL_DV_RD_FLUSH the request also carries
 * OBD_FL_SRVLOCK, plus OBD_FL_FLUSH for LL_DV_WR_FLUSH.
 *
 * The getattr is issued asynchronously on a freshly prepared request set
 * and waited for; the set is destroyed afterwards.  The o_valid mask is
 * then narrowed to blocks, block size, timestamps, size, data version and
 * flags, and for LL_DV_WR_FLUSH the reply is checked for OBD_FL_FLUSH.
 *
 * NOTE(review): the line that ties oinfo.oi_oa to @obdo, the error tests
 * and return values (including the one taken when the flush flag is
 * missing from the reply) are on listing lines that are not shown.
 */
938 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
939 struct obd_capa *capa, struct obdo *obdo,
940 __u64 ioepoch, int dv_flags)
942 struct ptlrpc_request_set *set;
943 struct obd_info oinfo = { { { 0 } } };
948 LASSERT(lsm != NULL);
952 oinfo.oi_oa->o_oi = lsm->lsm_oi;
953 oinfo.oi_oa->o_mode = S_IFREG;
954 oinfo.oi_oa->o_ioepoch = ioepoch;
955 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
956 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
957 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
958 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
959 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
960 OBD_MD_FLDATAVERSION;
961 oinfo.oi_capa = capa;
962 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
963 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
964 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
965 if (dv_flags & LL_DV_WR_FLUSH)
966 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
969 set = ptlrpc_prep_set();
971 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
974 rc = obd_getattr_async(exp, &oinfo, set);
976 rc = ptlrpc_set_wait(set);
977 ptlrpc_set_destroy(set);
980 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
981 OBD_MD_FLATIME | OBD_MD_FLMTIME |
982 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
983 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
984 if (dv_flags & LL_DV_WR_FLUSH &&
985 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
986 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
993 * Performs the getattr on the inode and updates its fields.
994 * If @sync != 0, perform the getattr under the server-side lock.
/*
 * Fetch the object attributes of @inode into @obdo and refresh the inode
 * from them.
 *
 * A capability is obtained with ll_mdscapa_get() and the striping with
 * ccc_inode_lsm_get().  ll_lsm_getattr() is called on the data export with
 * @ioepoch, using LL_DV_RD_FLUSH when @sync is non-zero and no flush flag
 * otherwise.  The visible follow-up refreshes the inode with
 * obdo_refresh_inode() and logs object id, size, blocks and block size; the
 * striping reference is dropped with ccc_inode_lsm_put().
 *
 * NOTE(review): the test that guards the refresh (presumably rc == 0), the
 * release of the capability and the return are on listing lines that are
 * not shown - confirm against the complete source.
 */
996 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
997 __u64 ioepoch, int sync)
999 struct obd_capa *capa = ll_mdscapa_get(inode);
1000 struct lov_stripe_md *lsm;
1004 lsm = ccc_inode_lsm_get(inode);
1005 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1006 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1009 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
1011 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1012 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1013 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1014 (unsigned long long)inode->i_blocks,
1015 1UL << inode->i_blkbits);
1017 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge the attributes cached from the MDS with those reported by the
 * cl_object of @inode, and store the result in the inode.
 *
 * Everything between ll_inode_size_lock() and ll_inode_size_unlock() runs
 * with the inode size lock held:
 *  - the inode timestamps are first reset to the values kept in
 *    lli->lli_lvb;
 *  - a local lvb is seeded with the inode's size, blocks and timestamps;
 *  - the object attributes are read with cl_object_attr_get() under the
 *    object attribute lock;
 *  - each timestamp in the local lvb is raised to the object's value when
 *    the object's is later;
 *  - size and block count are taken from the object attributes, and the
 *    merged timestamps are written back to the inode.
 *
 * NOTE(review): the declaration of the local lvb, the test on the result of
 * cl_object_attr_get() that guards the merge, and the return are on
 * listing lines that are not shown - confirm against the complete source.
 */
1021 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1023 struct ll_inode_info *lli = ll_i2info(inode);
1024 struct cl_object *obj = lli->lli_clob;
1025 struct cl_attr *attr = ccc_env_thread_attr(env);
1031 ll_inode_size_lock(inode);
1032 /* merge timestamps the most recently obtained from mds with
1033 timestamps obtained from osts */
1034 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1035 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1036 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1038 lvb.lvb_size = i_size_read(inode);
1039 lvb.lvb_blocks = inode->i_blocks;
1040 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1041 lvb.lvb_atime = LTIME_S(inode->i_atime);
1042 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1044 cl_object_attr_lock(obj);
1045 rc = cl_object_attr_get(env, obj, attr);
1046 cl_object_attr_unlock(obj);
1049 if (lvb.lvb_atime < attr->cat_atime)
1050 lvb.lvb_atime = attr->cat_atime;
1051 if (lvb.lvb_ctime < attr->cat_ctime)
1052 lvb.lvb_ctime = attr->cat_ctime;
1053 if (lvb.lvb_mtime < attr->cat_mtime)
1054 lvb.lvb_mtime = attr->cat_mtime;
1056 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1057 PFID(&lli->lli_fid), attr->cat_size);
1058 cl_isize_write_nolock(inode, attr->cat_size);
1060 inode->i_blocks = attr->cat_blocks;
1062 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1063 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1064 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1066 ll_inode_size_unlock(inode);
/*
 * Query the attributes of the objects described by @lsm via the data export
 * of @sbi (no capability, epoch 0, no flush flags) and copy size, blocks,
 * mtime, atime and ctime from the reply into the caller's stat buffer "st".
 *
 * NOTE(review): the declaration of the "st" parameter, the test on rc that
 * guards the copy and the return are on listing lines that are not shown.
 */
1071 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1074 struct obdo obdo = { 0 };
1077 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1079 st->st_size = obdo.o_size;
1080 st->st_blocks = obdo.o_blocks;
1081 st->st_mtime = obdo.o_mtime;
1082 st->st_atime = obdo.o_atime;
1083 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether access-time updates are disabled for @file.
 *
 * The conditions examined, in order: O_NOATIME on the file, S_NOATIME on
 * the inode, IS_NOATIME(inode), a mount that is MNT_NOATIME or
 * MNT_READONLY, and - for directories only - MNT_NODIRATIME on the mount or
 * MS_NODIRATIME on the superblock.
 *
 * NOTE(review): the return statements that follow each test are on listing
 * lines that are not shown; presumably each match returns true and the
 * fall-through returns false - confirm against the complete source.
 */
1088 static bool file_is_noatime(const struct file *file)
1090 const struct vfsmount *mnt = file->f_path.mnt;
1091 const struct inode *inode = file->f_path.dentry->d_inode;
1093 /* Adapted from file_accessed() and touch_atime().*/
1094 if (file->f_flags & O_NOATIME)
1097 if (inode->i_flags & S_NOATIME)
1100 if (IS_NOATIME(inode))
1103 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1106 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1109 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialise the cl_io @io for a read or write on @file.
 *
 * The non-blocking flag follows O_NONBLOCK.  The write-specific fields
 * record O_APPEND and a sync flag that is set for O_SYNC or O_DIRECT (a
 * further operand of that expression is on a line not shown).  The io is
 * bound to the inode's cl_object.  Lock policy defaults to CILR_MAYBE; it
 * becomes CILR_NEVER with server-side locking disabled when
 * ll_file_nolock() says so, and CILR_MANDATORY for O_APPEND.  ci_noatime is
 * taken from file_is_noatime().
 *
 * NOTE(review): the test on @write that presumably guards the write-only
 * fields is on a listing line that is not shown - confirm.
 */
1115 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1117 struct inode *inode = file->f_dentry->d_inode;
1119 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1121 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1122 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1123 file->f_flags & O_DIRECT ||
1126 io->ci_obj = ll_i2info(inode)->lli_clob;
1127 io->ci_lockreq = CILR_MAYBE;
1128 if (ll_file_nolock(file)) {
1129 io->ci_lockreq = CILR_NEVER;
1130 io->ci_no_srvlock = 1;
1131 } else if (file->f_flags & O_APPEND) {
1132 io->ci_lockreq = CILR_MANDATORY;
1135 io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine: run one cl_io of type @iot on @file for @count
 * bytes starting at *@ppos, using the arguments in @args.
 *
 * Visible flow:
 *  - obtain the per-thread cl_io and initialise it with ll_io_init();
 *  - if cl_io_rw_init() returns 0, prepare a range lock (whole file for
 *    O_APPEND, otherwise [*ppos, *ppos + count - 1]) and copy the
 *    sub-type specific arguments: iovec, segment counts and iocb for the
 *    normal case, pipe and flags for splice; any other sub-type is logged
 *    as an error;
 *  - for normal writes on a descriptor without a group lock, take the range
 *    lock on lli_write_tree, then take lli_trunc_sem for reading;
 *  - run the io with cl_io_loop() between ll_cl_add() and ll_cl_remove(),
 *    then release lli_trunc_sem and the range lock;
 *  - if cl_io_rw_init() already handled the io, use io->ci_result;
 *  - when bytes were transferred, the result becomes io->ci_nob and *@ppos
 *    is advanced to the io's position;
 *  - after cl_io_fini(), an io that moved no data (result 0 or -ENODATA)
 *    and asks for a restart is logged and retried;
 *  - account read/write bytes in the stats; for writes fd_write_failed is
 *    cleared on the byte-accounting path and set on results other than
 *    -ERESTARTSYS.
 *
 * NOTE(review): the return type line, case labels, several tests (such as
 * the one selecting the range-unlock path and the result > 0 checks around
 * the stats), the restart jump and the return are on listing lines that
 * are not shown.  Also note that "*ppos + count - 1" is computed before
 * any visible check of @count - confirm how count == 0 is handled.
 */
1139 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1140 struct file *file, enum cl_io_type iot,
1141 loff_t *ppos, size_t count)
1143 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1144 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1147 struct range_lock range;
1150 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1151 file->f_dentry->d_name.name, iot, *ppos, count);
1154 io = ccc_env_thread_io(env);
1155 ll_io_init(io, file, iot == CIT_WRITE);
1157 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1158 struct vvp_io *vio = vvp_env_io(env);
1159 struct ccc_io *cio = ccc_env_io(env);
1160 bool range_locked = false;
1162 if (file->f_flags & O_APPEND)
1163 range_lock_init(&range, 0, LUSTRE_EOF);
1165 range_lock_init(&range, *ppos, *ppos + count - 1);
1166 cio->cui_fd = LUSTRE_FPRIVATE(file);
1167 vio->cui_io_subtype = args->via_io_subtype;
1169 switch (vio->cui_io_subtype) {
1171 cio->cui_iov = args->u.normal.via_iov;
1172 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1173 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1174 cio->cui_iocb = args->u.normal.via_iocb;
1175 if ((iot == CIT_WRITE) &&
1176 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1177 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1179 result = range_lock(&lli->lli_write_tree,
1184 range_locked = true;
1186 down_read(&lli->lli_trunc_sem);
1189 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1190 vio->u.splice.cui_flags = args->u.splice.via_flags;
1193 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1197 ll_cl_add(file, env, io);
1198 result = cl_io_loop(env, io);
1199 ll_cl_remove(file, env);
1201 if (args->via_io_subtype == IO_NORMAL)
1202 up_read(&lli->lli_trunc_sem);
1204 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1206 range_unlock(&lli->lli_write_tree, &range);
1209 /* cl_io_rw_init() handled IO */
1210 result = io->ci_result;
1213 if (io->ci_nob > 0) {
1214 result = io->ci_nob;
1215 *ppos = io->u.ci_wr.wr.crw_pos;
1219 cl_io_fini(env, io);
1220 /* If any bit been read/written (result != 0), we just return
1221 * short read/write instead of restart io. */
1222 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1223 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zu\n",
1224 iot == CIT_READ ? "read" : "write",
1225 file->f_dentry->d_name.name, *ppos, count);
1226 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1230 if (iot == CIT_READ) {
1232 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1233 LPROC_LL_READ_BYTES, result);
1234 } else if (iot == CIT_WRITE) {
1236 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1237 LPROC_LL_WRITE_BYTES, result);
1238 fd->fd_write_failed = false;
1239 } else if (result != -ERESTARTSYS) {
1240 fd->fd_write_failed = true;
1250 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Walk the first *nr_segs entries of @iov, checking each segment length and
 * the user-space accessibility of each segment.
 *
 * \param iov     array of user iovec segments
 * \param nr_segs in/out segment count; read as the loop bound
 * \param count   out parameter for the byte total
 *                (NOTE(review): the store is not in the visible lines)
 */
1252 static int ll_file_get_iov_count(const struct iovec *iov,
1253 unsigned long *nr_segs, size_t *count)
1258 for (seg = 0; seg < *nr_segs; seg++) {
1259 const struct iovec *iv = &iov[seg];
1262 * If any segment has a negative length, or the cumulative
1263 * length ever wraps negative then return -EINVAL.
/* OR-ing the running total with the length tests both sign bits at once. */
1266 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1268 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1273 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Vectored read entry point: validates the iovec array, fills the IO_NORMAL
 * arguments and runs ll_file_io_generic() with CIT_READ.
 *
 * \param iocb    kiocb carrying the file (ki_filp) and the position (ki_pos)
 * \param iov     user iovec array
 * \param nr_segs number of entries in @iov
 * \param pos     not referenced in the visible lines; &iocb->ki_pos is what
 *                gets passed down
 */
1280 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1281 unsigned long nr_segs, loff_t pos)
1284 struct vvp_io_args *args;
1290 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1294 env = cl_env_get(&refcheck);
1296 RETURN(PTR_ERR(env));
/* The argument block is obtained from the env via vvp_env_args(). */
1298 args = vvp_env_args(env, IO_NORMAL);
1299 args->u.normal.via_iov = (struct iovec *)iov;
1300 args->u.normal.via_nrsegs = nr_segs;
1301 args->u.normal.via_iocb = iocb;
1303 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1304 &iocb->ki_pos, count);
1305 cl_env_put(env, &refcheck);
/*
 * Synchronous read: wraps the user buffer in a single-segment iovec and a
 * sync kiocb, both taken from vvp_env_info(env), and calls
 * ll_file_aio_read().
 *
 * \param file  file to read from
 * \param buf   user destination buffer
 * \param count number of bytes requested
 */
1309 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1313 struct iovec *local_iov;
1314 struct kiocb *kiocb;
1319 env = cl_env_get(&refcheck);
1321 RETURN(PTR_ERR(env));
1323 local_iov = &vvp_env_info(env)->vti_local_iov;
1324 kiocb = &vvp_env_info(env)->vti_kiocb;
1325 local_iov->iov_base = (void __user *)buf;
1326 local_iov->iov_len = count;
1327 init_sync_kiocb(kiocb, file);
1328 kiocb->ki_pos = *ppos;
/* The remaining-bytes field is ki_left or ki_nbytes depending on the build. */
1329 #ifdef HAVE_KIOCB_KI_LEFT
1330 kiocb->ki_left = count;
1332 kiocb->ki_nbytes = count;
1335 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
/* Propagate the position reached by the kiocb back to the caller. */
1336 *ppos = kiocb->ki_pos;
1338 cl_env_put(env, &refcheck);
1343 * Write to a file (through the page cache).
/*
 * Vectored write entry point: validates the iovec array, fills the IO_NORMAL
 * arguments and runs ll_file_io_generic() with CIT_WRITE.
 *
 * \param iocb    kiocb carrying the file (ki_filp) and the position (ki_pos)
 * \param iov     user iovec array
 * \param nr_segs number of entries in @iov
 * \param pos     not referenced in the visible lines; &iocb->ki_pos is what
 *                gets passed down
 */
1346 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1347 unsigned long nr_segs, loff_t pos)
1350 struct vvp_io_args *args;
1356 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1360 env = cl_env_get(&refcheck);
1362 RETURN(PTR_ERR(env));
/* The argument block is obtained from the env via vvp_env_args(). */
1364 args = vvp_env_args(env, IO_NORMAL);
1365 args->u.normal.via_iov = (struct iovec *)iov;
1366 args->u.normal.via_nrsegs = nr_segs;
1367 args->u.normal.via_iocb = iocb;
1369 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1370 &iocb->ki_pos, count);
1371 cl_env_put(env, &refcheck);
/*
 * Synchronous write: wraps the user buffer in a single-segment iovec and a
 * sync kiocb, both taken from vvp_env_info(env), and calls
 * ll_file_aio_write().
 *
 * \param file  file to write to
 * \param buf   user source buffer
 * \param count number of bytes to write
 * \param ppos  in/out file position
 */
1375 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1376 size_t count, loff_t *ppos)
1379 struct iovec *local_iov;
1380 struct kiocb *kiocb;
1385 env = cl_env_get(&refcheck);
1387 RETURN(PTR_ERR(env));
1389 local_iov = &vvp_env_info(env)->vti_local_iov;
1390 kiocb = &vvp_env_info(env)->vti_kiocb;
/* The const qualifier of @buf is cast away to fit iov_base. */
1391 local_iov->iov_base = (void __user *)buf;
1392 local_iov->iov_len = count;
1393 init_sync_kiocb(kiocb, file);
1394 kiocb->ki_pos = *ppos;
/* The remaining-bytes field is ki_left or ki_nbytes depending on the build. */
1395 #ifdef HAVE_KIOCB_KI_LEFT
1396 kiocb->ki_left = count;
1398 kiocb->ki_nbytes = count;
1401 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
/* Propagate the position reached by the kiocb back to the caller. */
1402 *ppos = kiocb->ki_pos;
1404 cl_env_put(env, &refcheck);
1409 * Send file content (through pagecache) somewhere with helper
/*
 * Splice-read entry point: fills the IO_SPLICE arguments with the target pipe
 * and flags, then runs ll_file_io_generic() with CIT_READ.
 *
 * \param in_file file to read from
 * \param ppos    in/out file position, passed straight through
 * \param pipe    destination pipe
 * \param count   number of bytes requested
 */
1411 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1412 struct pipe_inode_info *pipe, size_t count,
1416 struct vvp_io_args *args;
1421 env = cl_env_get(&refcheck);
1423 RETURN(PTR_ERR(env));
1425 args = vvp_env_args(env, IO_SPLICE);
1426 args->u.splice.via_pipe = pipe;
1427 args->u.splice.via_flags = flags;
1429 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1430 cl_env_put(env, &refcheck);
/*
 * Set the striping of @inode by issuing an open-by-FID intent that carries
 * the user layout @lum, then closing the resulting open handle.
 *
 * \param inode inode to set the layout on
 * \param file  open file referring to @inode
 * \param flags open flags placed in the intent
 * \param lum   layout description sent with the intent
 *
 * The -EEXIST path below is the one taken when a layout is already present.
 */
1434 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1435 __u64 flags, struct lov_user_md *lum,
1438 struct lov_stripe_md *lsm = NULL;
1439 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* An existing layout is released again and reported as -EEXIST. */
1443 lsm = ccc_inode_lsm_get(inode);
1445 ccc_inode_lsm_put(inode, lsm);
1446 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1447 PFID(ll_inode2fid(inode)));
1448 GOTO(out, rc = -EEXIST);
/* The intent open runs under the inode size lock. */
1451 ll_inode_size_lock(inode);
1452 oit.it_flags |= MDS_OPEN_BY_FID;
1453 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1455 GOTO(out_unlock, rc);
/* The status recorded in the intent becomes the return code. */
1456 rc = oit.d.lustre.it_status;
1458 GOTO(out_req_free, rc);
/* The open handle obtained through the intent is closed right away. */
1460 ll_release_openhandle(file->f_dentry, &oit);
1463 ll_inode_size_unlock(inode);
1464 ll_intent_release(&oit);
1465 ccc_inode_lsm_put(inode, lsm);
1467 cl_lov_delay_create_clear(&file->f_flags);
/* Drops the request stored in the intent (reached via out_req_free). */
1470 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the striping EA of @filename (looked up relative to @inode) from the
 * MDS with md_getattr_name() and hand it back, byte-swapped to host order
 * when needed.
 *
 * \param inode    inode used as the base of the name lookup
 * \param filename name passed to md_getattr_name()
 * \param lmmp     out: pointer to the layout inside the reply
 *                 (NOTE(review): the store is not in the visible lines)
 * \param lmm_size out: size of the layout, set from mbo_eadatasize
 * \param request  out: the request owning the reply buffer
 *                 (NOTE(review): the store is not in the visible lines)
 */
1474 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1475 struct lov_mds_md **lmmp, int *lmm_size,
1476 struct ptlrpc_request **request)
1478 struct ll_sb_info *sbi = ll_i2sbi(inode);
1479 struct mdt_body *body;
1480 struct lov_mds_md *lmm = NULL;
1481 struct ptlrpc_request *req = NULL;
1482 struct md_op_data *op_data;
/* The reply buffer is sized with the default MD size of this mount. */
1485 rc = ll_get_default_mdsize(sbi, &lmmsize);
1489 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1490 strlen(filename), lmmsize,
1491 LUSTRE_OPC_ANY, NULL);
1492 if (IS_ERR(op_data))
1493 RETURN(PTR_ERR(op_data));
/* Ask for both file and directory striping EAs. */
1495 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1496 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1497 ll_finish_md_op_data(op_data);
1499 CDEBUG(D_INFO, "md_getattr_name failed "
1500 "on %s: rc %d\n", filename, rc);
1504 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1505 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1507 lmmsize = body->mbo_eadatasize;
/* A reply without a striping EA is reported as -ENODATA. */
1509 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1511 GOTO(out, rc = -ENODATA);
1514 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1515 LASSERT(lmm != NULL);
/* Only the V1 and V3 layout magics (in little-endian form) are accepted. */
1517 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1518 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1519 GOTO(out, rc = -EPROTO);
1523 * This is coming from the MDS, so is probably in
1524 * little endian. We convert it to host endian before
1525 * passing it to userspace.
1527 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1530 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1531 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1534 /* if this function is called for a directory, we must
1535 * avoid swabbing lsm objects that do not exist */
1536 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1537 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1538 if (S_ISREG(body->mbo_mode))
1539 lustre_swab_lov_user_md_objects(
1540 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1542 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1543 lustre_swab_lov_user_md_v3(
1544 (struct lov_user_md_v3 *)lmm);
1545 if (S_ISREG(body->mbo_mode))
1546 lustre_swab_lov_user_md_objects(
1547 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1554 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA helper: copies a lov_user_md followed by one
 * lov_user_ost_data entry from user space and applies it through
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS set.
 *
 * \param inode inode to set the layout on
 * \param file  open file referring to @inode
 *
 * The caller must hold CFS_CAP_SYS_ADMIN.
 */
1559 static int ll_lov_setea(struct inode *inode, struct file *file,
1562 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1563 struct lov_user_md *lump;
/* Fixed size: the header plus exactly one object entry. */
1564 int lum_size = sizeof(struct lov_user_md) +
1565 sizeof(struct lov_user_ost_data);
1569 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1572 OBD_ALLOC_LARGE(lump, lum_size);
/* The kernel copy is freed again when the user copy faults. */
1576 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1577 OBD_FREE_LARGE(lump, lum_size);
1581 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1583 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE helper: copies the user layout into a kernel buffer,
 * applies it with ll_lov_setstripe_ea_info() and then reports the resulting
 * layout back through the LL_IOC_LOV_GETSTRIPE iocontrol.
 *
 * \param inode inode to set the layout on
 * \param file  open file referring to @inode
 */
1587 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1590 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1591 struct lov_user_md *klum;
1593 __u64 flags = FMODE_WRITE;
/* ll_copy_user_md() produces the kernel copy klum that is freed at the end. */
1596 rc = ll_copy_user_md(lum, &klum);
1601 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1603 struct lov_stripe_md *lsm;
/* Zero the user-visible stripe count before the layout is read back. */
1606 put_user(0, &lum->lmm_stripe_count);
1608 ll_layout_refresh(inode, &gen);
1609 lsm = ccc_inode_lsm_get(inode);
1610 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1612 ccc_inode_lsm_put(inode, lsm);
1615 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_LOV_GETSTRIPE helper: takes a reference on the inode stripe
 * metadata and lets the data export copy the layout to the user buffer.
 *
 * \param inode inode whose layout is queried
 * \param arg   user-space address of the destination buffer
 */
1619 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1621 struct lov_stripe_md *lsm;
1625 lsm = ccc_inode_lsm_get(inode);
1627 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1628 lsm, (void __user *)arg);
/* Drop the reference taken by ccc_inode_lsm_get() above. */
1629 ccc_inode_lsm_put(inode, lsm);
/*
 * Acquire a group lock with group id @arg on behalf of the open file.
 *
 * \param inode inode to lock
 * \param file  open file; its private data records the lock
 * \param arg   group id
 *
 * The descriptor state is checked under lli_lock, the spinlock is dropped
 * around cl_get_grouplock(), and the state is checked again afterwards so
 * that a concurrent winner is detected.
 */
1634 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1636 struct ll_inode_info *lli = ll_i2info(inode);
1637 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1638 struct ccc_grouplock grouplock;
1643 CWARN("group id for group lock must not be 0\n");
/* Not supported on files for which ll_file_nolock() is true. */
1647 if (ll_file_nolock(file))
1648 RETURN(-EOPNOTSUPP);
1650 spin_lock(&lli->lli_lock);
1651 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1652 CWARN("group lock already existed with gid %lu\n",
1653 fd->fd_grouplock.cg_gid);
1654 spin_unlock(&lli->lli_lock);
1657 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1658 spin_unlock(&lli->lli_lock);
/* O_NONBLOCK on the file is forwarded as the non-blocking request. */
1660 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1661 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the spinlock; the freshly taken lock is dropped on a race. */
1665 spin_lock(&lli->lli_lock);
1666 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1667 spin_unlock(&lli->lli_lock);
1668 CERROR("another thread just won the race\n");
1669 cl_put_grouplock(&grouplock);
/* Publish the lock in the file private data. */
1673 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1674 fd->fd_grouplock = grouplock;
1675 spin_unlock(&lli->lli_lock);
1677 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock recorded in the file private data.
 *
 * \param inode inode the lock was taken on
 * \param file  open file holding the lock
 * \param arg   group id; must match the id of the held lock
 *
 * The lock is detached from the descriptor under lli_lock and released with
 * cl_put_grouplock() after the spinlock has been dropped.
 */
1681 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1683 struct ll_inode_info *lli = ll_i2info(inode);
1684 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1685 struct ccc_grouplock grouplock;
1688 spin_lock(&lli->lli_lock);
1689 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1690 spin_unlock(&lli->lli_lock);
1691 CWARN("no group lock held\n");
1694 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1696 if (fd->fd_grouplock.cg_gid != arg) {
1697 CWARN("group lock %lu doesn't match current id %lu\n",
1698 arg, fd->fd_grouplock.cg_gid);
1699 spin_unlock(&lli->lli_lock);
/* Take a private copy, then wipe the descriptor state and clear the flag. */
1703 grouplock = fd->fd_grouplock;
1704 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1705 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1706 spin_unlock(&lli->lli_lock);
1708 cl_put_grouplock(&grouplock);
1709 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1714 * Close inode open handle
1716 * \param dentry [in] dentry which contains the inode
1717 * \param it [in,out] intent which contains open info and result
1720 * \retval <0 failure
/*
 * Close the MDS open handle described by the intent @it, using a temporary
 * obd_client_handle filled from the intent.
 */
1722 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1724 struct inode *inode = dentry->d_inode;
1725 struct obd_client_handle *och;
1731 /* Root ? Do nothing. */
1732 if (dentry->d_inode->i_sb->s_root == dentry)
1735 /* No open handle to close? Move away */
1736 if (!it_disposition(it, DISP_OPEN_OPEN))
1739 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1741 OBD_ALLOC(och, sizeof(*och));
1743 GOTO(out, rc = -ENOMEM);
/* Populate the handle from the intent, then close it on the MD export. */
1745 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1747 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1750 /* this one is in place of ll_file_open */
1751 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1752 ptlrpc_req_finished(it->d.lustre.it_data);
1753 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1759 * Get size for inode for which FIEMAP mapping is requested.
1760 * Make the FIEMAP get_info call and returns the result.
/*
 * Run a FIEMAP request for @inode through obd_get_info() on the data export.
 *
 * \param inode  inode to map
 * \param fiemap kernel copy of the request; also receives the result
 *
 * The value length handed to obd_get_info() is initialised from num_bytes.
 */
1762 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1765 struct obd_export *exp = ll_i2dtexp(inode);
1766 struct lov_stripe_md *lsm = NULL;
1767 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1768 __u32 vallen = num_bytes;
1772 /* Checks for fiemap flags */
1773 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1774 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1778 /* Check for FIEMAP_FLAG_SYNC */
1779 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1780 rc = filemap_fdatawrite(inode->i_mapping);
1785 lsm = ccc_inode_lsm_get(inode);
1789 /* If the stripe_count > 1 and the application does not understand
1790 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1792 if (lsm->lsm_stripe_count > 1 &&
1793 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1794 GOTO(out, rc = -EOPNOTSUPP);
1796 fm_key.oa.o_oi = lsm->lsm_oi;
1797 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1799 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1800 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1801 /* If filesize is 0, then there would be no objects for mapping */
1802 if (fm_key.oa.o_size == 0) {
1803 fiemap->fm_mapped_extents = 0;
/* The request header is copied into the key sent with the get_info call. */
1807 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1809 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1812 CERROR("obd_get_info failed: rc = %d\n", rc);
1815 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH helper: copies the getinfo_fid2path request from user
 * space, forwards it to the MD export and copies the answer back.
 *
 * \param inode inode used to locate the MD export and the mount flags
 * \param arg   user pointer to a getinfo_fid2path followed by the path buffer
 *
 * The permission check below requires CFS_CAP_DAC_READ_SEARCH unless the
 * mount has LL_SBI_USER_FID2PATH set.
 */
1819 int ll_fid2path(struct inode *inode, void __user *arg)
1821 struct obd_export *exp = ll_i2mdexp(inode);
1822 const struct getinfo_fid2path __user *gfin = arg;
1824 struct getinfo_fid2path *gfout;
1830 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1831 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1834 /* Only need to get the buflen */
1835 if (get_user(pathlen, &gfin->gf_pathlen))
/* Path buffers longer than PATH_MAX are rejected. */
1838 if (pathlen > PATH_MAX)
/* The kernel buffer holds the fixed header plus the path area. */
1841 outsize = sizeof(*gfout) + pathlen;
1842 OBD_ALLOC(gfout, outsize);
1846 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1847 GOTO(gf_free, rc = -EFAULT);
1849 /* Call mdc_iocontrol */
1850 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1854 if (copy_to_user(arg, gfout, outsize))
1858 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: sizes and allocates a kernel fiemap buffer from
 * the user-supplied extent count, runs ll_do_fiemap() and copies the header
 * plus the mapped extents back to user space.
 *
 * \param inode inode to map
 * \param arg   user-space address of the struct ll_user_fiemap
 */
1862 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1864 struct ll_user_fiemap *fiemap_s;
1865 size_t num_bytes, ret_bytes;
1866 unsigned int extent_count;
1869 /* Get the extent count so we can calculate the size of
1870 * required fiemap buffer */
1871 if (get_user(extent_count,
1872 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1876 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
/* Buffer size: fixed header plus extent_count extent records. */
1878 num_bytes = sizeof(*fiemap_s) + (extent_count *
1879 sizeof(struct ll_fiemap_extent));
1881 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1882 if (fiemap_s == NULL)
1885 /* get the fiemap value */
1886 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1888 GOTO(error, rc = -EFAULT);
1890 /* If fm_extent_count is non-zero, read the first extent since
1891 * it is used to calculate end_offset and device from previous
1894 if (copy_from_user(&fiemap_s->fm_extents[0],
1895 (char __user *)arg + sizeof(*fiemap_s),
1896 sizeof(struct ll_fiemap_extent)))
1897 GOTO(error, rc = -EFAULT);
1900 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1904 ret_bytes = sizeof(struct ll_user_fiemap);
1906 if (extent_count != 0)
1907 ret_bytes += (fiemap_s->fm_mapped_extents *
1908 sizeof(struct ll_fiemap_extent));
1910 if (copy_to_user((void __user *)arg, fiemap_s, ret_bytes))
1914 OBD_FREE_LARGE(fiemap_s, num_bytes);
1919 * Read the data_version for inode.
1921 * This value is computed using stripe object version on OST.
1922 * Version is computed using server side locking.
1924 * @param sync if do sync on the OST side;
1926 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1927 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1929 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1931 struct lov_stripe_md *lsm = NULL;
1932 struct ll_sb_info *sbi = ll_i2sbi(inode);
1933 struct obdo *obdo = NULL;
1937 /* If no stripe, we consider version is 0. */
/*
 * Overview: the data version is read from the attributes that
 * ll_lsm_getattr() fetches through the data export; @flags is forwarded to
 * that call unchanged and the result is stored in *data_version.
 */
1938 lsm = ccc_inode_lsm_get(inode);
1939 if (!lsm_has_objects(lsm)) {
1941 CDEBUG(D_INODE, "No object for inode\n");
/* Temporary obdo that receives the attributes. */
1945 OBD_ALLOC_PTR(obdo);
1947 GOTO(out, rc = -ENOMEM);
1949 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
/* The version is taken only from a reply that flags it as valid. */
1951 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1954 *data_version = obdo->o_data_version;
1960 ccc_inode_lsm_put(inode, lsm);
1965 * Trigger a HSM release request for the provided inode.
/*
 * Release @inode for HSM: takes a write lease opened with MDS_OPEN_RELEASE,
 * samples the data version with LL_DV_WR_FLUSH, merges the LVB into the
 * inode and closes the open handle on the MD export.
 */
1967 int ll_hsm_release(struct inode *inode)
1969 struct cl_env_nest nest;
1971 struct obd_client_handle *och = NULL;
1972 __u64 data_version = 0;
1976 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1977 ll_get_fsname(inode->i_sb, NULL, 0),
1978 PFID(&ll_i2info(inode)->lli_fid));
/* No file is passed to the lease open, only the inode. */
1980 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1982 GOTO(out, rc = PTR_ERR(och));
1984 /* Grab latest data_version and [am]time values */
1985 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
/* A nested environment is borrowed just for the LVB merge. */
1989 env = cl_env_nested_get(&nest);
1991 GOTO(out, rc = PTR_ERR(env));
1993 ll_merge_lvb(env, inode);
1994 cl_env_nested_put(&nest, env);
1996 /* Release the file.
1997 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1998 * we still need it to pack l_remote_handle to MDT. */
1999 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2005 if (och != NULL && !IS_ERR(och)) /* close the file */
2006 ll_lease_close(och, inode, NULL);
/*
 * Working state of ll_swap_layouts(); that function allocates it with
 * OBD_ALLOC_PTR() instead of placing it on the stack.
 */
2011 struct ll_swap_stack {
/* saved mtime/atime attributes of the first and of the second inode */
2012 struct iattr ia1, ia2;
/* the two inodes taking part in the swap */
2014 struct inode *inode1, *inode2;
/* whether the data version of inode1 / inode2 is verified before the swap */
2015 bool check_dv1, check_dv2;
/*
 * Swap the layouts of two regular files through the MD export.
 *
 * \param file1 first open file
 * \param file2 second open file
 * \param lsl   user request: flags and expected data versions
 *
 * Steps visible below: permission and same-superblock checks, ordering of
 * the two inodes by FID, optional group locks on both files, optional data
 * version verification, the LL_IOC_LOV_SWAP_LAYOUTS iocontrol, and optional
 * restoration of mtime/atime on both files.
 */
2018 static int ll_swap_layouts(struct file *file1, struct file *file2,
2019 struct lustre_swap_layouts *lsl)
2021 struct mdc_swap_layouts msl;
2022 struct md_op_data *op_data;
2025 struct ll_swap_stack *llss = NULL;
2028 OBD_ALLOC_PTR(llss);
2032 llss->inode1 = file1->f_dentry->d_inode;
2033 llss->inode2 = file2->f_dentry->d_inode;
2035 if (!S_ISREG(llss->inode2->i_mode))
2036 GOTO(free, rc = -EINVAL);
/* Write permission is required on both inodes. */
2038 if (inode_permission(llss->inode1, MAY_WRITE) ||
2039 inode_permission(llss->inode2, MAY_WRITE))
2040 GOTO(free, rc = -EPERM);
/* Both files must live on the same superblock. */
2042 if (llss->inode2->i_sb != llss->inode1->i_sb)
2043 GOTO(free, rc = -EXDEV);
2045 /* we use 2 bool because it is easier to swap than 2 bits */
2046 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2047 llss->check_dv1 = true;
2049 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2050 llss->check_dv2 = true;
2052 /* we cannot use lsl->sl_dvX directly because we may swap them */
2053 llss->dv1 = lsl->sl_dv1;
2054 llss->dv2 = lsl->sl_dv2;
2056 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2057 if (rc == 0) /* same file, done! */
2060 if (rc < 0) { /* sequentialize it */
2061 swap(llss->inode1, llss->inode2);
2063 swap(llss->dv1, llss->dv2);
2064 swap(llss->check_dv1, llss->check_dv2);
2068 if (gid != 0) { /* application asks to flush dirty cache */
2069 rc = ll_get_grouplock(llss->inode1, file1, gid);
2073 rc = ll_get_grouplock(llss->inode2, file2, gid);
2075 ll_put_grouplock(llss->inode1, file1, gid);
2080 /* to be able to restore mtime and atime after swap
2081 * we need to first save them */
2083 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2084 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2085 llss->ia1.ia_atime = llss->inode1->i_atime;
2086 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2087 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2088 llss->ia2.ia_atime = llss->inode2->i_atime;
2089 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2092 /* ultimate check, before swapping the layouts we check if
2093 * dataversion has changed (if requested) */
2094 if (llss->check_dv1) {
2095 rc = ll_data_version(llss->inode1, &dv, 0);
2098 if (dv != llss->dv1)
2099 GOTO(putgl, rc = -EAGAIN);
2102 if (llss->check_dv2) {
2103 rc = ll_data_version(llss->inode2, &dv, 0);
2106 if (dv != llss->dv2)
2107 GOTO(putgl, rc = -EAGAIN);
2110 /* struct md_op_data is used to send the swap args to the mdt
2111 * only flags is missing, so we use struct mdc_swap_layouts
2112 * through the md_op_data->op_data */
2113 /* flags from user space have to be converted before they are sent to
2114 * server, no flag is sent today, they are only used on the client */
2117 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2118 0, LUSTRE_OPC_ANY, &msl);
2119 if (IS_ERR(op_data))
2120 GOTO(free, rc = PTR_ERR(op_data));
2122 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2123 sizeof(*op_data), op_data, NULL);
2124 ll_finish_md_op_data(op_data);
/* The group locks are dropped in the reverse order of acquisition. */
2128 ll_put_grouplock(llss->inode2, file2, gid);
2129 ll_put_grouplock(llss->inode1, file1, gid);
2132 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2136 /* clear useless flags */
2137 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2138 llss->ia1.ia_valid &= ~ATTR_MTIME;
2139 llss->ia2.ia_valid &= ~ATTR_MTIME;
2142 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2143 llss->ia1.ia_valid &= ~ATTR_ATIME;
2144 llss->ia2.ia_valid &= ~ATTR_ATIME;
2147 /* update time if requested */
/*
 * The saved attributes are applied crosswise (ia2 through file1, ia1 through
 * file2), each under the i_mutex of the inode1 / inode2 slot respectively.
 */
2149 if (llss->ia2.ia_valid != 0) {
2150 mutex_lock(&llss->inode1->i_mutex);
2151 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2152 mutex_unlock(&llss->inode1->i_mutex);
2155 if (llss->ia1.ia_valid != 0) {
2158 mutex_lock(&llss->inode2->i_mutex);
2159 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2160 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Send an HSM state change for @inode to the MD export.
 *
 * \param inode inode whose HSM flags are changed
 * \param hss   set/clear masks; passed to the MDC through op_data
 */
2172 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2174 struct md_op_data *op_data;
2177 /* Non-root users are forbidden to set or clear flags which are
2178 * NOT defined in HSM_USER_MASK. */
2179 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2180 !cfs_capable(CFS_CAP_SYS_ADMIN))
/* @hss travels as the opaque data pointer of the op_data. */
2183 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2184 LUSTRE_OPC_ANY, hss);
2185 if (IS_ERR(op_data))
2186 RETURN(PTR_ERR(op_data));
2188 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2189 sizeof(*op_data), op_data, NULL);
2191 ll_finish_md_op_data(op_data);
/*
 * Import an archived file: marks @inode as archived, existing and released
 * in HSM, then forces the attributes from @hui onto it.
 *
 * \param inode regular-file inode to import into
 * \param file  open file referring to @inode
 * \param hui   user-supplied archive id, mode, owner, size and timestamps
 */
2196 static int ll_hsm_import(struct inode *inode, struct file *file,
2197 struct hsm_user_import *hui)
2199 struct hsm_state_set *hss = NULL;
2200 struct iattr *attr = NULL;
2204 if (!S_ISREG(inode->i_mode))
2210 GOTO(out, rc = -ENOMEM);
/* Step 1: set the HSM flags and record the archive id. */
2212 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2213 hss->hss_archive_id = hui->hui_archive_id;
2214 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2215 rc = ll_hsm_state_set(inode, hss);
/* Step 2: build the attribute set from the import description. */
2219 OBD_ALLOC_PTR(attr);
2221 GOTO(out, rc = -ENOMEM);
/* Only the permission bits of hui_mode are kept; the type is forced to S_IFREG. */
2223 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2224 attr->ia_mode |= S_IFREG;
2225 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2226 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2227 attr->ia_size = hui->hui_size;
2228 attr->ia_mtime.tv_sec = hui->hui_mtime;
2229 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2230 attr->ia_atime.tv_sec = hui->hui_atime;
2231 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2233 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2234 ATTR_UID | ATTR_GID |
2235 ATTR_MTIME | ATTR_MTIME_SET |
2236 ATTR_ATIME | ATTR_ATIME_SET;
/* The attributes are applied under i_mutex. */
2238 mutex_lock(&inode->i_mutex);
2240 rc = ll_setattr_raw(file->f_dentry, attr, true);
2244 mutex_unlock(&inode->i_mutex);
/*
 * Translate an open mode into lease bits: FMODE_READ maps to LL_LEASE_RDLCK
 * and FMODE_WRITE to LL_LEASE_WRLCK; the two are OR-ed together.
 */
2256 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2258 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2259 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * ioctl entry point for regular files: dispatches on @cmd to the llite
 * helpers of this file, and hands commands without a dedicated case to
 * ll_iocontrol_call() and to obd_iocontrol() on the data export.
 *
 * \param file open file the ioctl is issued on
 * \param cmd  ioctl command number
 * \param arg  command argument, usually a user-space address
 */
2263 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2265 struct inode *inode = file->f_dentry->d_inode;
2266 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2270 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2271 PFID(ll_inode2fid(inode)), inode, cmd);
2272 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2274 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2275 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2279 case LL_IOC_GETFLAGS:
2280 /* Get the current value of the file flags */
2281 return put_user(fd->fd_flags, (int __user *)arg);
2282 case LL_IOC_SETFLAGS:
2283 case LL_IOC_CLRFLAGS:
2284 /* Set or clear specific file flags */
2285 /* XXX This probably needs checks to ensure the flags are
2286 * not abused, and to handle any flag side effects.
2288 if (get_user(flags, (int __user *) arg))
2291 if (cmd == LL_IOC_SETFLAGS) {
2292 if ((flags & LL_FILE_IGNORE_LOCK) &&
2293 !(file->f_flags & O_DIRECT)) {
2294 CERROR("%s: unable to disable locking on "
2295 "non-O_DIRECT file\n", current->comm);
2299 fd->fd_flags |= flags;
2301 fd->fd_flags &= ~flags;
2304 case LL_IOC_LOV_SETSTRIPE:
2305 RETURN(ll_lov_setstripe(inode, file, arg));
2306 case LL_IOC_LOV_SETEA:
2307 RETURN(ll_lov_setea(inode, file, arg));
2308 case LL_IOC_LOV_SWAP_LAYOUTS: {
2310 struct lustre_swap_layouts lsl;
2312 if (copy_from_user(&lsl, (char __user *)arg,
2313 sizeof(struct lustre_swap_layouts)))
2316 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
/* The second file is looked up from the descriptor number in the request. */
2319 file2 = fget(lsl.sl_fd);
2324 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2325 rc = ll_swap_layouts(file, file2, &lsl);
2329 case LL_IOC_LOV_GETSTRIPE:
2330 RETURN(ll_lov_getstripe(inode, arg));
2331 case FSFILT_IOC_FIEMAP:
2332 RETURN(ll_ioctl_fiemap(inode, arg));
2333 case FSFILT_IOC_GETFLAGS:
2334 case FSFILT_IOC_SETFLAGS:
2335 RETURN(ll_iocontrol(inode, file, cmd, arg));
2336 case FSFILT_IOC_GETVERSION_OLD:
2337 case FSFILT_IOC_GETVERSION:
2338 RETURN(put_user(inode->i_generation, (int __user *)arg));
2339 case LL_IOC_GROUP_LOCK:
2340 RETURN(ll_get_grouplock(inode, file, arg));
2341 case LL_IOC_GROUP_UNLOCK:
2342 RETURN(ll_put_grouplock(inode, file, arg));
2343 case IOC_OBD_STATFS:
2344 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2346 /* We need to special case any other ioctls we want to handle,
2347 * to send them to the MDS/OST as appropriate and to properly
2348 * network encode the arg field.
2349 case FSFILT_IOC_SETVERSION_OLD:
2350 case FSFILT_IOC_SETVERSION:
2352 case LL_IOC_FLUSHCTX:
2353 RETURN(ll_flush_ctx(inode));
2354 case LL_IOC_PATH2FID: {
2355 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2356 sizeof(struct lu_fid)))
2361 case LL_IOC_GETPARENT:
2362 RETURN(ll_getparent(file, (void __user *)arg));
2364 case OBD_IOC_FID2PATH:
2365 RETURN(ll_fid2path(inode, (void __user *)arg));
2366 case LL_IOC_DATA_VERSION: {
2367 struct ioc_data_version idv;
2370 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2373 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2374 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2377 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2383 case LL_IOC_GET_MDTIDX: {
2386 mdtidx = ll_get_mdt_idx(inode);
2390 if (put_user((int)mdtidx, (int __user *)arg))
2395 case OBD_IOC_GETDTNAME:
2396 case OBD_IOC_GETMDNAME:
2397 RETURN(ll_get_obd_name(inode, cmd, arg));
2398 case LL_IOC_HSM_STATE_GET: {
2399 struct md_op_data *op_data;
2400 struct hsm_user_state *hus;
2407 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2408 LUSTRE_OPC_ANY, hus);
2409 if (IS_ERR(op_data)) {
2411 RETURN(PTR_ERR(op_data));
2414 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2417 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2420 ll_finish_md_op_data(op_data);
2424 case LL_IOC_HSM_STATE_SET: {
2425 struct hsm_state_set *hss;
2432 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2437 rc = ll_hsm_state_set(inode, hss);
2442 case LL_IOC_HSM_ACTION: {
2443 struct md_op_data *op_data;
2444 struct hsm_current_action *hca;
2451 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2452 LUSTRE_OPC_ANY, hca);
2453 if (IS_ERR(op_data)) {
2455 RETURN(PTR_ERR(op_data));
2458 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2461 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2464 ll_finish_md_op_data(op_data);
2468 case LL_IOC_SET_LEASE: {
2469 struct ll_inode_info *lli = ll_i2info(inode);
2470 struct obd_client_handle *och = NULL;
2475 case LL_LEASE_WRLCK:
2476 if (!(file->f_mode & FMODE_WRITE))
2478 fmode = FMODE_WRITE;
2480 case LL_LEASE_RDLCK:
2481 if (!(file->f_mode & FMODE_READ))
2485 case LL_LEASE_UNLCK:
2486 mutex_lock(&lli->lli_och_mutex);
2487 if (fd->fd_lease_och != NULL) {
2488 och = fd->fd_lease_och;
2489 fd->fd_lease_och = NULL;
2491 mutex_unlock(&lli->lli_och_mutex);
2496 fmode = och->och_flags;
2497 rc = ll_lease_close(och, inode, &lease_broken);
2504 RETURN(ll_lease_type_from_fmode(fmode));
2509 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2511 /* apply for lease */
2512 och = ll_lease_open(inode, file, fmode, 0);
2514 RETURN(PTR_ERR(och));
/* The new lease is attached to the descriptor only if none is recorded yet. */
2517 mutex_lock(&lli->lli_och_mutex);
2518 if (fd->fd_lease_och == NULL) {
2519 fd->fd_lease_och = och;
2522 mutex_unlock(&lli->lli_och_mutex);
2524 /* impossible now that only excl is supported for now */
2525 ll_lease_close(och, inode, &lease_broken);
2530 case LL_IOC_GET_LEASE: {
2531 struct ll_inode_info *lli = ll_i2info(inode);
2532 struct ldlm_lock *lock = NULL;
/* The lease mode is reported only while its lock is not being cancelled. */
2535 mutex_lock(&lli->lli_och_mutex);
2536 if (fd->fd_lease_och != NULL) {
2537 struct obd_client_handle *och = fd->fd_lease_och;
2539 lock = ldlm_handle2lock(&och->och_lease_handle);
2541 lock_res_and_lock(lock);
2542 if (!ldlm_is_cancel(lock))
2543 fmode = och->och_flags;
2545 unlock_res_and_lock(lock);
2546 LDLM_LOCK_PUT(lock);
2549 mutex_unlock(&lli->lli_och_mutex);
2551 RETURN(ll_lease_type_from_fmode(fmode));
2553 case LL_IOC_HSM_IMPORT: {
2554 struct hsm_user_import *hui;
2560 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2565 rc = ll_hsm_import(inode, file, hui);
2575 ll_iocontrol_call(inode, file, cmd, arg, &err))
/* Last resort: forward the command to the data export. */
2578 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2579 (void __user *)arg));
2584 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate @offset and store it in file->f_pos: negative offsets are only
 * accepted with FMODE_UNSIGNED_OFFSET, and offsets beyond @maxsize are
 * rejected. f_version is reset whenever the position actually changes.
 */
2585 static inline loff_t
2586 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2588 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2590 if (offset > maxsize)
/* Leave f_pos and f_version untouched when the position is unchanged. */
2593 if (offset != file->f_pos) {
2594 file->f_pos = offset;
2595 file->f_version = 0;
/*
 * Fallback llseek implementation, compiled only when
 * HAVE_FILE_LLSEEK_SIZE is not defined.
 *
 * \param file    file to reposition
 * \param offset  requested offset, interpreted according to @origin
 * \param origin  seek origin
 * \param maxsize largest position accepted by llseek_execute()
 * \param eof     file size supplied by the caller
 */
2601 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2602 loff_t maxsize, loff_t eof)
2604 struct inode *inode = file->f_dentry->d_inode;
2612 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2613 * position-querying operation. Avoid rewriting the "same"
2614 * f_pos value back to the file because a concurrent read(),
2615 * write() or lseek() might have altered it
2620 * f_lock protects against read/modify/write race with other
2621 * SEEK_CURs. Note that parallel writes and reads behave
/* NOTE(review): the code below takes inode->i_mutex, not f_lock. */
2624 mutex_lock(&inode->i_mutex);
2625 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2626 mutex_unlock(&inode->i_mutex);
2630 * In the generic case the entire file is data, so as long as
2631 * offset isn't at the end of the file then the offset is data.
2638 * There is a virtual hole at the end of the file, so as long as
2639 * offset isn't i_size or larger, return i_size.
2647 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: refreshes the file size with a glimpse for the
 * origins that depend on it, then delegates to
 * ll_generic_file_llseek_size().
 *
 * \param file   file to reposition
 * \param offset requested offset
 * \param origin seek origin
 */
2651 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2653 struct inode *inode = file->f_dentry->d_inode;
2654 loff_t retval, eof = 0;
/* This first value of retval is only used for the trace message below. */
2657 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2658 (origin == SEEK_CUR) ? file->f_pos : 0);
2659 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2660 PFID(ll_inode2fid(inode)), inode, retval, retval,
2662 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* The size is glimpsed for SEEK_END, SEEK_HOLE and SEEK_DATA before it is read. */
2664 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2665 retval = ll_glimpse_size(inode);
2668 eof = i_size_read(inode);
2671 retval = ll_generic_file_llseek_size(file, offset, origin,
2672 ll_file_maxbytes(inode), eof);
/*
 * flush handler: collects the asynchronous write-back errors recorded for
 * the inode and reports them as -EIO.
 *
 * \param file file being flushed
 * \param id   not referenced in the visible lines
 */
2676 static int ll_flush(struct file *file, fl_owner_t id)
2678 struct inode *inode = file->f_dentry->d_inode;
2679 struct ll_inode_info *lli = ll_i2info(inode);
2680 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2683 LASSERT(!S_ISDIR(inode->i_mode));
2685 /* catch async errors that were recorded back when async writeback
2686 * failed for pages in this mapping. */
/* Reading the recorded error also clears it. */
2687 rc = lli->lli_async_rc;
2688 lli->lli_async_rc = 0;
2689 if (lli->lli_clob != NULL) {
2690 err = lov_read_and_clear_async_rc(lli->lli_clob);
2695 /* The application has been told write failure already.
2696 * Do not report failure again. */
2697 if (fd->fd_write_failed)
/* Any non-zero rc is collapsed into -EIO. */
2699 return rc ? -EIO : 0;
/*
 * Summary of the visible body: @mode is checked against the four
 * CL_FSYNC_* values, a nested cl environment is obtained, an OSS write
 * capability is looked up, and io->u.ci_fsync is filled in (capa, start,
 * fid, mode, zeroed written-count).  A CIT_FSYNC io is then run through
 * cl_io_init()/cl_io_loop(); the result comes from io->ci_result or from
 * fio->fi_nr_written.  The io and the environment are released before
 * returning.
 * NOTE(review): the fi_end assignment, the conditions selecting between
 * the two result sources, and the release of capa are not visible in this
 * listing; confirm they are present.
 */
2703 * Called to make sure a portion of file has been written out.
2704 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2706 * Return how many pages have been written.
2708 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2709 enum cl_fsync_mode mode, int ignore_layout)
2711 struct cl_env_nest nest;
2714 struct obd_capa *capa = NULL;
2715 struct cl_fsync_io *fio;
2719 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2720 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2723 env = cl_env_nested_get(&nest);
2725 RETURN(PTR_ERR(env));
2727 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2729 io = ccc_env_thread_io(env);
2730 io->ci_obj = cl_i2info(inode)->lli_clob;
2731 io->ci_ignore_layout = ignore_layout;
2733 /* initialize parameters for sync */
2734 fio = &io->u.ci_fsync;
2735 fio->fi_capa = capa;
2736 fio->fi_start = start;
2738 fio->fi_fid = ll_inode2fid(inode);
2739 fio->fi_mode = mode;
2740 fio->fi_nr_written = 0;
2742 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2743 result = cl_io_loop(env, io);
2745 result = io->ci_result;
2747 result = fio->fi_nr_written;
2748 cl_io_fini(env, io);
2749 cl_env_nested_put(&nest, env);
/*
 * fsync implementation.  One of three signatures is selected at build
 * time: HAVE_FILE_FSYNC_4ARGS (explicit start/end range),
 * HAVE_FILE_FSYNC_2ARGS, or the legacy file/dentry/datasync form; the
 * latter two sync up to LLONG_MAX.
 * Visible steps:
 *  1. wait for page writeback: filemap_write_and_wait_range() followed by
 *     taking i_mutex in the 4-arg variant, filemap_fdatawait() otherwise;
 *  2. for non-directories, read and clear the recorded async write errors;
 *  3. md_fsync() towards the MDS, then release the request;
 *  4. for regular files, cl_sync_file_range(CL_FSYNC_ALL) and record the
 *     outcome in fd->fd_write_failed.
 * NOTE(review): the statements folding err into rc are not visible in
 * this listing.
 */
2757 * When dentry is provided (the 'else' case), *file->f_dentry may be
2758 * null and dentry must be used directly rather than pulled from
2759 * *file->f_dentry as is done otherwise.
2762 #ifdef HAVE_FILE_FSYNC_4ARGS
2763 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2765 struct dentry *dentry = file->f_dentry;
2766 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2767 int ll_fsync(struct file *file, int datasync)
2769 struct dentry *dentry = file->f_dentry;
2771 loff_t end = LLONG_MAX;
2773 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2776 loff_t end = LLONG_MAX;
2778 struct inode *inode = dentry->d_inode;
2779 struct ll_inode_info *lli = ll_i2info(inode);
2780 struct ptlrpc_request *req;
2781 struct obd_capa *oc;
2785 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2786 PFID(ll_inode2fid(inode)), inode);
2787 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2789 #ifdef HAVE_FILE_FSYNC_4ARGS
2790 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2791 mutex_lock(&inode->i_mutex);
2793 /* fsync's caller has already called _fdata{sync,write}, we want
2794 * that IO to finish before calling the osc and mdc sync methods */
2795 rc = filemap_fdatawait(inode->i_mapping);
2798 /* catch async errors that were recorded back when async writeback
2799 * failed for pages in this mapping. */
2800 if (!S_ISDIR(inode->i_mode)) {
2801 err = lli->lli_async_rc;
2802 lli->lli_async_rc = 0;
2805 err = lov_read_and_clear_async_rc(lli->lli_clob);
2810 oc = ll_mdscapa_get(inode);
2811 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2817 ptlrpc_req_finished(req);
2819 if (S_ISREG(inode->i_mode)) {
2820 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2822 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2823 if (rc == 0 && err < 0)
2826 fd->fd_write_failed = true;
2828 fd->fd_write_failed = false;
2831 #ifdef HAVE_FILE_FSYNC_4ARGS
2832 mutex_unlock(&inode->i_mutex);
/*
 * Handle flock() and POSIX fcntl() lock requests by enqueueing an
 * LDLM_FLOCK lock on the MDS through md_enqueue().
 *  - Range and owner come from @file_lock: whole file and fl_file as owner
 *    for FL_FLOCK; fl_start/fl_end and fl_owner for FL_POSIX.  When the
 *    lock manager installs lm_compare_owner (lockd), the pid is used as
 *    owner instead.
 *  - The lock type is mapped to LCK_PR, LCK_PW or LCK_NL; an LCK_NL
 *    request is treated as an unlock.
 *  - @cmd selects the LDLM flags (LDLM_FL_BLOCK_NOWAIT, LDLM_FL_TEST_LOCK).
 *  - file_lock->fl_type is temporarily replaced by the LDLM mode for the
 *    enqueue and restored afterwards unless this was a test-lock request.
 *  - On success (or for an unlock) the local VFS lock state is updated
 *    with flock_lock_file_wait() / posix_lock_file_wait(); if that local
 *    step fails for a non-unlock request, the server lock is dropped again
 *    with an LCK_NL enqueue.
 * NOTE(review): the switch/case labels and the final return are not
 * visible in this listing.
 */
2838 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2840 struct inode *inode = file->f_dentry->d_inode;
2841 struct ll_sb_info *sbi = ll_i2sbi(inode);
2842 struct ldlm_enqueue_info einfo = {
2843 .ei_type = LDLM_FLOCK,
2844 .ei_cb_cp = ldlm_flock_completion_ast,
2845 .ei_cbdata = file_lock,
2847 struct md_op_data *op_data;
2848 struct lustre_handle lockh = {0};
2849 ldlm_policy_data_t flock = {{0}};
2850 int fl_type = file_lock->fl_type;
2856 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2857 PFID(ll_inode2fid(inode)), file_lock);
2859 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2861 if (file_lock->fl_flags & FL_FLOCK) {
2862 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2863 /* flocks are whole-file locks */
2864 flock.l_flock.end = OFFSET_MAX;
2865 /* For flocks owner is determined by the local file descriptor */
2866 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2867 } else if (file_lock->fl_flags & FL_POSIX) {
2868 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2869 flock.l_flock.start = file_lock->fl_start;
2870 flock.l_flock.end = file_lock->fl_end;
2874 flock.l_flock.pid = file_lock->fl_pid;
2876 /* Somewhat ugly workaround for svc lockd.
2877 * lockd installs custom fl_lmops->lm_compare_owner that checks
2878 * for the fl_owner to be the same (which it always is on local node
2879 * I guess between lockd processes) and then compares pid.
2880 * As such we assign pid to the owner field to make it all work,
2881 * conflict with normal locks is unlikely since pid space and
2882 * pointer space for current->files are not intersecting */
2883 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2884 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2888 einfo.ei_mode = LCK_PR;
2891 /* An unlock request may or may not have any relation to
2892 * existing locks so we may not be able to pass a lock handle
2893 * via a normal ldlm_lock_cancel() request. The request may even
2894 * unlock a byte range in the middle of an existing lock. In
2895 * order to process an unlock request we need all of the same
2896 * information that is given with a normal read or write record
2897 * lock request. To avoid creating another ldlm unlock (cancel)
2898 * message we'll treat a LCK_NL flock request as an unlock. */
2899 einfo.ei_mode = LCK_NL;
2902 einfo.ei_mode = LCK_PW;
2905 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
2920 flags = LDLM_FL_BLOCK_NOWAIT;
2926 flags = LDLM_FL_TEST_LOCK;
2929 CERROR("unknown fcntl lock command: %d\n", cmd);
2933 /* Save the old mode so that if the mode in the lock changes we
2934 * can decrement the appropriate reader or writer refcount. */
2935 file_lock->fl_type = einfo.ei_mode;
2937 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2938 LUSTRE_OPC_ANY, NULL);
2939 if (IS_ERR(op_data))
2940 RETURN(PTR_ERR(op_data));
2942 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2943 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2944 flock.l_flock.pid, flags, einfo.ei_mode,
2945 flock.l_flock.start, flock.l_flock.end);
2947 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2950 /* Restore the file lock type if not TEST lock. */
2951 if (!(flags & LDLM_FL_TEST_LOCK))
2952 file_lock->fl_type = fl_type;
2954 if ((file_lock->fl_flags & FL_FLOCK) &&
2955 (rc == 0 || file_lock->fl_type == F_UNLCK))
2956 rc2 = flock_lock_file_wait(file, file_lock);
2957 if ((file_lock->fl_flags & FL_POSIX) &&
2958 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2959 !(flags & LDLM_FL_TEST_LOCK))
2960 rc2 = posix_lock_file_wait(file, file_lock);
2962 if (rc2 && file_lock->fl_type != F_UNLCK) {
2963 einfo.ei_mode = LCK_NL;
2964 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2969 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of the entry @name (length @namelen) under directory
 * @parent by sending a getattr-by-name request that asks only for
 * OBD_MD_FLID, and copy the returned mbo_fid1 into *fid.
 * A reply without an MDT body yields -EFAULT.  The op_data is released
 * right after the RPC and the request is released before returning.
 * NOTE(review): the check of the RPC return code and any guard on @fid
 * being NULL are not visible in this listing.
 */
2974 int ll_get_fid_by_name(struct inode *parent, const char *name,
2975 int namelen, struct lu_fid *fid)
2977 struct md_op_data *op_data = NULL;
2978 struct mdt_body *body;
2979 struct ptlrpc_request *req;
2983 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2984 LUSTRE_OPC_ANY, NULL);
2985 if (IS_ERR(op_data))
2986 RETURN(PTR_ERR(op_data));
2988 op_data->op_valid = OBD_MD_FLID;
2989 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2990 ll_finish_md_op_data(op_data);
2994 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2996 GOTO(out_req, rc = -EFAULT);
2998 *fid = body->mbo_fid1;
3000 ptlrpc_req_finished(req);
3004 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3005 const char *name, int namelen)
3007 struct dentry *dchild = NULL;
3008 struct inode *child_inode = NULL;
3009 struct md_op_data *op_data;
3010 struct ptlrpc_request *request = NULL;
3015 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3016 name, PFID(ll_inode2fid(parent)), mdtidx);
3018 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3019 0, LUSTRE_OPC_ANY, NULL);
3020 if (IS_ERR(op_data))
3021 RETURN(PTR_ERR(op_data));
3023 /* Get child FID first */
3024 qstr.hash = full_name_hash(name, namelen);
3027 dchild = d_lookup(file->f_dentry, &qstr);
3028 if (dchild != NULL && dchild->d_inode != NULL) {
3029 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3030 if (dchild->d_inode != NULL) {
3031 child_inode = igrab(dchild->d_inode);
3032 ll_invalidate_aliases(child_inode);
3036 rc = ll_get_fid_by_name(parent, name, namelen,
3042 if (!fid_is_sane(&op_data->op_fid3)) {
3043 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3044 ll_get_fsname(parent->i_sb, NULL, 0), name,
3045 PFID(&op_data->op_fid3));
3046 GOTO(out_free, rc = -EINVAL);
3049 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3054 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3055 PFID(&op_data->op_fid3), mdtidx);
3056 GOTO(out_free, rc = 0);
3059 op_data->op_mds = mdtidx;
3060 op_data->op_cli_flags = CLI_MIGRATE;
3061 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3062 namelen, name, namelen, &request);
3064 ll_update_times(request, parent);
3066 ptlrpc_req_finished(request);
3071 if (child_inode != NULL) {
3072 clear_nlink(child_inode);
3076 ll_finish_md_op_data(op_data);
/*
 * Lock handler installed as both .flock and .lock in
 * ll_file_operations_noflock (the "-o noflock" table, whose comment says
 * flock calls return ENOSYS).
 * NOTE(review): the body is not visible in this listing.
 */
3081 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3089 * test if some locks matching bits and l_req_mode are acquired
3090 * - bits can be in different locks
3091 * - if found clear the common lock bits in *bits
3092 * - the bits not found, are kept in *bits
3094 * \param bits [IN] searched lock bits [IN]
3095 * \param l_req_mode [IN] searched lock mode
3096 * \retval boolean, true iff all bits are found
3098 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3100 struct lustre_handle lockh;
3101 ldlm_policy_data_t policy;
3102 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3103 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3112 fid = &ll_i2info(inode)->lli_fid;
3113 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3114 ldlm_lockname[mode]);
3116 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3117 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3118 policy.l_inodebits.bits = *bits & (1 << i);
3119 if (policy.l_inodebits.bits == 0)
3122 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3123 &policy, mode, &lockh)) {
3124 struct ldlm_lock *lock;
3126 lock = ldlm_handle2lock(&lockh);
3129 ~(lock->l_policy_data.l_inodebits.bits);
3130 LDLM_LOCK_PUT(lock);
3132 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match an already granted inodebits lock covering @bits on the
 * inode's FID.  LDLM_FL_BLOCK_GRANTED is always added to the caller's
 * @flags; the matched lock handle is stored in @lockh by md_lock_match().
 * NOTE(review): the last parameter (the mode used for matching) and the
 * return statement are not visible in this listing; callers in this file
 * treat a non-zero return as "lock found".
 */
3139 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3140 struct lustre_handle *lockh, __u64 flags,
3143 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3148 fid = &ll_i2info(inode)->lli_fid;
3149 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3151 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3152 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result code of a revalidation RPC.  -ENOENT means the
 * object was already unlinked; any other non-zero code is logged, at
 * D_INFO for -EACCES/-EIDRM and at D_ERROR otherwise.
 * NOTE(review): the statements that update nlink and produce the return
 * value are not visible in this listing.
 */
3157 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3159 /* Already unlinked. Just update nlink and return success */
3160 if (rc == -ENOENT) {
3162 /* This path cannot be hit for regular files unless in
3163 * case of obscure races, so no need to validate
3165 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3167 } else if (rc != 0) {
3168 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3169 "%s: revalidate FID "DFID" error: rc = %d\n",
3170 ll_get_fsname(inode->i_sb, NULL, 0),
3171 PFID(ll_inode2fid(inode)), rc);
/*
 * Refresh the attributes of dentry->d_inode from the MDS.
 * Two strategies are visible, selected by the export's connect flags:
 *  - with OBD_CONNECT_ATTRFID: an intent lock (IT_GETATTR, or IT_LOOKUP
 *    when @ibits is exactly MDS_INODELOCK_LOOKUP) is requested by FID
 *    through md_intent_lock() and completed by ll_revalidate_it_finish();
 *    a dentry whose inode has no links left is invalidated;
 *  - otherwise, only when ll_have_md_lock() reports that @ibits are not
 *    all covered by cached locks: a plain md_getattr() (also asking for
 *    the EA size on regular files) followed by ll_prep_inode().
 * RPC results are passed through ll_inode_revalidate_fini(), and the
 * request is released at the end.
 * NOTE(review): the error checks between the visible calls are not
 * visible in this listing.
 */
3177 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3179 struct inode *inode = dentry->d_inode;
3180 struct ptlrpc_request *req = NULL;
3181 struct obd_export *exp;
3185 LASSERT(inode != NULL);
3187 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3188 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3190 exp = ll_i2mdexp(inode);
3192 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3193 * But under CMD case, it caused some lock issues, should be fixed
3194 * with new CMD ibits lock. See bug 12718 */
3195 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3196 struct lookup_intent oit = { .it_op = IT_GETATTR };
3197 struct md_op_data *op_data;
3199 if (ibits == MDS_INODELOCK_LOOKUP)
3200 oit.it_op = IT_LOOKUP;
3202 /* Call getattr by fid, so do not provide name at all. */
3203 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3204 dentry->d_inode, NULL, 0, 0,
3205 LUSTRE_OPC_ANY, NULL);
3206 if (IS_ERR(op_data))
3207 RETURN(PTR_ERR(op_data));
3209 rc = md_intent_lock(exp, op_data, &oit, &req,
3210 &ll_md_blocking_ast, 0);
3211 ll_finish_md_op_data(op_data);
3213 rc = ll_inode_revalidate_fini(inode, rc);
3217 rc = ll_revalidate_it_finish(req, &oit, dentry);
3219 ll_intent_release(&oit);
3223 /* Unlinked? Unhash dentry, so it is not picked up later by
3224 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3225 here to preserve get_cwd functionality on 2.6.
3227 if (!dentry->d_inode->i_nlink)
3228 d_lustre_invalidate(dentry, 0);
3230 ll_lookup_finish_locks(&oit, dentry);
3231 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3232 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3233 obd_valid valid = OBD_MD_FLGETATTR;
3234 struct md_op_data *op_data;
3237 if (S_ISREG(inode->i_mode)) {
3238 rc = ll_get_default_mdsize(sbi, &ealen);
3241 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3244 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3245 0, ealen, LUSTRE_OPC_ANY,
3247 if (IS_ERR(op_data))
3248 RETURN(PTR_ERR(op_data));
3250 op_data->op_valid = valid;
3251 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3252 * capa for this inode. Because we only keep capas of dirs
3254 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3255 ll_finish_md_op_data(op_data);
3257 rc = ll_inode_revalidate_fini(inode, rc);
3261 rc = ll_prep_inode(&inode, req, NULL, NULL);
3264 ptlrpc_req_finished(req);
/*
 * Merge attributes for an inode that has lli_lsm_md set (asserted) by
 * calling md_merge_attr(), then cache the merged size and link count in
 * lli_stripe_dir_size / lli_stripe_dir_nlink and the merged a/m/ctime in
 * lli_lvb.
 * NOTE(review): the error check after md_merge_attr() and the return are
 * not visible in this listing.
 */
3268 static int ll_merge_md_attr(struct inode *inode)
3270 struct cl_attr attr = { 0 };
3273 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3274 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3279 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3280 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3282 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3283 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3284 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Revalidate the inode behind @dentry for the lock bits in @ibits and
 * bring the size/time attributes up to date.
 * After __ll_inode_revalidate():
 *  - non-regular files: directories with lli_lsm_md set get their
 *    attributes merged by ll_merge_md_attr(), and the inode a/m/ctime are
 *    copied from lli_lvb;
 *  - regular files: the size is refreshed with ll_glimpse_size() unless
 *    the file is flagged LLIF_FILE_RESTORING.
 * NOTE(review): the exact nesting of the timestamp copy and the error
 * checks are not visible in this listing.
 */
3290 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3292 struct inode *inode = dentry->d_inode;
3296 rc = __ll_inode_revalidate(dentry, ibits);
3300 /* if object isn't regular file, don't validate size */
3301 if (!S_ISREG(inode->i_mode)) {
3302 if (S_ISDIR(inode->i_mode) &&
3303 ll_i2info(inode)->lli_lsm_md != NULL) {
3304 rc = ll_merge_md_attr(inode);
3309 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3310 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3311 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3313 /* In case of restore, the MDT has the right size and has
3314 * already sent it back without granting the layout lock,
3315 * inode is up-to-date so glimpse is useless.
3316 * Also to glimpse we need the layout, in case of a running
3317 * restore the MDT holds the layout lock so the glimpse will
3318 * block up to the end of restore (getattr will block)
3320 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3321 rc = ll_glimpse_size(inode);
/*
 * getattr handler: revalidates the inode for the UPDATE and LOOKUP lock
 * bits, counts the call in LPROC_LL_GETATTR and fills @stat from the
 * inode.
 *  - stat->ino is rebuilt from the FID with cl_fid_build_ino() when
 *    ll_need_32bit_api() is true, and is inode->i_ino otherwise.
 *  - for directories with lli_lsm_md set, nlink and size come from the
 *    cached lli_stripe_dir_nlink / lli_stripe_dir_size; all other inodes
 *    use i_nlink and i_size_read().
 * NOTE(review): the check of the revalidation result and the return are
 * not visible in this listing.
 */
3326 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3328 struct inode *inode = de->d_inode;
3329 struct ll_sb_info *sbi = ll_i2sbi(inode);
3330 struct ll_inode_info *lli = ll_i2info(inode);
3333 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3334 MDS_INODELOCK_LOOKUP);
3335 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3340 stat->dev = inode->i_sb->s_dev;
3341 if (ll_need_32bit_api(sbi))
3342 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3344 stat->ino = inode->i_ino;
3345 stat->mode = inode->i_mode;
3346 stat->uid = inode->i_uid;
3347 stat->gid = inode->i_gid;
3348 stat->rdev = inode->i_rdev;
3349 stat->atime = inode->i_atime;
3350 stat->mtime = inode->i_mtime;
3351 stat->ctime = inode->i_ctime;
3352 stat->blksize = 1 << inode->i_blkbits;
3353 stat->blocks = inode->i_blocks;
3355 if (S_ISDIR(inode->i_mode) &&
3356 ll_i2info(inode)->lli_lsm_md != NULL) {
3357 stat->nlink = lli->lli_stripe_dir_nlink;
3358 stat->size = lli->lli_stripe_dir_size;
3360 stat->nlink = inode->i_nlink;
3361 stat->size = i_size_read(inode);
3367 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3368 __u64 start, __u64 len)
3372 struct ll_user_fiemap *fiemap;
3373 unsigned int extent_count = fieinfo->fi_extents_max;
3375 num_bytes = sizeof(*fiemap) + (extent_count *
3376 sizeof(struct ll_fiemap_extent));
3377 OBD_ALLOC_LARGE(fiemap, num_bytes);
3382 fiemap->fm_flags = fieinfo->fi_flags;
3383 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3384 fiemap->fm_start = start;
3385 fiemap->fm_length = len;
3386 if (extent_count > 0)
3387 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3388 sizeof(struct ll_fiemap_extent));
3390 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3392 fieinfo->fi_flags = fiemap->fm_flags;
3393 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3394 if (extent_count > 0)
3395 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3396 fiemap->fm_mapped_extents *
3397 sizeof(struct ll_fiemap_extent));
3399 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Take a new reference on the POSIX ACL cached in lli->lli_posix_acl.
 * The reference is taken with posix_acl_dup() while holding
 * lli->lli_lock.  @type is not used in the visible lines.
 * NOTE(review): the return statement is not visible in this listing.
 */
3403 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3405 struct ll_inode_info *lli = ll_i2info(inode);
3406 struct posix_acl *acl = NULL;
3409 spin_lock(&lli->lli_lock);
3410 /* VFS' acl_permission_check->check_acl will release the refcount */
3411 acl = posix_acl_dup(lli->lli_posix_acl);
3412 spin_unlock(&lli->lli_lock);
/*
 * ACL check callback, built only when HAVE_GENERIC_PERMISSION_2ARGS is
 * not defined.  It takes an extra @flags argument when
 * HAVE_GENERIC_PERMISSION_4ARGS is defined.
 * With CONFIG_FS_POSIX_ACL: fetches the access ACL through ll_get_acl(),
 * evaluates it with posix_acl_permission() and releases the reference.
 * In the 4-arg variant IPERM_FLAG_RCU is tested first.
 * NOTE(review): the return statements, including the one for the
 * !CONFIG_FS_POSIX_ACL branch, are not visible in this listing.
 */
3417 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3419 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3420 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3422 ll_check_acl(struct inode *inode, int mask)
3425 # ifdef CONFIG_FS_POSIX_ACL
3426 struct posix_acl *acl;
3430 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3431 if (flags & IPERM_FLAG_RCU)
3434 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3439 rc = posix_acl_permission(inode, acl, mask);
3440 posix_acl_release(acl);
3443 # else /* !CONFIG_FS_POSIX_ACL */
3445 # endif /* CONFIG_FS_POSIX_ACL */
3447 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * Permission check handler, with three build-time signatures.
 * Visible steps:
 *  1. test the non-blocking (RCU) request flag (MAY_NOT_BLOCK or
 *     IPERM_FLAG_RCU);
 *  2. revalidate the root inode, which lookup does not validate;
 *  3. root squash: when rsi_uid is non-zero, the caller's fsuid is root
 *     and LL_SBI_NOROOTSQUASH is not set, build new credentials with the
 *     squash uid/gid, lower the filesystem capabilities in
 *     CFS_CAP_FS_MASK, and install them with override_creds();
 *  4. run lustre_check_remote_perm() for LL_SBI_RMT_CLIENT mounts,
 *     ll_generic_permission() otherwise;
 *  5. restore the original credentials.
 * NOTE(review): "(1 << cap)" is evaluated for cap up to
 * sizeof(cfs_cap_t) * 8 - 1; if cfs_cap_t is 32 bits wide, shifting a
 * signed int by 31 is undefined.  Consider "1U << cap".
 * NOTE(review): the return statements and the release of the prepared
 * cred are not visible in this listing.
 */
3449 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3450 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3452 # ifdef HAVE_INODE_PERMISION_2ARGS
3453 int ll_inode_permission(struct inode *inode, int mask)
3455 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3460 struct ll_sb_info *sbi;
3461 struct root_squash_info *squash;
3462 struct cred *cred = NULL;
3463 const struct cred *old_cred = NULL;
3465 bool squash_id = false;
3468 #ifdef MAY_NOT_BLOCK
3469 if (mask & MAY_NOT_BLOCK)
3471 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3472 if (flags & IPERM_FLAG_RCU)
3476 /* as root inode are NOT getting validated in lookup operation,
3477 * need to do it before permission check. */
3479 if (inode == inode->i_sb->s_root->d_inode) {
3480 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3481 MDS_INODELOCK_LOOKUP);
3486 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3487 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3489 /* squash fsuid/fsgid if needed */
3490 sbi = ll_i2sbi(inode);
3491 squash = &sbi->ll_squash;
3492 if (unlikely(squash->rsi_uid != 0 &&
3493 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3494 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3498 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3499 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3500 squash->rsi_uid, squash->rsi_gid);
3502 /* update current process's credentials
3503 * and FS capability */
3504 cred = prepare_creds();
3508 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3509 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
3510 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3511 if ((1 << cap) & CFS_CAP_FS_MASK)
3512 cap_lower(cred->cap_effective, cap);
3514 old_cred = override_creds(cred);
3517 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3519 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3520 rc = lustre_check_remote_perm(inode, mask);
3522 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3524 /* restore current process's credentials and FS capability */
3526 revert_creds(old_cred);
/*
 * File operations table for "-o localflock" mounts.  No .flock/.lock
 * handler is visible in this table.
 */
3533 /* -o localflock - only provides locally consistent flock locks */
3534 struct file_operations ll_file_operations = {
3535 .read = ll_file_read,
3536 .aio_read = ll_file_aio_read,
3537 .write = ll_file_write,
3538 .aio_write = ll_file_aio_write,
3539 .unlocked_ioctl = ll_file_ioctl,
3540 .open = ll_file_open,
3541 .release = ll_file_release,
3542 .mmap = ll_file_mmap,
3543 .llseek = ll_file_seek,
3544 .splice_read = ll_file_splice_read,
/*
 * File operations table that routes both flock() and POSIX lock requests
 * (.flock and .lock) to ll_file_flock(); the remaining visible entries
 * are the same as in ll_file_operations.
 */
3549 struct file_operations ll_file_operations_flock = {
3550 .read = ll_file_read,
3551 .aio_read = ll_file_aio_read,
3552 .write = ll_file_write,
3553 .aio_write = ll_file_aio_write,
3554 .unlocked_ioctl = ll_file_ioctl,
3555 .open = ll_file_open,
3556 .release = ll_file_release,
3557 .mmap = ll_file_mmap,
3558 .llseek = ll_file_seek,
3559 .splice_read = ll_file_splice_read,
3562 .flock = ll_file_flock,
3563 .lock = ll_file_flock
/*
 * Same visible entries as ll_file_operations, with .flock and .lock
 * routed to ll_file_noflock().
 */
3566 /* These are for -o noflock - to return ENOSYS on flock calls */
3567 struct file_operations ll_file_operations_noflock = {
3568 .read = ll_file_read,
3569 .aio_read = ll_file_aio_read,
3570 .write = ll_file_write,
3571 .aio_write = ll_file_aio_write,
3572 .unlocked_ioctl = ll_file_ioctl,
3573 .open = ll_file_open,
3574 .release = ll_file_release,
3575 .mmap = ll_file_mmap,
3576 .llseek = ll_file_seek,
3577 .splice_read = ll_file_splice_read,
3580 .flock = ll_file_noflock,
3581 .lock = ll_file_noflock
/*
 * Inode operations table: attribute get/set, permission checking,
 * extended attributes and fiemap.  .get_acl is wired up only when
 * HAVE_IOP_GET_ACL is defined.
 */
3584 struct inode_operations ll_file_inode_operations = {
3585 .setattr = ll_setattr,
3586 .getattr = ll_getattr,
3587 .permission = ll_inode_permission,
3588 .setxattr = ll_setxattr,
3589 .getxattr = ll_getxattr,
3590 .listxattr = ll_listxattr,
3591 .removexattr = ll_removexattr,
3592 .fiemap = ll_fiemap,
3593 #ifdef HAVE_IOP_GET_ACL
3594 .get_acl = ll_get_acl,
/*
 * Registry of dynamically registered ioctl handlers: a list head
 * (ioc_head) protected by a read/write semaphore (ioc_sem), both
 * statically initialised.
 */
3598 /* dynamic ioctl number support routines */
3599 static struct llioc_ctl_data {
3600 struct rw_semaphore ioc_sem;
3601 struct list_head ioc_head;
3603 __RWSEM_INITIALIZER(llioc.ioc_sem),
3604 LIST_HEAD_INIT(llioc.ioc_head)
/*
 * Fields of one ioctl registration record (struct llioc_data):
 *  iocd_list  - linkage on llioc.ioc_head
 *  iocd_size  - allocation size of the record, used when it is freed
 *  iocd_cb    - callback invoked for a matching command
 *  iocd_count - number of entries in iocd_cmd
 *  iocd_cmd   - trailing array of ioctl command numbers handled
 */
3609 struct list_head iocd_list;
3610 unsigned int iocd_size;
3611 llioc_callback_t iocd_cb;
3612 unsigned int iocd_count;
3613 unsigned int iocd_cmd[0];
/*
 * Register callback @cb for the @count ioctl command numbers in @cmd.
 * Rejects a NULL @cb or @cmd and a @count outside [0, LLIOC_MAX_CMD].
 * Allocates a record sized for @count commands, copies the commands into
 * it and appends it to llioc.ioc_head under the write lock.
 * NOTE(review): the return statements are not visible in this listing;
 * presumably the record pointer is returned as the handle expected by
 * ll_iocontrol_unregister(); confirm.
 */
3616 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3619 struct llioc_data *in_data = NULL;
3622 if (cb == NULL || cmd == NULL ||
3623 count > LLIOC_MAX_CMD || count < 0)
3626 size = sizeof(*in_data) + count * sizeof(unsigned int);
3627 OBD_ALLOC(in_data, size);
3628 if (in_data == NULL)
3631 memset(in_data, 0, sizeof(*in_data));
3632 in_data->iocd_size = size;
3633 in_data->iocd_cb = cb;
3634 in_data->iocd_count = count;
3635 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3637 down_write(&llioc.ioc_sem);
3638 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3639 up_write(&llioc.ioc_sem);
/*
 * Remove a registration from llioc.ioc_head.  The list is walked under
 * the write lock; the selected record is unlinked, the lock is dropped
 * and the record is freed using its stored iocd_size.  A warning is
 * logged when no record is found for @magic.
 * NOTE(review): the comparison that selects the record for @magic and the
 * early return after freeing are not visible in this listing.
 */
3644 void ll_iocontrol_unregister(void *magic)
3646 struct llioc_data *tmp;
3651 down_write(&llioc.ioc_sem);
3652 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3654 unsigned int size = tmp->iocd_size;
3656 list_del(&tmp->iocd_list);
3657 up_write(&llioc.ioc_sem);
3659 OBD_FREE(tmp, size);
3663 up_write(&llioc.ioc_sem);
3665 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3668 EXPORT_SYMBOL(ll_iocontrol_register);
3669 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch ioctl @cmd to the dynamically registered handlers.
 * Walks llioc.ioc_head under the read lock and, for each record whose
 * command list contains @cmd, invokes its callback with the record as the
 * handle and &rc as the result slot.  LLIOC_STOP from a callback is
 * tested after the call.  rc starts out as -EINVAL and the iteration
 * state as LLIOC_CONT.
 * NOTE(review): the loop exits, the store to *rcp and the return are not
 * visible in this listing.
 */
3671 static enum llioc_iter
3672 ll_iocontrol_call(struct inode *inode, struct file *file,
3673 unsigned int cmd, unsigned long arg, int *rcp)
3675 enum llioc_iter ret = LLIOC_CONT;
3676 struct llioc_data *data;
3677 int rc = -EINVAL, i;
3679 down_read(&llioc.ioc_sem);
3680 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3681 for (i = 0; i < data->iocd_count; i++) {
3682 if (cmd != data->iocd_cmd[i])
3685 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3689 if (ret == LLIOC_STOP)
3692 up_read(&llioc.ioc_sem);
/*
 * Apply a layout configuration operation (@conf) to the inode's
 * cl_object through cl_conf_set(), using a nested cl environment.
 * For OBJECT_CONF_SET the layout lock in conf->coc_lock must be a layout
 * lock (asserted).  The lock is then allowed to match, lli_has_smd is
 * updated from the new stripe metadata, and the cached layout generation
 * is set to the lsm's generation, or to LL_LAYOUT_GEN_EMPTY when there is
 * no lsm.
 * NOTE(review): the early return for a NULL lli_clob, the condition
 * guarding the post-SET update and the return are not visible in this
 * listing.
 */
3699 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3701 struct ll_inode_info *lli = ll_i2info(inode);
3702 struct cl_env_nest nest;
3707 if (lli->lli_clob == NULL)
3710 env = cl_env_nested_get(&nest);
3712 RETURN(PTR_ERR(env));
3714 result = cl_conf_set(env, lli->lli_clob, conf);
3715 cl_env_nested_put(&nest, env);
3717 if (conf->coc_opc == OBJECT_CONF_SET) {
3718 struct ldlm_lock *lock = conf->coc_lock;
3720 LASSERT(lock != NULL);
3721 LASSERT(ldlm_has_layout(lock));
3723 struct lustre_md *md = conf->u.coc_md;
3724 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3726 /* it can only be allowed to match after layout is
3727 * applied to inode otherwise false layout would be
3728 * seen. Applying layout should happen before dropping
3729 * the intent lock. */
3730 ldlm_lock_allow_match(lock);
3732 lli->lli_has_smd = lsm_has_objects(md->lsm);
3733 if (md->lsm != NULL)
3734 gen = md->lsm->lsm_layout_gen;
3737 DFID ": layout version change: %u -> %u\n",
3738 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3740 ll_layout_version_set(lli, gen);
/*
 * Nothing is fetched when the lock already carries LVB data and is marked
 * LVB-ready.  Otherwise the XATTR_NAME_LOV xattr is requested with
 * md_getxattr(), sized by ll_get_default_mdsize().  The returned EA is
 * copied into a freshly allocated buffer and installed as
 * lock->l_lvb_data / l_lvb_len under the resource lock, freeing any
 * previous buffer.  An empty EA (mbo_eadatasize == 0) is treated as an
 * empty layout.  Failures seen below: -EPROTO for a missing MDT body,
 * -EFAULT for missing EA data, -ENOMEM for a failed allocation.
 * NOTE(review): the return codes of ll_get_default_mdsize() and
 * md_getxattr() are checked on lines not visible in this listing.
 */
3746 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3747 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3750 struct ll_sb_info *sbi = ll_i2sbi(inode);
3751 struct obd_capa *oc;
3752 struct ptlrpc_request *req;
3753 struct mdt_body *body;
3760 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3761 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3762 lock->l_lvb_data, lock->l_lvb_len);
3764 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3767 /* if layout lock was granted right away, the layout is returned
3768 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3769 * blocked and then granted via completion ast, we have to fetch
3770 * layout here. Please note that we can't use the LVB buffer in
3771 * completion AST because it doesn't have a large enough buffer */
3772 oc = ll_mdscapa_get(inode);
3773 rc = ll_get_default_mdsize(sbi, &lmmsize);
3775 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3776 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3782 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3784 GOTO(out, rc = -EPROTO);
3786 lmmsize = body->mbo_eadatasize;
3787 if (lmmsize == 0) /* empty layout */
3790 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3792 GOTO(out, rc = -EFAULT);
3794 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3795 if (lvbdata == NULL)
3796 GOTO(out, rc = -ENOMEM);
3798 memcpy(lvbdata, lmm, lmmsize);
3799 lock_res_and_lock(lock);
3800 if (lock->l_lvb_data != NULL)
3801 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3803 lock->l_lvb_data = lvbdata;
3804 lock->l_lvb_len = lmmsize;
3805 unlock_res_and_lock(lock);
3810 ptlrpc_req_finished(req);
/*
 * Summary of the visible body:
 *  - resolves @lockh to the lock (must be a layout lock) and attaches the
 *    inode to it with md_set_lock_data();
 *  - if the LVB is already marked ready, or @reconf is false, *gen is
 *    read from the cached layout version;
 *  - otherwise the layout is fetched with ll_layout_fetch(), unpacked
 *    from lock->l_lvb_data with obd_unpackmd() and applied through
 *    ll_layout_conf(OBJECT_CONF_SET); *gen becomes the lsm's generation
 *    or LL_LAYOUT_GEN_EMPTY;
 *  - -EBUSY from applying the layout means the old layout is still in
 *    use: after the lock reference is dropped, the function waits with
 *    ll_layout_conf(OBJECT_CONF_WAIT);
 *  - the lock reference taken here and the caller's @mode reference on
 *    @lockh are both released.
 * NOTE(review): labels, several conditions and the return are not visible
 * in this listing.
 */
3815 * Apply the layout to the inode. Layout lock is held and will be released
3818 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3819 struct inode *inode, __u32 *gen, bool reconf)
3821 struct ll_inode_info *lli = ll_i2info(inode);
3822 struct ll_sb_info *sbi = ll_i2sbi(inode);
3823 struct ldlm_lock *lock;
3824 struct lustre_md md = { NULL };
3825 struct cl_object_conf conf;
3828 bool wait_layout = false;
3831 LASSERT(lustre_handle_is_used(lockh));
3833 lock = ldlm_handle2lock(lockh);
3834 LASSERT(lock != NULL);
3835 LASSERT(ldlm_has_layout(lock));
3837 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3838 PFID(&lli->lli_fid), inode, reconf);
3840 /* in case this is a caching lock and reinstate with new inode */
3841 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3843 lock_res_and_lock(lock);
3844 lvb_ready = ldlm_is_lvb_ready(lock);
3845 unlock_res_and_lock(lock);
3846 /* checking lvb_ready is racy but this is okay. The worst case is
3847 * that multiple processes may configure the file at the same time. */
3849 if (lvb_ready || !reconf) {
3852 /* layout_gen must be valid if layout lock is not
3853 * cancelled and stripe has already set */
3854 *gen = ll_layout_version_get(lli);
3860 rc = ll_layout_fetch(inode, lock);
3864 /* for layout lock, lmm is returned in lock's lvb.
3865 * lvb_data is immutable if the lock is held so it's safe to access it
3866 * without res lock. See the description in ldlm_lock_decref_internal()
3867 * for the condition to free lvb_data of layout lock */
3868 if (lock->l_lvb_data != NULL) {
3869 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3870 lock->l_lvb_data, lock->l_lvb_len);
3872 *gen = LL_LAYOUT_GEN_EMPTY;
3874 *gen = md.lsm->lsm_layout_gen;
3877 CERROR("%s: file "DFID" unpackmd error: %d\n",
3878 ll_get_fsname(inode->i_sb, NULL, 0),
3879 PFID(&lli->lli_fid), rc);
3885 /* set layout to file. Unlikely this will fail as old layout was
3886 * surely eliminated */
3887 memset(&conf, 0, sizeof conf);
3888 conf.coc_opc = OBJECT_CONF_SET;
3889 conf.coc_inode = inode;
3890 conf.coc_lock = lock;
3891 conf.u.coc_md = &md;
3892 rc = ll_layout_conf(inode, &conf);
3895 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3897 /* refresh layout failed, need to wait */
3898 wait_layout = rc == -EBUSY;
3902 LDLM_LOCK_PUT(lock);
3903 ldlm_lock_decref(lockh, mode);
3905 /* wait for IO to complete if it's still being used. */
3907 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3908 ll_get_fsname(inode->i_sb, NULL, 0),
3909 PFID(&lli->lli_fid), inode);
3911 memset(&conf, 0, sizeof conf);
3912 conf.coc_opc = OBJECT_CONF_WAIT;
3913 conf.coc_inode = inode;
3914 rc = ll_layout_conf(inode, &conf);
3918 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3919 ll_get_fsname(inode->i_sb, NULL, 0),
3920 PFID(&lli->lli_fid), rc);
/*
 * Summary of the visible body:
 *  - *gen is first read from the cached layout version; the check that
 *    follows tests whether layout locking is disabled for the mount
 *    (LL_SBI_LAYOUT_LOCK clear) or a generation is already known;
 *  - with lli_layout_mutex held, a cached LAYOUT lock is looked up with
 *    ll_take_md_lock(); on a hit the layout is applied through
 *    ll_layout_lock_set();
 *  - on a miss, an IT_LAYOUT intent is enqueued with md_enqueue(), the
 *    reply request is released, the granted mode is taken out of the
 *    intent, and the new lock is handed to ll_layout_lock_set().
 * NOTE(review): the comment inside the body says the cached lock is
 * matched "before grabbing layout lock mutex", but in the visible lines
 * mutex_lock() precedes ll_take_md_lock(); confirm the intended order.
 * NOTE(review): the retry logic, several conditions and the return are
 * not visible in this listing.
 */
3926 * This function checks if there exists a LAYOUT lock on the client side,
3927 * or enqueues it if it doesn't have one in cache.
3929 * This function will not hold layout lock so it may be revoked any time after
3930 * this function returns. Any operations depend on layout should be redone
3933 * This function should be called before lov_io_init() to get an uptodate
3934 * layout version, the caller should save the version number and after IO
3935 * is finished, this function should be called again to verify that layout
3936 * is not changed during IO time.
3938 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3940 struct ll_inode_info *lli = ll_i2info(inode);
3941 struct ll_sb_info *sbi = ll_i2sbi(inode);
3942 struct md_op_data *op_data;
3943 struct lookup_intent it;
3944 struct lustre_handle lockh;
3946 struct ldlm_enqueue_info einfo = {
3947 .ei_type = LDLM_IBITS,
3949 .ei_cb_bl = &ll_md_blocking_ast,
3950 .ei_cb_cp = &ldlm_completion_ast,
3955 *gen = ll_layout_version_get(lli);
3956 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3960 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3961 LASSERT(S_ISREG(inode->i_mode));
3963 /* take layout lock mutex to enqueue layout lock exclusively. */
3964 mutex_lock(&lli->lli_layout_mutex);
3967 /* mostly layout lock is caching on the local side, so try to match
3968 * it before grabbing layout lock mutex. */
3969 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3970 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3971 if (mode != 0) { /* hit cached lock */
3972 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3976 mutex_unlock(&lli->lli_layout_mutex);
3980 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3981 0, 0, LUSTRE_OPC_ANY, NULL);
3982 if (IS_ERR(op_data)) {
3983 mutex_unlock(&lli->lli_layout_mutex);
3984 RETURN(PTR_ERR(op_data));
3987 /* have to enqueue one */
3988 memset(&it, 0, sizeof(it));
3989 it.it_op = IT_LAYOUT;
3990 lockh.cookie = 0ULL;
3992 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3993 ll_get_fsname(inode->i_sb, NULL, 0),
3994 PFID(&lli->lli_fid), inode);
3996 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
3997 if (it.d.lustre.it_data != NULL)
3998 ptlrpc_req_finished(it.d.lustre.it_data);
3999 it.d.lustre.it_data = NULL;
4001 ll_finish_md_op_data(op_data);
4003 mode = it.d.lustre.it_lock_mode;
4004 it.d.lustre.it_lock_mode = 0;
4005 ll_intent_drop_lock(&it);
4008 /* set lock data in case this is a new lock */
4009 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4010 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4014 mutex_unlock(&lli->lli_layout_mutex);
4020 * This function sends a restore request to the MDT
4022 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4024 struct hsm_user_request *hur;
4028 len = sizeof(struct hsm_user_request) +
4029 sizeof(struct hsm_user_item);
4030 OBD_ALLOC(hur, len);
4034 hur->hur_request.hr_action = HUA_RESTORE;
4035 hur->hur_request.hr_archive_id = 0;
4036 hur->hur_request.hr_flags = 0;
4037 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4038 sizeof(hur->hur_user_item[0].hui_fid));
4039 hur->hur_user_item[0].hui_extent.offset = offset;
4040 hur->hur_user_item[0].hui_extent.length = length;
4041 hur->hur_request.hr_itemcount = 1;
4042 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,