4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
52 #include "cl_object.h"
55 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
57 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
60 static enum llioc_iter
61 ll_iocontrol_call(struct inode *inode, struct file *file,
62 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open-file ll_file_data from the ll_file_data_slab cache,
 * using GFP_NOFS to avoid filesystem recursion during reclaim.
 * NOTE(review): this excerpt is elided — the allocation-failure check and
 * the return of @fd are not visible here.
 */
64 static struct ll_file_data *ll_file_data_get(void)
66 struct ll_file_data *fd;
68 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
72 fd->fd_write_failed = false;
/* Return a ll_file_data (from ll_file_data_get()) to its slab cache. */
77 static void ll_file_data_put(struct ll_file_data *fd)
80 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes (mode, a/m/ctime, size, blocks,
 * flags, ioepoch), the MDS open handle @fh and the MDS capability into
 * @op_data for an outgoing MDS request.
 */
83 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
84 struct lustre_handle *fh)
86 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
87 op_data->op_attr.ia_mode = inode->i_mode;
88 op_data->op_attr.ia_atime = inode->i_atime;
89 op_data->op_attr.ia_mtime = inode->i_mtime;
90 op_data->op_attr.ia_ctime = inode->i_ctime;
91 op_data->op_attr.ia_size = i_size_read(inode);
92 op_data->op_attr_blocks = inode->i_blocks;
93 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
94 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
96 op_data->op_handle = *fh;
97 op_data->op_capa1 = ll_mdscapa_get(inode);
/* If local data was modified, ask the MDS to note it on this request. */
99 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
100 op_data->op_bias |= MDS_DATA_MODIFIED;
104 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Prepare @op_data for closing @och on @inode: mark which attributes are
 * valid, close the IO epoch, and pack the inode attributes + open handle.
 * NOTE(review): excerpt is elided — the body of the !FMODE_WRITE test
 * (presumably an early goto/skip) is not visible here.
 */
107 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
108 struct obd_client_handle *och)
112 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
113 ATTR_MTIME | ATTR_MTIME_SET |
114 ATTR_CTIME | ATTR_CTIME_SET;
116 if (!(och->och_flags & FMODE_WRITE))
/* Size/blocks are only trusted from the client when SOM is off or the
 * file is not regular. */
119 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
120 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
122 ll_ioepoch_close(inode, op_data, &och, 0);
125 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
126 ll_prep_md_op_data(op_data, inode, NULL, NULL,
127 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS close for open handle @och on @inode. When @data_version is
 * non-NULL this is an HSM release close (MDS_HSM_RELEASE bias). Handles the
 * Size-on-MDS update path, clears LLIF_DATA_MODIFIED on success, destroys
 * OST objects attached to the close request, and finally invalidates @och.
 * NOTE(review): excerpt is elided — several condition bodies, the 'out'
 * label and the RETURN are not visible here.
 */
131 static int ll_close_inode_openhandle(struct obd_export *md_exp,
133 struct obd_client_handle *och,
134 const __u64 *data_version)
136 struct obd_export *exp = ll_i2mdexp(inode);
137 struct md_op_data *op_data;
138 struct ptlrpc_request *req = NULL;
139 struct obd_device *obd = class_exp2obd(exp);
146 * XXX: in case of LMV, is this correct to access
149 CERROR("Invalid MDC connection handle "LPX64"\n",
150 ll_i2mdexp(inode)->exp_handle.h_cookie);
154 OBD_ALLOC_PTR(op_data);
156 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
158 ll_prepare_close(inode, op_data, och);
159 if (data_version != NULL) {
160 /* Pass in data_version implies release. */
161 op_data->op_bias |= MDS_HSM_RELEASE;
162 op_data->op_data_version = *data_version;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
166 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
167 rc = md_close(md_exp, op_data, och->och_mod, &req);
169 /* This close must have the epoch closed. */
170 LASSERT(epoch_close);
171 /* MDS has instructed us to obtain Size-on-MDS attribute from
172 * OSTs and send setattr to back to MDS. */
173 rc = ll_som_update(inode, op_data);
175 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
176 " failed: rc = %d\n",
177 ll_i2mdexp(inode)->exp_obd->obd_name,
178 PFID(ll_inode2fid(inode)), rc);
182 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
183 ll_i2mdexp(inode)->exp_obd->obd_name,
184 PFID(ll_inode2fid(inode)), rc);
187 /* DATA_MODIFIED flag was successfully sent on close, cancel data
188 * modification flag. */
189 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
190 struct ll_inode_info *lli = ll_i2info(inode);
192 spin_lock(&lli->lli_lock);
193 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
194 spin_unlock(&lli->lli_lock);
198 rc = ll_objects_destroy(req, inode);
200 CERROR("%s: inode "DFID
201 " ll_objects destroy: rc = %d\n",
202 ll_i2mdexp(inode)->exp_obd->obd_name,
203 PFID(ll_inode2fid(inode)), rc);
/* HSM release: verify the MDS actually released the file. */
206 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
207 struct mdt_body *body;
208 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
209 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
213 ll_finish_md_op_data(op_data);
/* SOM-enabled regular file closed for write without closing the epoch:
 * queue the DONE_WRITING work. */
217 if (exp_connect_som(exp) && !epoch_close &&
218 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
219 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
221 md_clear_open_replay_data(md_exp, och);
222 /* Free @och if it is not waiting for DONE_WRITING. */
223 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
226 if (req) /* This is close request */
227 ptlrpc_req_finished(req);
/*
 * Drop the last reference on the per-inode MDS open handle matching @fmode
 * (write/exec/read) and, if no users remain, close it on the MDS via
 * ll_close_inode_openhandle().
 * NOTE(review): excerpt is elided — the handle-swap under lli_och_mutex and
 * the final RETURN are not visible here.
 */
231 int ll_md_real_close(struct inode *inode, fmode_t fmode)
233 struct ll_inode_info *lli = ll_i2info(inode);
234 struct obd_client_handle **och_p;
235 struct obd_client_handle *och;
/* Select the handle slot and usecount matching the open mode. */
240 if (fmode & FMODE_WRITE) {
241 och_p = &lli->lli_mds_write_och;
242 och_usecount = &lli->lli_open_fd_write_count;
243 } else if (fmode & FMODE_EXEC) {
244 och_p = &lli->lli_mds_exec_och;
245 och_usecount = &lli->lli_open_fd_exec_count;
247 LASSERT(fmode & FMODE_READ);
248 och_p = &lli->lli_mds_read_och;
249 och_usecount = &lli->lli_open_fd_read_count;
252 mutex_lock(&lli->lli_och_mutex);
253 if (*och_usecount > 0) {
254 /* There are still users of this handle, so skip
256 mutex_unlock(&lli->lli_och_mutex);
262 mutex_unlock(&lli->lli_och_mutex);
265 /* There might be a race and this handle may already
267 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop group lock and lease if held, close a
 * private open handle, decrement the per-mode open counts, and only talk to
 * the MDS (ll_md_real_close) when no matching OPEN lock lets us skip it.
 * Finally detach and free the ll_file_data.
 * NOTE(review): excerpt is elided — some branch bodies and the RETURN are
 * not visible here.
 */
274 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
277 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
278 struct ll_inode_info *lli = ll_i2info(inode);
282 /* clear group lock, if present */
283 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
284 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
286 if (fd->fd_lease_och != NULL) {
289 /* Usually the lease is not released when the
290 * application crashed, we need to release here. */
291 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
292 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
293 PFID(&lli->lli_fid), rc, lease_broken);
295 fd->fd_lease_och = NULL;
298 if (fd->fd_och != NULL) {
299 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
304 /* Let's see if we have good enough OPEN lock on the file and if
305 we can skip talking to MDS */
306 if (file->f_dentry->d_inode) { /* Can this ever be false? */
308 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
309 struct lustre_handle lockh;
310 struct inode *inode = file->f_dentry->d_inode;
311 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
313 mutex_lock(&lli->lli_och_mutex);
314 if (fd->fd_omode & FMODE_WRITE) {
316 LASSERT(lli->lli_open_fd_write_count);
317 lli->lli_open_fd_write_count--;
318 } else if (fd->fd_omode & FMODE_EXEC) {
320 LASSERT(lli->lli_open_fd_exec_count);
321 lli->lli_open_fd_exec_count--;
324 LASSERT(lli->lli_open_fd_read_count);
325 lli->lli_open_fd_read_count--;
327 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN lock: must do a real MDS close. */
329 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
330 LDLM_IBITS, &policy, lockmode,
332 rc = ll_md_real_close(file->f_dentry->d_inode,
336 CERROR("released file has negative dentry: file = %p, "
337 "dentry = %p, name = %s\n",
338 file, file->f_dentry, file->f_dentry->d_name.name);
342 LUSTRE_FPRIVATE(file) = NULL;
343 ll_file_data_put(fd);
344 ll_capa_close(inode);
349 /* While this returns an error code, fput() the caller does not, so we need
350 * to make every effort to clean up all of our state here. Also, applications
351 * rarely check close errors and even if an error is returned they will not
352 * re-try the close call.
/*
 * VFS ->release() for Lustre files: tear down remote-ACL state for the root
 * on remote clients, deauthorize statahead, clear async write errors on
 * regular files, and perform the MDS close via ll_md_close().
 * NOTE(review): excerpt is elided — the fd NULL check, root-dentry early
 * return body and final RETURN are not visible here.
 */
354 int ll_file_release(struct inode *inode, struct file *file)
356 struct ll_file_data *fd;
357 struct ll_sb_info *sbi = ll_i2sbi(inode);
358 struct ll_inode_info *lli = ll_i2info(inode);
362 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
363 PFID(ll_inode2fid(inode)), inode);
365 #ifdef CONFIG_FS_POSIX_ACL
/* Remote client closing the fs root: drop its remote-ACL bookkeeping. */
366 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
367 inode == inode->i_sb->s_root->d_inode) {
368 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
371 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
372 fd->fd_flags &= ~LL_FILE_RMTACL;
373 rct_del(&sbi->ll_rct, current_pid());
374 et_search_free(&sbi->ll_et, current_pid());
379 if (inode->i_sb->s_root != file->f_dentry)
380 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
381 fd = LUSTRE_FPRIVATE(file);
384 /* The last ref on @file, maybe not the the owner pid of statahead,
385 * because parent and child process can share the same file handle. */
386 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
387 ll_deauthorize_statahead(inode, fd);
/* The fs root has no MDS open handle: just free the fd and return. */
389 if (inode->i_sb->s_root == file->f_dentry) {
390 LUSTRE_FPRIVATE(file) = NULL;
391 ll_file_data_put(fd);
395 if (!S_ISDIR(inode->i_mode)) {
396 if (lli->lli_clob != NULL)
397 lov_read_and_clear_async_rc(lli->lli_clob);
398 lli->lli_async_rc = 0;
401 rc = ll_md_close(sbi->ll_md_exp, inode, file);
403 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
404 libcfs_debug_dumplog();
/*
 * Send an IT_OPEN intent lock request to the MDS for @file, optionally
 * packing the striping layout @lmm. On success updates the inode from the
 * reply and records the granted lock; on a stale/negative reply releases
 * the open handle instead of flooding the log.
 * NOTE(review): excerpt is elided — error-path GOTOs, the 'out' label and
 * the RETURN are not visible here.
 */
409 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
410 struct lookup_intent *itp)
412 struct dentry *de = file->f_dentry;
413 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
414 struct dentry *parent = de->d_parent;
415 const char *name = NULL;
417 struct md_op_data *op_data;
418 struct ptlrpc_request *req = NULL;
422 LASSERT(parent != NULL);
423 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
425 /* if server supports open-by-fid, or file name is invalid, don't pack
426 * name in open request */
427 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
428 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
429 name = de->d_name.name;
430 len = de->d_name.len;
433 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
434 name, len, 0, LUSTRE_OPC_ANY, NULL);
436 RETURN(PTR_ERR(op_data));
437 op_data->op_data = lmm;
438 op_data->op_data_size = lmmsize;
440 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
441 &ll_md_blocking_ast, 0);
442 ll_finish_md_op_data(op_data);
444 /* reason for keep own exit path - don`t flood log
445 * with messages with -ESTALE errors.
447 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
448 it_open_error(DISP_OPEN_OPEN, itp))
450 ll_release_openhandle(de, itp);
454 if (it_disposition(itp, DISP_LOOKUP_NEG))
455 GOTO(out, rc = -ENOENT);
457 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
458 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
459 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
463 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
464 if (!rc && itp->d.lustre.it_lock_mode)
465 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
468 ptlrpc_req_finished(req);
469 ll_intent_drop_lock(itp);
475 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
476 * not believe attributes if a few ioepoch holders exist. Attributes for
477 * previous ioepoch if new one is opened are also skipped by MDS.
/*
 * Record a newly obtained @ioepoch on the inode (ignoring epoch 0 and
 * repeats of the current epoch). Per the original header comment, no lock
 * is needed because the MDS distrusts attributes with multiple holders.
 */
479 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
481 if (ioepoch && lli->lli_ioepoch != ioepoch) {
482 lli->lli_ioepoch = ioepoch;
483 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
484 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill @och from the MDS open reply attached to intent @it (file handle,
 * fid, lease lock cookie, flags) and register it for open replay.
 * Returns the result of md_set_open_replay_data().
 */
488 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
489 struct obd_client_handle *och)
491 struct ptlrpc_request *req = it->d.lustre.it_data;
492 struct mdt_body *body;
494 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
495 och->och_fh = body->mbo_handle;
496 och->och_fid = body->mbo_fid1;
497 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
498 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
499 och->och_flags = it->it_flags;
501 return md_set_open_replay_data(md_exp, och, it);
/*
 * Complete the client-side part of an open: optionally fill @och from the
 * intent reply and open the IO epoch, then attach @fd to the struct file
 * and initialize readahead and cl_io context state.
 * NOTE(review): excerpt is elided — the och NULL test guarding the fill and
 * the RETURN are not visible here.
 */
504 static int ll_local_open(struct file *file, struct lookup_intent *it,
505 struct ll_file_data *fd, struct obd_client_handle *och)
507 struct inode *inode = file->f_dentry->d_inode;
508 struct ll_inode_info *lli = ll_i2info(inode);
511 LASSERT(!LUSTRE_FPRIVATE(file));
516 struct ptlrpc_request *req = it->d.lustre.it_data;
517 struct mdt_body *body;
520 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
524 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
525 ll_ioepoch_open(lli, body->mbo_ioepoch);
528 LUSTRE_FPRIVATE(file) = fd;
529 ll_readahead_init(inode, &fd->fd_ras);
530 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
532 /* ll_cl_context initialize */
533 rwlock_init(&fd->fd_lock);
534 INIT_LIST_HEAD(&fd->fd_lccs);
539 /* Open a file, and (for the very first open) create objects on the OSTs at
540 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
541 * creation or open until ll_lov_setstripe() ioctl is called.
543 * If we already have the stripe MD locally then we don't request it in
544 * md_open(), by passing a lmm_size = 0.
546 * It is up to the application to ensure no other processes open this file
547 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
548 * used. We might be able to avoid races of that sort by getting lli_open_sem
549 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
550 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() for Lustre files. Reuses a cached per-inode MDS open handle
 * when one exists for the requested mode; otherwise builds an IT_OPEN
 * intent (or reuses the one stashed in file->private_data by lookup) and
 * performs the MDS open, then finishes with ll_local_open(). Also handles
 * O_LOV_DELAY_CREATE / delayed object creation for regular files.
 * NOTE(review): excerpt is heavily elided — 'restart:', 'out_och_free:' /
 * 'out_openerr:' labels, several GOTO bodies and the RETURN are not
 * visible here.
 */
552 int ll_file_open(struct inode *inode, struct file *file)
554 struct ll_inode_info *lli = ll_i2info(inode);
555 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
556 .it_flags = file->f_flags };
557 struct obd_client_handle **och_p = NULL;
558 __u64 *och_usecount = NULL;
559 struct ll_file_data *fd;
563 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
564 PFID(ll_inode2fid(inode)), inode, file->f_flags);
566 it = file->private_data; /* XXX: compat macro */
567 file->private_data = NULL; /* prevent ll_local_open assertion */
569 fd = ll_file_data_get();
571 GOTO(out_openerr, rc = -ENOMEM);
574 if (S_ISDIR(inode->i_mode))
575 ll_authorize_statahead(inode, fd);
/* The fs root needs no MDS open handle. */
577 if (inode->i_sb->s_root == file->f_dentry) {
578 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN flags. */
582 if (!it || !it->d.lustre.it_disposition) {
583 /* Convert f_flags into access mode. We cannot use file->f_mode,
584 * because everything but O_ACCMODE mask was stripped from
586 if ((oit.it_flags + 1) & O_ACCMODE)
588 if (file->f_flags & O_TRUNC)
589 oit.it_flags |= FMODE_WRITE;
591 /* kernel only call f_op->open in dentry_open. filp_open calls
592 * dentry_open after call to open_namei that checks permissions.
593 * Only nfsd_open call dentry_open directly without checking
594 * permissions and because of that this code below is safe. */
595 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
596 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
598 /* We do not want O_EXCL here, presumably we opened the file
599 * already? XXX - NFS implications? */
600 oit.it_flags &= ~O_EXCL;
602 /* bug20584, if "it_flags" contains O_CREAT, the file will be
603 * created if necessary, then "IT_CREAT" should be set to keep
604 * consistent with it */
605 if (oit.it_flags & O_CREAT)
606 oit.it_op |= IT_CREAT;
612 /* Let's see if we have file open on MDS already. */
613 if (it->it_flags & FMODE_WRITE) {
614 och_p = &lli->lli_mds_write_och;
615 och_usecount = &lli->lli_open_fd_write_count;
616 } else if (it->it_flags & FMODE_EXEC) {
617 och_p = &lli->lli_mds_exec_och;
618 och_usecount = &lli->lli_open_fd_exec_count;
620 och_p = &lli->lli_mds_read_och;
621 och_usecount = &lli->lli_open_fd_read_count;
624 mutex_lock(&lli->lli_och_mutex);
625 if (*och_p) { /* Open handle is present */
626 if (it_disposition(it, DISP_OPEN_OPEN)) {
627 /* Well, there's extra open request that we do not need,
628 let's close it somehow. This will decref request. */
629 rc = it_open_error(DISP_OPEN_OPEN, it);
631 mutex_unlock(&lli->lli_och_mutex);
632 GOTO(out_openerr, rc);
635 ll_release_openhandle(file->f_dentry, it);
639 rc = ll_local_open(file, it, fd, NULL);
642 mutex_unlock(&lli->lli_och_mutex);
643 GOTO(out_openerr, rc);
646 LASSERT(*och_usecount == 0);
647 if (!it->d.lustre.it_disposition) {
648 /* We cannot just request lock handle now, new ELC code
649 means that one of other OPEN locks for this file
650 could be cancelled, and since blocking ast handler
651 would attempt to grab och_mutex as well, that would
652 result in a deadlock */
653 mutex_unlock(&lli->lli_och_mutex);
655 * Normally called under two situations:
657 * 2. A race/condition on MDS resulting in no open
658 * handle to be returned from LOOKUP|OPEN request,
659 * for example if the target entry was a symlink.
661 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
663 * Always specify MDS_OPEN_BY_FID because we don't want
664 * to get file with different fid.
666 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
667 rc = ll_intent_file_open(file, NULL, 0, it);
669 GOTO(out_openerr, rc);
673 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
675 GOTO(out_och_free, rc = -ENOMEM);
679 /* md_intent_lock() didn't get a request ref if there was an
680 * open error, so don't do cleanup on the request here
682 /* XXX (green): Should not we bail out on any error here, not
683 * just open error? */
684 rc = it_open_error(DISP_OPEN_OPEN, it);
686 GOTO(out_och_free, rc);
688 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
689 "inode %p: disposition %x, status %d\n", inode,
690 it_disposition(it, ~0), it->d.lustre.it_status);
692 rc = ll_local_open(file, it, fd, *och_p);
694 GOTO(out_och_free, rc);
696 mutex_unlock(&lli->lli_och_mutex);
699 /* Must do this outside lli_och_mutex lock to prevent deadlock where
700 different kind of OPEN lock for this same inode gets cancelled
701 by ldlm_cancel_lru */
702 if (!S_ISREG(inode->i_mode))
703 GOTO(out_och_free, rc);
/* Delay OST object creation for O_LOV_DELAY_CREATE or read-only opens
 * of files with no striping yet. */
707 if (!lli->lli_has_smd &&
708 (cl_is_lov_delay_create(file->f_flags) ||
709 (file->f_mode & FMODE_WRITE) == 0)) {
710 CDEBUG(D_INODE, "object creation was delayed\n");
711 GOTO(out_och_free, rc);
713 cl_lov_delay_create_clear(&file->f_flags);
714 GOTO(out_och_free, rc);
/* Error cleanup: free an allocated but unused open handle. */
718 if (och_p && *och_p) {
719 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
720 *och_p = NULL; /* OBD_FREE writes some magic there */
723 mutex_unlock(&lli->lli_och_mutex);
726 if (lli->lli_opendir_key == fd)
727 ll_deauthorize_statahead(inode, fd);
729 ll_file_data_put(fd);
731 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the open-request reference held by the intent, if any. */
734 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
735 ptlrpc_req_finished(it->d.lustre.it_data);
736 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING, cancel the lease lock
 * asynchronously; unlike ll_md_blocking_ast it does not touch the open
 * handle (see the comment at the ll_lease_open() call site).
 * NOTE(review): excerpt is elided — the CANCELING branch body and RETURN
 * are not visible here.
 */
742 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
743 struct ldlm_lock_desc *desc, void *data, int flag)
746 struct lustre_handle lockh;
750 case LDLM_CB_BLOCKING:
751 ldlm_lock2handle(lock, &lockh);
752 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
754 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
758 case LDLM_CB_CANCELING:
766 * Acquire a lease and open the file.
/*
 * Acquire an open lease (MDS_OPEN_LEASE) on @inode for @fmode (exactly
 * FMODE_READ or FMODE_WRITE). If @file is given, the existing open handle
 * is passed to the MDT (op_handle) so the lease attaches to the same owner;
 * this is only allowed when the caller is the sole opener. Returns the new
 * obd_client_handle, or ERR_PTR on failure; on error paths the open lock is
 * cancelled and the handle closed.
 * NOTE(review): excerpt is elided — och allocation, 'out'/'out_close'/
 * 'out_release_it' labels and the RETURNs are only partially visible.
 */
768 static struct obd_client_handle *
769 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
772 struct lookup_intent it = { .it_op = IT_OPEN };
773 struct ll_sb_info *sbi = ll_i2sbi(inode);
774 struct md_op_data *op_data;
775 struct ptlrpc_request *req = NULL;
776 struct lustre_handle old_handle = { 0 };
777 struct obd_client_handle *och = NULL;
782 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
783 RETURN(ERR_PTR(-EINVAL));
786 struct ll_inode_info *lli = ll_i2info(inode);
787 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
788 struct obd_client_handle **och_p;
/* Requested mode must match the file's open mode; exec is excluded. */
791 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
792 RETURN(ERR_PTR(-EPERM));
794 /* Get the openhandle of the file */
796 mutex_lock(&lli->lli_och_mutex);
797 if (fd->fd_lease_och != NULL) {
798 mutex_unlock(&lli->lli_och_mutex);
802 if (fd->fd_och == NULL) {
803 if (file->f_mode & FMODE_WRITE) {
804 LASSERT(lli->lli_mds_write_och != NULL);
805 och_p = &lli->lli_mds_write_och;
806 och_usecount = &lli->lli_open_fd_write_count;
808 LASSERT(lli->lli_mds_read_och != NULL);
809 och_p = &lli->lli_mds_read_och;
810 och_usecount = &lli->lli_open_fd_read_count;
812 if (*och_usecount == 1) {
819 mutex_unlock(&lli->lli_och_mutex);
820 if (rc < 0) /* more than 1 opener */
823 LASSERT(fd->fd_och != NULL);
824 old_handle = fd->fd_och->och_fh;
829 RETURN(ERR_PTR(-ENOMEM));
831 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
832 LUSTRE_OPC_ANY, NULL);
834 GOTO(out, rc = PTR_ERR(op_data));
836 /* To tell the MDT this openhandle is from the same owner */
837 op_data->op_handle = old_handle;
839 it.it_flags = fmode | open_flags;
840 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
841 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
842 &ll_md_blocking_lease_ast,
843 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
844 * it can be cancelled which may mislead applications that the lease is
846 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
847 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
848 * doesn't deal with openhandle, so normal openhandle will be leaked. */
849 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
850 ll_finish_md_op_data(op_data);
851 ptlrpc_req_finished(req);
853 GOTO(out_release_it, rc);
855 if (it_disposition(&it, DISP_LOOKUP_NEG))
856 GOTO(out_release_it, rc = -ENOENT);
858 rc = it_open_error(DISP_OPEN_OPEN, &it);
860 GOTO(out_release_it, rc);
862 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
863 ll_och_fill(sbi->ll_md_exp, &it, och);
865 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
866 GOTO(out_close, rc = -EOPNOTSUPP);
868 /* already get lease, handle lease lock */
869 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
870 if (it.d.lustre.it_lock_mode == 0 ||
871 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
872 /* open lock must return for lease */
873 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
874 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
875 it.d.lustre.it_lock_bits);
876 GOTO(out_close, rc = -EPROTO);
879 ll_intent_release(&it);
883 /* Cancel open lock */
884 if (it.d.lustre.it_lock_mode != 0) {
885 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
886 it.d.lustre.it_lock_mode);
887 it.d.lustre.it_lock_mode = 0;
888 och->och_lease_handle.cookie = 0ULL;
890 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
892 CERROR("%s: error closing file "DFID": %d\n",
893 ll_get_fsname(inode->i_sb, NULL, 0),
894 PFID(&ll_i2info(inode)->lli_fid), rc2);
895 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
897 ll_intent_release(&it);
905 * Release lease and close the file.
906 * It will check if the lease has ever broken.
/*
 * Release a lease: check whether the lease lock was already cancelled
 * (i.e. the lease was broken), cancel it otherwise, report the broken
 * state via @lease_broken, then close the open handle on the MDS.
 * NOTE(review): excerpt is elided — the lock NULL check, LDLM_LOCK_PUT and
 * the RETURN are not visible here.
 */
908 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
911 struct ldlm_lock *lock;
912 bool cancelled = true;
916 lock = ldlm_handle2lock(&och->och_lease_handle);
918 lock_res_and_lock(lock);
919 cancelled = ldlm_is_cancel(lock);
920 unlock_res_and_lock(lock);
924 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
925 PFID(&ll_i2info(inode)->lli_fid), cancelled);
928 ldlm_cli_cancel(&och->och_lease_handle, 0);
929 if (lease_broken != NULL)
930 *lease_broken = cancelled;
932 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
937 /* Fills the obdo with the attributes for the lsm */
/*
 * Fetch OST attributes for stripe MD @lsm into @obdo via an async getattr
 * over @exp, waiting synchronously on a private request set. @dv_flags
 * (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH) request server-side lock flushing for
 * data-version purposes; WR_FLUSH success is verified via OBD_FL_FLUSH.
 * NOTE(review): excerpt is elided — the oinfo.oi_oa setup, set NULL check
 * and RETURN are only partially visible.
 */
938 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
939 struct obd_capa *capa, struct obdo *obdo,
940 __u64 ioepoch, int dv_flags)
942 struct ptlrpc_request_set *set;
943 struct obd_info oinfo = { { { 0 } } };
948 LASSERT(lsm != NULL);
952 oinfo.oi_oa->o_oi = lsm->lsm_oi;
953 oinfo.oi_oa->o_mode = S_IFREG;
954 oinfo.oi_oa->o_ioepoch = ioepoch;
955 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
956 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
957 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
958 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
959 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
960 OBD_MD_FLDATAVERSION;
961 oinfo.oi_capa = capa;
962 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
963 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
964 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
965 if (dv_flags & LL_DV_WR_FLUSH)
966 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
969 set = ptlrpc_prep_set();
971 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
974 rc = obd_getattr_async(exp, &oinfo, set);
976 rc = ptlrpc_set_wait(set);
977 ptlrpc_set_destroy(set);
/* Keep only the attribute bits callers consume from the reply. */
980 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
981 OBD_MD_FLATIME | OBD_MD_FLMTIME |
982 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
983 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* WR_FLUSH was requested but the server did not confirm the flush. */
984 if (dv_flags & LL_DV_WR_FLUSH &&
985 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
986 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
993 * Performs the getattr on the inode and updates its fields.
994 * If @sync != 0, perform the getattr under the server-side lock.
/*
 * Refresh @inode's attributes from the OSTs into @obdo (via
 * ll_lsm_getattr) and fold them back into the inode. @sync requests the
 * getattr under a server-side read-flush lock.
 * NOTE(review): excerpt is elided — the rc check, capa_put and RETURN are
 * not visible here.
 */
996 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
997 __u64 ioepoch, int sync)
999 struct obd_capa *capa = ll_mdscapa_get(inode);
1000 struct lov_stripe_md *lsm;
1004 lsm = ccc_inode_lsm_get(inode);
1005 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1006 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1009 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
1011 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1012 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1013 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1014 (unsigned long long)inode->i_blocks,
1015 1UL << inode->i_blkbits);
1017 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (lli_lvb) with OST attributes obtained via
 * cl_object_attr_get(), keeping the newest of each timestamp, and update
 * the inode's size and block count under the inode size lock.
 * NOTE(review): excerpt is elided — the rc check around the merge and the
 * RETURN are not visible here.
 */
1021 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1023 struct ll_inode_info *lli = ll_i2info(inode);
1024 struct cl_object *obj = lli->lli_clob;
1025 struct cl_attr *attr = ccc_env_thread_attr(env);
1031 ll_inode_size_lock(inode);
1032 /* merge timestamps the most recently obtained from mds with
1033 timestamps obtained from osts */
1034 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1035 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1036 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1038 lvb.lvb_size = i_size_read(inode);
1039 lvb.lvb_blocks = inode->i_blocks;
1040 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1041 lvb.lvb_atime = LTIME_S(inode->i_atime);
1042 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1044 cl_object_attr_lock(obj);
1045 rc = cl_object_attr_get(env, obj, attr);
1046 cl_object_attr_unlock(obj);
/* Take the newer of MDS vs. OST timestamps. */
1049 if (lvb.lvb_atime < attr->cat_atime)
1050 lvb.lvb_atime = attr->cat_atime;
1051 if (lvb.lvb_ctime < attr->cat_ctime)
1052 lvb.lvb_ctime = attr->cat_ctime;
1053 if (lvb.lvb_mtime < attr->cat_mtime)
1054 lvb.lvb_mtime = attr->cat_mtime;
1056 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1057 PFID(&lli->lli_fid), attr->cat_size);
1058 cl_isize_write_nolock(inode, attr->cat_size);
1060 inode->i_blocks = attr->cat_blocks;
1062 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1063 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1064 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1066 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctls: fetch OST attributes for @lsm and copy size,
 * blocks and timestamps into the user-visible stat structure @st.
 * NOTE(review): excerpt is elided — the rc check and RETURN are not
 * visible here.
 */
1071 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1074 struct obdo obdo = { 0 };
1077 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1079 st->st_size = obdo.o_size;
1080 st->st_blocks = obdo.o_blocks;
1081 st->st_mtime = obdo.o_mtime;
1082 st->st_atime = obdo.o_atime;
1083 st->st_ctime = obdo.o_ctime;
/*
 * True if atime updates should be skipped for @file, checking the same
 * conditions as the kernel's file_accessed()/touch_atime(): O_NOATIME,
 * S_NOATIME, filesystem-level NOATIME, mount flags, and the nodiratime
 * variants for directories.
 * NOTE(review): excerpt is elided — the 'return true/false' statements
 * between the checks are not visible here.
 */
1088 static bool file_is_noatime(const struct file *file)
1090 const struct vfsmount *mnt = file->f_path.mnt;
1091 const struct inode *inode = file->f_path.dentry->d_inode;
1093 /* Adapted from file_accessed() and touch_atime().*/
1094 if (file->f_flags & O_NOATIME)
1097 if (inode->i_flags & S_NOATIME)
1100 if (IS_NOATIME(inode))
1103 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1106 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1109 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for @file: propagate O_NONBLOCK/O_APPEND/O_SYNC/
 * O_DIRECT into the rw parameters, attach the cl_object, choose the lock
 * requirement (never for nolock files, mandatory for append, maybe
 * otherwise), and record the noatime decision.
 */
1115 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1117 struct inode *inode = file->f_dentry->d_inode;
1119 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1121 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1122 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1123 file->f_flags & O_DIRECT ||
1126 io->ci_obj = ll_i2info(inode)->lli_clob;
1127 io->ci_lockreq = CILR_MAYBE;
1128 if (ll_file_nolock(file)) {
1129 io->ci_lockreq = CILR_NEVER;
1130 io->ci_no_srvlock = 1;
1131 } else if (file->f_flags & O_APPEND) {
1132 io->ci_lockreq = CILR_MANDATORY;
1135 io->ci_noatime = file_is_noatime(file);
/*
 * Generic read/write engine shared by the aio/splice entry points: set up
 * a cl_io for @iot at @ppos/@count, take the per-file range lock for
 * non-grouplock writes, run cl_io_loop(), and account bytes in the
 * per-mount stats. Restarts the IO when cl_io reports ci_need_restart with
 * nothing transferred yet.
 * NOTE(review): excerpt is elided — the 'restart:' label, IO_SPLICE case
 * labels, range-unlock condition and RETURN are only partially visible.
 */
1139 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1140 struct file *file, enum cl_io_type iot,
1141 loff_t *ppos, size_t count)
1143 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1144 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1147 struct range_lock range;
1150 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1151 file->f_dentry->d_name.name, iot, *ppos, count);
1154 io = ccc_env_thread_io(env);
1155 ll_io_init(io, file, iot == CIT_WRITE);
1157 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1158 struct vvp_io *vio = vvp_env_io(env);
1159 struct ccc_io *cio = ccc_env_io(env);
1160 bool range_locked = false;
/* Append writes lock to EOF; otherwise lock just [*ppos, end). */
1162 if (file->f_flags & O_APPEND)
1163 range_lock_init(&range, 0, LUSTRE_EOF);
1165 range_lock_init(&range, *ppos, *ppos + count - 1);
1166 cio->cui_fd = LUSTRE_FPRIVATE(file);
1167 vio->cui_io_subtype = args->via_io_subtype;
1169 switch (vio->cui_io_subtype) {
1171 cio->cui_iov = args->u.normal.via_iov;
1172 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1173 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1174 cio->cui_iocb = args->u.normal.via_iocb;
/* Writes without a group lock serialize via the range lock tree. */
1175 if ((iot == CIT_WRITE) &&
1176 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1177 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1179 result = range_lock(&lli->lli_write_tree,
1184 range_locked = true;
1186 down_read(&lli->lli_trunc_sem);
1189 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1190 vio->u.splice.cui_flags = args->u.splice.via_flags;
1193 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1197 ll_cl_add(file, env, io);
1198 result = cl_io_loop(env, io);
1199 ll_cl_remove(file, env);
1201 if (args->via_io_subtype == IO_NORMAL)
1202 up_read(&lli->lli_trunc_sem);
1204 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1206 range_unlock(&lli->lli_write_tree, &range);
1209 /* cl_io_rw_init() handled IO */
1210 result = io->ci_result;
1213 if (io->ci_nob > 0) {
1214 result = io->ci_nob;
1215 *ppos = io->u.ci_wr.wr.crw_pos;
1219 cl_io_fini(env, io);
1220 /* If any bit been read/written (result != 0), we just return
1221 * short read/write instead of restart io. */
1222 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1223 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1224 iot == CIT_READ ? "read" : "write",
1225 file->f_dentry->d_name.name, *ppos, count);
1226 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1230 if (iot == CIT_READ) {
1232 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1233 LPROC_LL_READ_BYTES, result);
1234 } else if (iot == CIT_WRITE) {
1236 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1237 LPROC_LL_WRITE_BYTES, result);
1238 fd->fd_write_failed = false;
1239 } else if (result != -ERESTARTSYS) {
1240 fd->fd_write_failed = true;
1243 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
/*
 * Validate a user-supplied iovec array: accumulate the total byte count
 * into *count and truncate *nr_segs at the first segment whose buffer is
 * not readable.  Rejects negative lengths and cumulative overflow.
 * NOTE(review): listing elided; visible logic mirrors the kernel helper
 * named in the XXX comment below.
 */
1250 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1252 static int ll_file_get_iov_count(const struct iovec *iov,
1253 unsigned long *nr_segs, size_t *count)
1258 for (seg = 0; seg < *nr_segs; seg++) {
1259 const struct iovec *iv = &iov[seg];
1262 * If any segment has a negative length, or the cumulative
1263 * length ever wraps negative then return -EINVAL.
1266 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* Segment readable from user space: accept it and continue. */
1268 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1273 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio_read entry point: validate the iovec, grab a cl_env, fill the
 * IO_NORMAL args and delegate to ll_file_io_generic() with CIT_READ.
 * Position is taken from and written back through iocb->ki_pos.
 * NOTE(review): listing elided.
 */
1280 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1281 unsigned long nr_segs, loff_t pos)
1284 struct vvp_io_args *args;
1290 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1294 env = cl_env_get(&refcheck);
1296 RETURN(PTR_ERR(env));
1298 args = vvp_env_args(env, IO_NORMAL);
/* const cast: generic engine stores a mutable iov pointer. */
1299 args->u.normal.via_iov = (struct iovec *)iov;
1300 args->u.normal.via_nrsegs = nr_segs;
1301 args->u.normal.via_iocb = iocb;
1303 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1304 &iocb->ki_pos, count);
1305 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) entry point: wrap the user buffer in a single-segment
 * iovec plus a sync kiocb (both from per-env scratch storage) and call
 * ll_file_aio_read().  *ppos is updated from the kiocb on return.
 * NOTE(review): listing elided.
 */
1309 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1313 struct iovec *local_iov;
1314 struct kiocb *kiocb;
1319 env = cl_env_get(&refcheck);
1321 RETURN(PTR_ERR(env));
1323 local_iov = &vvp_env_info(env)->vti_local_iov;
1324 kiocb = &vvp_env_info(env)->vti_kiocb;
1325 local_iov->iov_base = (void __user *)buf;
1326 local_iov->iov_len = count;
1327 init_sync_kiocb(kiocb, file);
1328 kiocb->ki_pos = *ppos;
/* Field name differs across kernel versions; compat macro picks the right one. */
1329 #ifdef HAVE_KIOCB_KI_LEFT
1330 kiocb->ki_left = count;
1332 kiocb->ki_nbytes = count;
1335 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1336 *ppos = kiocb->ki_pos;
1338 cl_env_put(env, &refcheck);
/*
 * aio_write entry point: mirror image of ll_file_aio_read() — validate the
 * iovec, fill IO_NORMAL args and run ll_file_io_generic() with CIT_WRITE.
 * NOTE(review): listing elided.
 */
1343 * Write to a file (through the page cache).
1346 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1347 unsigned long nr_segs, loff_t pos)
1350 struct vvp_io_args *args;
1356 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1360 env = cl_env_get(&refcheck);
1362 RETURN(PTR_ERR(env));
1364 args = vvp_env_args(env, IO_NORMAL);
/* const cast: generic engine stores a mutable iov pointer. */
1365 args->u.normal.via_iov = (struct iovec *)iov;
1366 args->u.normal.via_nrsegs = nr_segs;
1367 args->u.normal.via_iocb = iocb;
1369 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1370 &iocb->ki_pos, count);
1371 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) entry point: single-segment iovec + sync kiocb
 * wrapper around ll_file_aio_write(), parallel to ll_file_read().
 * NOTE(review): listing elided.
 */
1375 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1376 size_t count, loff_t *ppos)
1379 struct iovec *local_iov;
1380 struct kiocb *kiocb;
1385 env = cl_env_get(&refcheck);
1387 RETURN(PTR_ERR(env));
1389 local_iov = &vvp_env_info(env)->vti_local_iov;
1390 kiocb = &vvp_env_info(env)->vti_kiocb;
1391 local_iov->iov_base = (void __user *)buf;
1392 local_iov->iov_len = count;
1393 init_sync_kiocb(kiocb, file);
1394 kiocb->ki_pos = *ppos;
/* Field name differs across kernel versions; compat macro picks the right one. */
1395 #ifdef HAVE_KIOCB_KI_LEFT
1396 kiocb->ki_left = count;
1398 kiocb->ki_nbytes = count;
1401 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1402 *ppos = kiocb->ki_pos;
1404 cl_env_put(env, &refcheck);
/*
 * splice_read entry point: fill IO_SPLICE args (pipe + flags) and run the
 * common engine with CIT_READ; *ppos is advanced by the engine.
 * NOTE(review): listing elided.
 */
1409 * Send file content (through pagecache) somewhere with helper
1411 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1412 struct pipe_inode_info *pipe, size_t count,
1416 struct vvp_io_args *args;
1421 env = cl_env_get(&refcheck);
1423 RETURN(PTR_ERR(env));
1425 args = vvp_env_args(env, IO_SPLICE);
1426 args->u.splice.via_pipe = pipe;
1427 args->u.splice.via_flags = flags;
1429 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1430 cl_env_put(env, &refcheck);
/*
 * Recreate an OST object for this inode on the given OST index: clone the
 * current LOV stripe metadata, build an obdo carrying the target object id
 * and OBD_FL_RECREATE_OBJS, and issue obd_create() under the inode size
 * lock.  Returns 0 or a negative errno.
 * NOTE(review): listing elided; oa allocation/free lines are not visible.
 */
1434 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1437 struct obd_export *exp = ll_i2dtexp(inode);
1438 struct obd_trans_info oti = { 0 };
1439 struct obdo *oa = NULL;
1442 struct lov_stripe_md *lsm = NULL, *lsm2;
1449 lsm = ccc_inode_lsm_get(inode);
/* No striping objects yet — nothing to recreate. */
1450 if (!lsm_has_objects(lsm))
1451 GOTO(out, rc = -ENOENT);
1453 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1454 (lsm->lsm_stripe_count));
1456 OBD_ALLOC_LARGE(lsm2, lsm_size);
1458 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request — TODO confirm (field reuse). */
1461 oa->o_nlink = ost_idx;
1462 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1463 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1464 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1465 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1466 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1467 memcpy(lsm2, lsm, lsm_size);
/* Size lock held across create so the layout copy stays coherent. */
1468 ll_inode_size_lock(inode);
1469 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1470 ll_inode_size_unlock(inode);
1472 OBD_FREE_LARGE(lsm2, lsm_size);
1475 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: root-only; copy the ll_recreate_obj request
 * from user space, build an MDT0 ostid from lrc_id and recreate the object
 * on lrc_ost_idx via ll_lov_recreate().
 * NOTE(review): listing elided.
 */
1480 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1482 struct ll_recreate_obj ucreat;
1486 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1489 if (copy_from_user(&ucreat, (struct ll_recreate_obj __user *)arg,
1493 ostid_set_seq_mdt0(&oi);
1494 ostid_set_id(&oi, ucreat.lrc_id);
1495 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: root-only; copy a lu_fid from user space,
 * convert it to an ostid and derive the OST index from the fid sequence,
 * then recreate via ll_lov_recreate().
 * NOTE(review): listing elided.
 */
1498 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1505 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1508 if (copy_from_user(&fid, (struct lu_fid __user *)arg, sizeof(fid)))
1511 fid_to_ostid(&fid, &oi);
/* OST index is encoded in bits 16..31 of the fid sequence for this request type. */
1512 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1513 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply a user-supplied LOV striping EA to an inode by re-opening it with
 * an IT_OPEN intent carrying the lov_user_md.  Fails with -EEXIST if the
 * file already has a layout.  On all paths the delay-create flag is
 * cleared from the file.
 * NOTE(review): listing elided; error-label structure partly invisible.
 */
1516 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1517 __u64 flags, struct lov_user_md *lum,
1520 struct lov_stripe_md *lsm = NULL;
1521 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1525 lsm = ccc_inode_lsm_get(inode);
/* A layout already exists: striping can only be set once. */
1527 ccc_inode_lsm_put(inode, lsm);
1528 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1529 PFID(ll_inode2fid(inode)));
1530 GOTO(out, rc = -EEXIST);
1533 ll_inode_size_lock(inode);
1534 oit.it_flags |= MDS_OPEN_BY_FID;
1535 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1537 GOTO(out_unlock, rc);
1538 rc = oit.d.lustre.it_status;
1540 GOTO(out_req_free, rc);
/* The open handle from the intent is not needed beyond setting the EA. */
1542 ll_release_openhandle(file->f_dentry, &oit);
1545 ll_inode_size_unlock(inode);
1546 ll_intent_release(&oit);
1547 ccc_inode_lsm_put(inode, lsm);
1549 cl_lov_delay_create_clear(&file->f_flags);
1552 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA for @filename (a child of @inode) from the MDS via
 * md_getattr_name().  On success *lmmp points into the reply buffer
 * (caller keeps *request alive and finishes it), *lmm_size is set, and
 * the EA is byte-swapped to host endianness when needed.
 * NOTE(review): listing elided; some branch/label lines are invisible.
 */
1556 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1557 struct lov_mds_md **lmmp, int *lmm_size,
1558 struct ptlrpc_request **request)
1560 struct ll_sb_info *sbi = ll_i2sbi(inode);
1561 struct mdt_body *body;
1562 struct lov_mds_md *lmm = NULL;
1563 struct ptlrpc_request *req = NULL;
1564 struct md_op_data *op_data;
1567 rc = ll_get_default_mdsize(sbi, &lmmsize);
1571 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1572 strlen(filename), lmmsize,
1573 LUSTRE_OPC_ANY, NULL);
1574 if (IS_ERR(op_data))
1575 RETURN(PTR_ERR(op_data));
1577 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1578 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1579 ll_finish_md_op_data(op_data);
1581 CDEBUG(D_INFO, "md_getattr_name failed "
1582 "on %s: rc %d\n", filename, rc);
1586 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1587 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1589 lmmsize = body->mbo_eadatasize;
/* No striping EA present (neither file nor directory default). */
1591 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1593 GOTO(out, rc = -ENODATA);
1596 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1597 LASSERT(lmm != NULL);
/* Only V1/V3 LOV magics are understood here. */
1599 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1600 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1601 GOTO(out, rc = -EPROTO);
1605 * This is coming from the MDS, so is probably in
1606 * little endian. We convert it to host endian before
1607 * passing it to userspace.
1609 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1612 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1613 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1616 /* if function called for directory - we should
1617 * avoid swab not existent lsm objects */
1618 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1619 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
/* Per-object entries exist only for regular files, not directory defaults. */
1620 if (S_ISREG(body->mbo_mode))
1621 lustre_swab_lov_user_md_objects(
1622 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1624 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1625 lustre_swab_lov_user_md_v3(
1626 (struct lov_user_md_v3 *)lmm);
1627 if (S_ISREG(body->mbo_mode))
1628 lustre_swab_lov_user_md_objects(
1629 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1636 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: root-only; copy a lov_user_md (with one
 * trailing ost_data entry) from user space and apply it through
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 * NOTE(review): listing elided.
 */
1641 static int ll_lov_setea(struct inode *inode, struct file *file,
1644 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1645 struct lov_user_md *lump;
1646 int lum_size = sizeof(struct lov_user_md) +
1647 sizeof(struct lov_user_ost_data);
1651 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1654 OBD_ALLOC_LARGE(lump, lum_size);
1658 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1659 OBD_FREE_LARGE(lump, lum_size);
1663 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1665 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user lov_user_md into a kernel
 * buffer, set the stripe EA, then refresh the layout and echo the
 * resulting striping back to the user buffer via LL_IOC_LOV_GETSTRIPE.
 * NOTE(review): listing elided; the success-branch condition is invisible.
 */
1672 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1673 struct lov_user_md *klum;
1675 __u64 flags = FMODE_WRITE;
1678 rc = ll_copy_user_md(lum, &klum);
1683 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1685 struct lov_stripe_md *lsm;
/* Zero the user's stripe_count first so a failed GETSTRIPE is detectable. */
1688 put_user(0, &lum->lmm_stripe_count);
1690 ll_layout_refresh(inode, &gen);
1691 lsm = ccc_inode_lsm_get(inode);
1692 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1694 ccc_inode_lsm_put(inode, lsm);
1697 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_LOV_GETSTRIPE handler: grab the inode's stripe metadata and let
 * the LOV layer serialize it into the user buffer at @arg.
 * NOTE(review): listing elided.
 */
1701 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1703 struct lov_stripe_md *lsm;
1707 lsm = ccc_inode_lsm_get(inode);
1709 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1710 lsm, (void __user *)arg);
1711 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: acquire a group (GID-based) extent lock for
 * this open file.  The lli_lock spinlock guards the fd_flags check both
 * before and after the (blocking) cl_get_grouplock() call so a racing
 * thread that wins releases the duplicate lock.
 * NOTE(review): listing elided.
 */
1716 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1718 struct ll_inode_info *lli = ll_i2info(inode);
1719 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1720 struct ccc_grouplock grouplock;
/* Group locks make no sense on no-lock mounts. */
1724 if (ll_file_nolock(file))
1725 RETURN(-EOPNOTSUPP);
1727 spin_lock(&lli->lli_lock);
1728 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1729 CWARN("group lock already existed with gid %lu\n",
1730 fd->fd_grouplock.cg_gid);
1731 spin_unlock(&lli->lli_lock);
1734 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1735 spin_unlock(&lli->lli_lock);
/* May block unless O_NONBLOCK; done outside the spinlock. */
1737 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1738 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the lock: another thread may have installed a lock meanwhile. */
1742 spin_lock(&lli->lli_lock);
1743 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1744 spin_unlock(&lli->lli_lock);
1745 CERROR("another thread just won the race\n");
1746 cl_put_grouplock(&grouplock);
1750 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1751 fd->fd_grouplock = grouplock;
1752 spin_unlock(&lli->lli_lock);
1754 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held on this open
 * file, verifying under lli_lock that one is held and that its GID matches
 * the caller's @arg.  The actual lock release happens after dropping the
 * spinlock.
 * NOTE(review): listing elided.
 */
1758 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1760 struct ll_inode_info *lli = ll_i2info(inode);
1761 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1762 struct ccc_grouplock grouplock;
1765 spin_lock(&lli->lli_lock);
1766 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1767 spin_unlock(&lli->lli_lock);
1768 CWARN("no group lock held\n");
1771 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* Unlock must quote the same GID used to lock. */
1773 if (fd->fd_grouplock.cg_gid != arg) {
1774 CWARN("group lock %lu doesn't match current id %lu\n",
1775 arg, fd->fd_grouplock.cg_gid);
1776 spin_unlock(&lli->lli_lock);
/* Detach state under the spinlock, release the DLM lock outside it. */
1780 grouplock = fd->fd_grouplock;
1781 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1782 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1783 spin_unlock(&lli->lli_lock);
1785 cl_put_grouplock(&grouplock);
1786 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * Close the MDS open handle carried by a lookup intent (used when an
 * intent-open produced a handle the caller does not intend to keep, e.g.
 * after setting a stripe EA).  No-op for the root dentry or when the
 * intent carries no open disposition.
 * NOTE(review): listing elided.
 */
1791 * Close inode open handle
1793 * \param dentry [in] dentry which contains the inode
1794 * \param it [in,out] intent which contains open info and result
1797 * \retval <0 failure
1799 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1801 struct inode *inode = dentry->d_inode;
1802 struct obd_client_handle *och;
1808 /* Root ? Do nothing. */
1809 if (dentry->d_inode->i_sb->s_root == dentry)
1812 /* No open handle to close? Move away */
1813 if (!it_disposition(it, DISP_OPEN_OPEN))
1816 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1818 OBD_ALLOC(och, sizeof(*och));
1820 GOTO(out, rc = -ENOMEM);
1822 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1824 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1827 /* this one is in place of ll_file_open */
1828 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1829 ptlrpc_req_finished(it->d.lustre.it_data);
1830 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Core FIEMAP implementation: validate flags, optionally flush dirty pages
 * (FIEMAP_FLAG_SYNC), fill a ll_fiemap_info_key from the inode's stripe
 * metadata and fetch the extent mapping from the OSTs via obd_get_info().
 * NOTE(review): listing elided.
 */
1836 * Get size for inode for which FIEMAP mapping is requested.
1837 * Make the FIEMAP get_info call and returns the result.
1839 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1842 struct obd_export *exp = ll_i2dtexp(inode);
1843 struct lov_stripe_md *lsm = NULL;
1844 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1845 __u32 vallen = num_bytes;
1849 /* Checks for fiemap flags */
1850 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Leave only the unsupported flags in fm_flags so the caller can see them. */
1851 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1855 /* Check for FIEMAP_FLAG_SYNC */
1856 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1857 rc = filemap_fdatawrite(inode->i_mapping);
1862 lsm = ccc_inode_lsm_get(inode);
1866 /* If the stripe_count > 1 and the application does not understand
1867 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1869 if (lsm->lsm_stripe_count > 1 &&
1870 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1871 GOTO(out, rc = -EOPNOTSUPP);
1873 fm_key.oa.o_oi = lsm->lsm_oi;
1874 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1876 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1877 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1878 /* If filesize is 0, then there would be no objects for mapping */
1879 if (fm_key.oa.o_size == 0) {
1880 fiemap->fm_mapped_extents = 0;
1884 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1886 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1889 CERROR("obd_get_info failed: rc = %d\n", rc);
1892 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.  The
 * caller must have DAC_READ_SEARCH or the mount must allow user fid2path.
 * The user-declared gf_pathlen sizes the output buffer (capped at
 * PATH_MAX); the whole getinfo_fid2path struct is copied in, filled by
 * the MDC ioctl, and copied back.
 * NOTE(review): listing elided.
 */
1896 int ll_fid2path(struct inode *inode, void __user *arg)
1898 struct obd_export *exp = ll_i2mdexp(inode);
1899 const struct getinfo_fid2path __user *gfin = arg;
1901 struct getinfo_fid2path *gfout;
1907 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1908 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1911 /* Only need to get the buflen */
1912 if (get_user(pathlen, &gfin->gf_pathlen))
1915 if (pathlen > PATH_MAX)
1918 outsize = sizeof(*gfout) + pathlen;
1919 OBD_ALLOC(gfout, outsize);
1923 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1924 GOTO(gf_free, rc = -EFAULT);
1926 /* Call mdc_iocontrol */
1927 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1931 if (copy_to_user(arg, gfout, outsize))
1935 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size and allocate a kernel fiemap buffer from
 * the user's fm_extent_count (overflow-checked), copy the request (and,
 * when continuing a previous mapping, the first extent) in, run
 * ll_do_fiemap(), and copy header plus mapped extents back out.
 * NOTE(review): listing elided.
 */
1939 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1941 struct ll_user_fiemap *fiemap_s;
1942 size_t num_bytes, ret_bytes;
1943 unsigned int extent_count;
1946 /* Get the extent count so we can calculate the size of
1947 * required fiemap buffer */
1948 if (get_user(extent_count,
1949 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* Guard the multiplication below against size_t overflow. */
1953 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1955 num_bytes = sizeof(*fiemap_s) + (extent_count *
1956 sizeof(struct ll_fiemap_extent));
1958 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1959 if (fiemap_s == NULL)
1962 /* get the fiemap value */
1963 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1965 GOTO(error, rc = -EFAULT);
1967 /* If fm_extent_count is non-zero, read the first extent since
1968 * it is used to calculate end_offset and device from previous
1971 if (copy_from_user(&fiemap_s->fm_extents[0],
1972 (char __user *)arg + sizeof(*fiemap_s),
1973 sizeof(struct ll_fiemap_extent)))
1974 GOTO(error, rc = -EFAULT);
1977 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy back the header plus however many extents were actually mapped. */
1981 ret_bytes = sizeof(struct ll_user_fiemap);
1983 if (extent_count != 0)
1984 ret_bytes += (fiemap_s->fm_mapped_extents *
1985 sizeof(struct ll_fiemap_extent));
1987 if (copy_to_user((void __user *)arg, fiemap_s, ret_bytes))
1991 OBD_FREE_LARGE(fiemap_s, num_bytes);
/*
 * Compute the inode's data_version from its OST stripe objects via
 * ll_lsm_getattr().  A stripeless file reports version 0.  @flags selects
 * the flush mode applied on the OSTs (see comment below).
 * NOTE(review): listing elided.
 */
1996 * Read the data_version for inode.
1998 * This value is computed using stripe object version on OST.
1999 * Version is computed using server side locking.
2001 * @param sync if do sync on the OST side;
2003 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2004 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2006 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2008 struct lov_stripe_md *lsm = NULL;
2009 struct ll_sb_info *sbi = ll_i2sbi(inode);
2010 struct obdo *obdo = NULL;
2014 /* If no stripe, we consider version is 0. */
2015 lsm = ccc_inode_lsm_get(inode);
2016 if (!lsm_has_objects(lsm)) {
2018 CDEBUG(D_INODE, "No object for inode\n");
2022 OBD_ALLOC_PTR(obdo);
2024 GOTO(out, rc = -ENOMEM);
2026 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
/* Only trust the result if the OSTs actually returned a data version. */
2028 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2031 *data_version = obdo->o_data_version;
2037 ccc_inode_lsm_put(inode, lsm);
/*
 * HSM release: take a write lease on the file, capture its final
 * data_version (flushing client cache), merge size/time attributes, then
 * close the handle with the release flag so the MDT can free the OST
 * objects.  The lease is closed on the error path too.
 * NOTE(review): listing elided.
 */
2042 * Trigger a HSM release request for the provided inode.
2044 int ll_hsm_release(struct inode *inode)
2046 struct cl_env_nest nest;
2048 struct obd_client_handle *och = NULL;
2049 __u64 data_version = 0;
2053 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2054 ll_get_fsname(inode->i_sb, NULL, 0),
2055 PFID(&ll_i2info(inode)->lli_fid));
/* Exclusive write lease guarantees no other client modifies the file meanwhile. */
2057 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2059 GOTO(out, rc = PTR_ERR(och));
2061 /* Grab latest data_version and [am]time values */
2062 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2066 env = cl_env_nested_get(&nest);
2068 GOTO(out, rc = PTR_ERR(env));
2070 ll_merge_lvb(env, inode);
2071 cl_env_nested_put(&nest, env);
2073 /* Release the file.
2074 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2075 * we still need it to pack l_remote_handle to MDT. */
2076 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2082 if (och != NULL && !IS_ERR(och)) /* close the file */
2083 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped, their
 * saved time attributes (for optional restore), the data versions to
 * verify, and which of those checks were requested.  Fields are swapped in
 * pairs when the inodes are reordered by FID.
 */
2088 struct ll_swap_stack {
2089 struct iattr ia1, ia2;
2091 struct inode *inode1, *inode2;
2092 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the layouts
 * of two regular files on the same filesystem.  Orders the pair by FID to
 * avoid deadlock, optionally takes group locks to flush dirty cache,
 * verifies requested data versions, sends the swap to the MDT, and can
 * restore mtime/atime afterwards.
 * NOTE(review): listing elided; several labels and checks are invisible.
 */
2095 static int ll_swap_layouts(struct file *file1, struct file *file2,
2096 struct lustre_swap_layouts *lsl)
2098 struct mdc_swap_layouts msl;
2099 struct md_op_data *op_data;
2102 struct ll_swap_stack *llss = NULL;
2105 OBD_ALLOC_PTR(llss);
2109 llss->inode1 = file1->f_dentry->d_inode;
2110 llss->inode2 = file2->f_dentry->d_inode;
2112 if (!S_ISREG(llss->inode2->i_mode))
2113 GOTO(free, rc = -EINVAL);
/* Caller must be allowed to write both files. */
2115 if (inode_permission(llss->inode1, MAY_WRITE) ||
2116 inode_permission(llss->inode2, MAY_WRITE))
2117 GOTO(free, rc = -EPERM);
2119 if (llss->inode2->i_sb != llss->inode1->i_sb)
2120 GOTO(free, rc = -EXDEV);
2122 /* we use 2 bool because it is easier to swap than 2 bits */
2123 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2124 llss->check_dv1 = true;
2126 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2127 llss->check_dv2 = true;
2129 /* we cannot use lsl->sl_dvX directly because we may swap them */
2130 llss->dv1 = lsl->sl_dv1;
2131 llss->dv2 = lsl->sl_dv2;
/* Canonical ordering by FID prevents lock-ordering deadlock between two swaps. */
2133 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2134 if (rc == 0) /* same file, done! */
2137 if (rc < 0) { /* sequentialize it */
2138 swap(llss->inode1, llss->inode2);
2140 swap(llss->dv1, llss->dv2);
2141 swap(llss->check_dv1, llss->check_dv2);
2145 if (gid != 0) { /* application asks to flush dirty cache */
2146 rc = ll_get_grouplock(llss->inode1, file1, gid);
2150 rc = ll_get_grouplock(llss->inode2, file2, gid);
2152 ll_put_grouplock(llss->inode1, file1, gid);
2157 /* to be able to restore mtime and atime after swap
2158 * we need to first save them */
2160 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2161 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2162 llss->ia1.ia_atime = llss->inode1->i_atime;
2163 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2164 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2165 llss->ia2.ia_atime = llss->inode2->i_atime;
2166 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2169 /* ultimate check, before swaping the layouts we check if
2170 * dataversion has changed (if requested) */
2171 if (llss->check_dv1) {
2172 rc = ll_data_version(llss->inode1, &dv, 0);
2175 if (dv != llss->dv1)
2176 GOTO(putgl, rc = -EAGAIN);
2179 if (llss->check_dv2) {
2180 rc = ll_data_version(llss->inode2, &dv, 0);
2183 if (dv != llss->dv2)
2184 GOTO(putgl, rc = -EAGAIN);
2187 /* struct md_op_data is used to send the swap args to the mdt
2188 * only flags is missing, so we use struct mdc_swap_layouts
2189 * through the md_op_data->op_data */
2190 /* flags from user space have to be converted before they are send to
2191 * server, no flag is sent today, they are only used on the client */
2194 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2195 0, LUSTRE_OPC_ANY, &msl);
2196 if (IS_ERR(op_data))
2197 GOTO(free, rc = PTR_ERR(op_data));
2199 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2200 sizeof(*op_data), op_data, NULL);
2201 ll_finish_md_op_data(op_data);
/* Drop group locks in reverse acquisition order. */
2205 ll_put_grouplock(llss->inode2, file2, gid);
2206 ll_put_grouplock(llss->inode1, file1, gid);
2209 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2213 /* clear useless flags */
2214 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2215 llss->ia1.ia_valid &= ~ATTR_MTIME;
2216 llss->ia2.ia_valid &= ~ATTR_MTIME;
2219 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2220 llss->ia1.ia_valid &= ~ATTR_ATIME;
2221 llss->ia2.ia_valid &= ~ATTR_ATIME;
2224 /* update time if requested */
/* Note the crossed indices: ia2 was saved from inode2, now applied to file1 (layouts swapped). */
2226 if (llss->ia2.ia_valid != 0) {
2227 mutex_lock(&llss->inode1->i_mutex);
2228 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2229 mutex_unlock(&llss->inode1->i_mutex);
2232 if (llss->ia1.ia_valid != 0) {
2235 mutex_lock(&llss->inode2->i_mutex);
2236 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2237 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Send an HSM state-set request to the MDT.  Non-root callers may only
 * touch flags within HSM_USER_MASK; the hss struct travels to the server
 * via md_op_data->op_data.
 * NOTE(review): listing elided.
 */
2249 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2251 struct md_op_data *op_data;
2254 /* Non-root users are forbidden to set or clear flags which are
2255 * NOT defined in HSM_USER_MASK. */
2256 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2257 !cfs_capable(CFS_CAP_SYS_ADMIN))
2260 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2261 LUSTRE_OPC_ANY, hss);
2262 if (IS_ERR(op_data))
2263 RETURN(PTR_ERR(op_data));
2265 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2266 sizeof(*op_data), op_data, NULL);
2268 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file as ARCHIVED|EXISTS|RELEASED in the given
 * archive, then force-set the attributes (mode, uid/gid, size, [am]time)
 * recorded for the archived copy via ll_setattr_raw() under i_mutex.
 * NOTE(review): listing elided; hss/attr allocation checks are invisible.
 */
2273 static int ll_hsm_import(struct inode *inode, struct file *file,
2274 struct hsm_user_import *hui)
2276 struct hsm_state_set *hss = NULL;
2277 struct iattr *attr = NULL;
2281 if (!S_ISREG(inode->i_mode))
2287 GOTO(out, rc = -ENOMEM);
/* Step 1: flag the file as released in the target archive. */
2289 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2290 hss->hss_archive_id = hui->hui_archive_id;
2291 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2292 rc = ll_hsm_state_set(inode, hss);
2296 OBD_ALLOC_PTR(attr);
2298 GOTO(out, rc = -ENOMEM);
/* Step 2: stamp the attributes recorded for the archived copy. */
2300 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2301 attr->ia_mode |= S_IFREG;
2302 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2303 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2304 attr->ia_size = hui->hui_size;
2305 attr->ia_mtime.tv_sec = hui->hui_mtime;
2306 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2307 attr->ia_atime.tv_sec = hui->hui_atime;
2308 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2310 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2311 ATTR_UID | ATTR_GID |
2312 ATTR_MTIME | ATTR_MTIME_SET |
2313 ATTR_ATIME | ATTR_ATIME_SET;
2315 mutex_lock(&inode->i_mutex);
2317 rc = ll_setattr_raw(file->f_dentry, attr, true);
2321 mutex_unlock(&inode->i_mutex);
/* Map a lease open mode to the LL_LEASE_{RD,WR}LCK bitmask reported to user space. */
2333 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2335 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2336 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * unlocked_ioctl dispatcher for regular files: handles file-flag get/set,
 * striping (setstripe/setea/getstripe/swap-layouts), object recreation,
 * FIEMAP, group locks, FID/path translation, data version, HSM state and
 * import, and lease get/set.  Unknown commands fall through to the
 * registered llioc handlers and finally to the data export's
 * obd_iocontrol().
 * NOTE(review): listing elided; many RETURN/brace lines are invisible.
 */
2340 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2342 struct inode *inode = file->f_dentry->d_inode;
2343 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2347 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2348 PFID(ll_inode2fid(inode)), inode, cmd);
2349 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2351 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2352 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
/* --- per-fd flag management --- */
2356 case LL_IOC_GETFLAGS:
2357 /* Get the current value of the file flags */
2358 return put_user(fd->fd_flags, (int __user *)arg);
2359 case LL_IOC_SETFLAGS:
2360 case LL_IOC_CLRFLAGS:
2361 /* Set or clear specific file flags */
2362 /* XXX This probably needs checks to ensure the flags are
2363 * not abused, and to handle any flag side effects.
2365 if (get_user(flags, (int __user *) arg))
2368 if (cmd == LL_IOC_SETFLAGS) {
/* Ignoring locks is only safe when the page cache is bypassed. */
2369 if ((flags & LL_FILE_IGNORE_LOCK) &&
2370 !(file->f_flags & O_DIRECT)) {
2371 CERROR("%s: unable to disable locking on "
2372 "non-O_DIRECT file\n", current->comm);
2376 fd->fd_flags |= flags;
2378 fd->fd_flags &= ~flags;
/* --- striping / layout operations --- */
2381 case LL_IOC_LOV_SETSTRIPE:
2382 RETURN(ll_lov_setstripe(inode, file, arg));
2383 case LL_IOC_LOV_SETEA:
2384 RETURN(ll_lov_setea(inode, file, arg));
2385 case LL_IOC_LOV_SWAP_LAYOUTS: {
2387 struct lustre_swap_layouts lsl;
2389 if (copy_from_user(&lsl, (char __user *)arg,
2390 sizeof(struct lustre_swap_layouts)))
2393 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2396 file2 = fget(lsl.sl_fd);
/* Both fds must be open for writing before the layouts may be swapped. */
2401 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2402 rc = ll_swap_layouts(file, file2, &lsl);
2406 case LL_IOC_LOV_GETSTRIPE:
2407 RETURN(ll_lov_getstripe(inode, arg));
2408 case LL_IOC_RECREATE_OBJ:
2409 RETURN(ll_lov_recreate_obj(inode, arg));
2410 case LL_IOC_RECREATE_FID:
2411 RETURN(ll_lov_recreate_fid(inode, arg));
2412 case FSFILT_IOC_FIEMAP:
2413 RETURN(ll_ioctl_fiemap(inode, arg));
2414 case FSFILT_IOC_GETFLAGS:
2415 case FSFILT_IOC_SETFLAGS:
2416 RETURN(ll_iocontrol(inode, file, cmd, arg));
2417 case FSFILT_IOC_GETVERSION_OLD:
2418 case FSFILT_IOC_GETVERSION:
2419 RETURN(put_user(inode->i_generation, (int __user *)arg));
2420 case LL_IOC_GROUP_LOCK:
2421 RETURN(ll_get_grouplock(inode, file, arg));
2422 case LL_IOC_GROUP_UNLOCK:
2423 RETURN(ll_put_grouplock(inode, file, arg));
2424 case IOC_OBD_STATFS:
2425 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2427 /* We need to special case any other ioctls we want to handle,
2428 * to send them to the MDS/OST as appropriate and to properly
2429 * network encode the arg field.
2430 case FSFILT_IOC_SETVERSION_OLD:
2431 case FSFILT_IOC_SETVERSION:
2433 case LL_IOC_FLUSHCTX:
2434 RETURN(ll_flush_ctx(inode));
/* --- FID/path translation --- */
2435 case LL_IOC_PATH2FID: {
2436 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2437 sizeof(struct lu_fid)))
2442 case LL_IOC_GETPARENT:
2443 RETURN(ll_getparent(file, (void __user *)arg));
2445 case OBD_IOC_FID2PATH:
2446 RETURN(ll_fid2path(inode, (void __user *)arg));
2447 case LL_IOC_DATA_VERSION: {
2448 struct ioc_data_version idv;
2451 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the defined flush flags may come in from user space. */
2454 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2455 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2458 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2464 case LL_IOC_GET_MDTIDX: {
2467 mdtidx = ll_get_mdt_idx(inode);
2471 if (put_user((int)mdtidx, (int __user *)arg))
2476 case OBD_IOC_GETDTNAME:
2477 case OBD_IOC_GETMDNAME:
2478 RETURN(ll_get_obd_name(inode, cmd, arg));
/* --- HSM state --- */
2479 case LL_IOC_HSM_STATE_GET: {
2480 struct md_op_data *op_data;
2481 struct hsm_user_state *hus;
2488 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2489 LUSTRE_OPC_ANY, hus);
2490 if (IS_ERR(op_data)) {
2492 RETURN(PTR_ERR(op_data));
2495 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2498 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2501 ll_finish_md_op_data(op_data);
2505 case LL_IOC_HSM_STATE_SET: {
2506 struct hsm_state_set *hss;
2513 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2518 rc = ll_hsm_state_set(inode, hss);
2523 case LL_IOC_HSM_ACTION: {
2524 struct md_op_data *op_data;
2525 struct hsm_current_action *hca;
2532 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2533 LUSTRE_OPC_ANY, hca);
2534 if (IS_ERR(op_data)) {
2536 RETURN(PTR_ERR(op_data));
2539 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2542 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2545 ll_finish_md_op_data(op_data);
/* --- leases --- */
2549 case LL_IOC_SET_LEASE: {
2550 struct ll_inode_info *lli = ll_i2info(inode);
2551 struct obd_client_handle *och = NULL;
/* Requested lease mode must be covered by the file's open mode. */
2556 case LL_LEASE_WRLCK:
2557 if (!(file->f_mode & FMODE_WRITE))
2559 fmode = FMODE_WRITE;
2561 case LL_LEASE_RDLCK:
2562 if (!(file->f_mode & FMODE_READ))
2566 case LL_LEASE_UNLCK:
/* Unlock: detach the lease handle under lli_och_mutex, close it outside. */
2567 mutex_lock(&lli->lli_och_mutex);
2568 if (fd->fd_lease_och != NULL) {
2569 och = fd->fd_lease_och;
2570 fd->fd_lease_och = NULL;
2572 mutex_unlock(&lli->lli_och_mutex);
2577 fmode = och->och_flags;
2578 rc = ll_lease_close(och, inode, &lease_broken);
2585 RETURN(ll_lease_type_from_fmode(fmode));
2590 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2592 /* apply for lease */
2593 och = ll_lease_open(inode, file, fmode, 0);
2595 RETURN(PTR_ERR(och));
2598 mutex_lock(&lli->lli_och_mutex);
2599 if (fd->fd_lease_och == NULL) {
2600 fd->fd_lease_och = och;
2603 mutex_unlock(&lli->lli_och_mutex);
2605 /* impossible now that only excl is supported for now */
2606 ll_lease_close(och, inode, &lease_broken);
2611 case LL_IOC_GET_LEASE: {
2612 struct ll_inode_info *lli = ll_i2info(inode);
2613 struct ldlm_lock *lock = NULL;
2616 mutex_lock(&lli->lli_och_mutex);
2617 if (fd->fd_lease_och != NULL) {
2618 struct obd_client_handle *och = fd->fd_lease_och;
/* Only report the lease as held if its DLM lock has not been cancelled. */
2620 lock = ldlm_handle2lock(&och->och_lease_handle);
2622 lock_res_and_lock(lock);
2623 if (!ldlm_is_cancel(lock))
2624 fmode = och->och_flags;
2626 unlock_res_and_lock(lock);
2627 LDLM_LOCK_PUT(lock);
2630 mutex_unlock(&lli->lli_och_mutex);
2632 RETURN(ll_lease_type_from_fmode(fmode));
2634 case LL_IOC_HSM_IMPORT: {
2635 struct hsm_user_import *hui;
2641 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2646 rc = ll_hsm_import(inode, file, hui);
/* Unhandled command: try registered llioc handlers, then the data export. */
2656 ll_iocontrol_call(inode, file, cmd, arg, &err))
2659 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2660 (void __user *)arg));
2665 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Commit a computed seek offset: reject negative offsets (unless the file
 * allows unsigned offsets) and offsets beyond @maxsize, then update f_pos
 * and reset f_version only when the position actually changes.
 * NOTE(review): compat fallback used only when the kernel lacks
 * generic_file_llseek_size(); listing elided.
 */
2666 static inline loff_t
2667 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2669 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2671 if (offset > maxsize)
2674 if (offset != file->f_pos) {
2675 file->f_pos = offset;
2676 file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size(): compute the new
 * position for SEEK_SET/CUR/END (and SEEK_DATA/HOLE judging by the
 * comments below) against a caller-supplied @maxsize and @eof, then apply
 * it via llseek_execute().
 * NOTE(review): listing heavily elided; the switch statement itself is
 * not visible.
 */
2682 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2683 loff_t maxsize, loff_t eof)
2685 struct inode *inode = file->f_dentry->d_inode;
2693 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2694 * position-querying operation. Avoid rewriting the "same"
2695 * f_pos value back to the file because a concurrent read(),
2696 * write() or lseek() might have altered it
2701 * f_lock protects against read/modify/write race with other
2702 * SEEK_CURs. Note that parallel writes and reads behave
2705 mutex_lock(&inode->i_mutex);
2706 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2707 mutex_unlock(&inode->i_mutex);
2711 * In the generic case the entire file is data, so as long as
2712 * offset isn't at the end of the file then the offset is data.
2719 * There is a virtual hole at the end of the file, so as long as
2720 * offset isn't i_size or larger, return i_size.
2728 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point for Lustre regular files: refresh the cluster-wide
 * file size when needed, then delegate to ll_generic_file_llseek_size()
 * bounded by ll_file_maxbytes().
 */
2732 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2734 struct inode *inode = file->f_dentry->d_inode;
2735 loff_t retval, eof = 0;
/* Absolute target, pre-computed only for the trace message below. */
2738 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2739 (origin == SEEK_CUR) ? file->f_pos : 0);
2740 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2741 PFID(ll_inode2fid(inode)), inode, retval, retval,
2743 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* These origins depend on an up-to-date size: glimpse the OSTs first. */
2745 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2746 retval = ll_glimpse_size(inode);
2749 eof = i_size_read(inode);
2752 retval = ll_generic_file_llseek_size(file, offset, origin,
2753 ll_file_maxbytes(inode), eof);
/*
 * ->flush() handler, called on every close of a file descriptor.
 * Collects async writeback errors recorded on the inode and reports
 * them as -EIO, unless the application was already told about the
 * failure (fd_write_failed).
 */
2757 static int ll_flush(struct file *file, fl_owner_t id)
2759 struct inode *inode = file->f_dentry->d_inode;
2760 struct ll_inode_info *lli = ll_i2info(inode);
2761 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2764 LASSERT(!S_ISDIR(inode->i_mode));
2766 /* catch async errors that were recorded back when async writeback
2767 * failed for pages in this mapping. */
/* Read-and-clear: the error is consumed by this flush. */
2768 rc = lli->lli_async_rc;
2769 lli->lli_async_rc = 0;
2770 if (lli->lli_clob != NULL) {
2771 err = lov_read_and_clear_async_rc(lli->lli_clob);
2776 /* The application has been told write failure already.
2777 * Do not report failure again. */
2778 if (fd->fd_write_failed)
/* Any recorded error is collapsed to -EIO for the caller. */
2780 return rc ? -EIO : 0;
2784 * Called to make sure a portion of file has been written out.
2785 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2787 * Return how many pages have been written.
2789 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2790 enum cl_fsync_mode mode, int ignore_layout)
2792 struct cl_env_nest nest;
2795 struct obd_capa *capa = NULL;
2796 struct cl_fsync_io *fio;
/* Reject any mode outside the known cl_fsync_mode set. */
2800 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2801 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2804 env = cl_env_nested_get(&nest);
2806 RETURN(PTR_ERR(env));
/* OSS write capability accompanies the sync RPCs. */
2808 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2810 io = ccc_env_thread_io(env);
2811 io->ci_obj = cl_i2info(inode)->lli_clob;
2812 io->ci_ignore_layout = ignore_layout;
2814 /* initialize parameters for sync */
2815 fio = &io->u.ci_fsync;
2816 fio->fi_capa = capa;
2817 fio->fi_start = start;
2819 fio->fi_fid = ll_inode2fid(inode);
2820 fio->fi_mode = mode;
2821 fio->fi_nr_written = 0;
/* Run the CIT_FSYNC io; on success report the page count written. */
2823 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2824 result = cl_io_loop(env, io);
2826 result = io->ci_result;
2828 result = fio->fi_nr_written;
2829 cl_io_fini(env, io);
2830 cl_env_nested_put(&nest, env);
2838 * When dentry is provided (the 'else' case), *file->f_dentry may be
2839 * null and dentry must be used directly rather than pulled from
2840 * *file->f_dentry as is done otherwise.
/*
 * fsync entry point.  Three kernel-API variants are compiled from the
 * same body: 4-arg (start/end range), 2-arg, and the old 3-arg dentry
 * form; the latter two sync the whole file (end = LLONG_MAX).
 * Flushes dirty pages, collects async write errors, syncs metadata via
 * MDC (md_fsync) and, for regular files, data via cl_sync_file_range().
 */
2843 #ifdef HAVE_FILE_FSYNC_4ARGS
2844 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2846 struct dentry *dentry = file->f_dentry;
2847 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2848 int ll_fsync(struct file *file, int datasync)
2850 struct dentry *dentry = file->f_dentry;
2852 loff_t end = LLONG_MAX;
2854 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2857 loff_t end = LLONG_MAX;
2859 struct inode *inode = dentry->d_inode;
2860 struct ll_inode_info *lli = ll_i2info(inode);
2861 struct ptlrpc_request *req;
2862 struct obd_capa *oc;
2866 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2867 PFID(ll_inode2fid(inode)), inode);
2868 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2870 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg kernels: we must write+wait the range and take i_mutex here. */
2871 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2872 mutex_lock(&inode->i_mutex);
2874 /* fsync's caller has already called _fdata{sync,write}, we want
2875 * that IO to finish before calling the osc and mdc sync methods */
2876 rc = filemap_fdatawait(inode->i_mapping);
2879 /* catch async errors that were recorded back when async writeback
2880 * failed for pages in this mapping. */
2881 if (!S_ISDIR(inode->i_mode)) {
2882 err = lli->lli_async_rc;
2883 lli->lli_async_rc = 0;
2886 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Metadata sync on the MDT, authorized by the MDS capability. */
2891 oc = ll_mdscapa_get(inode);
2892 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2898 ptlrpc_req_finished(req);
2900 if (S_ISREG(inode->i_mode)) {
2901 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* Data sync on the OSTs; remember failure so ll_flush() does not
 * report the same error twice. */
2903 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2904 if (rc == 0 && err < 0)
2907 fd->fd_write_failed = true;
2909 fd->fd_write_failed = false;
2912 #ifdef HAVE_FILE_FSYNC_4ARGS
2913 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock handler: translates a kernel file_lock into an
 * LDLM_FLOCK enqueue on the MDT, then mirrors the result into the
 * local lock tables (flock_lock_file_wait/posix_lock_file_wait) so the
 * VFS state matches the cluster state.
 */
2919 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2921 struct inode *inode = file->f_dentry->d_inode;
2922 struct ll_sb_info *sbi = ll_i2sbi(inode);
2923 struct ldlm_enqueue_info einfo = {
2924 .ei_type = LDLM_FLOCK,
2925 .ei_cb_cp = ldlm_flock_completion_ast,
2926 .ei_cbdata = file_lock,
2928 struct md_op_data *op_data;
2929 struct lustre_handle lockh = {0};
2930 ldlm_policy_data_t flock = {{0}};
/* Saved so the TEST-lock path can restore it after enqueue. */
2931 int fl_type = file_lock->fl_type;
2937 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2938 PFID(ll_inode2fid(inode)), file_lock);
2940 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2942 if (file_lock->fl_flags & FL_FLOCK) {
2943 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2944 /* flocks are whole-file locks */
2945 flock.l_flock.end = OFFSET_MAX;
2946 /* For flocks owner is determined by the local file desctiptor*/
2947 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2948 } else if (file_lock->fl_flags & FL_POSIX) {
2949 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2950 flock.l_flock.start = file_lock->fl_start;
2951 flock.l_flock.end = file_lock->fl_end;
2955 flock.l_flock.pid = file_lock->fl_pid;
2957 /* Somewhat ugly workaround for svc lockd.
2958 * lockd installs custom fl_lmops->lm_compare_owner that checks
2959 * for the fl_owner to be the same (which it always is on local node
2960 * I guess between lockd processes) and then compares pid.
2961 * As such we assign pid to the owner field to make it all work,
2962 * conflict with normal locks is unlikely since pid space and
2963 * pointer space for current->files are not intersecting */
2964 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2965 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map fcntl lock types onto LDLM modes: read -> PR, write -> PW. */
2969 einfo.ei_mode = LCK_PR;
2972 /* An unlock request may or may not have any relation to
2973 * existing locks so we may not be able to pass a lock handle
2974 * via a normal ldlm_lock_cancel() request. The request may even
2975 * unlock a byte range in the middle of an existing lock. In
2976 * order to process an unlock request we need all of the same
2977 * information that is given with a normal read or write record
2978 * lock request. To avoid creating another ldlm unlock (cancel)
2979 * message we'll treat a LCK_NL flock request as an unlock. */
2980 einfo.ei_mode = LCK_NL;
2983 einfo.ei_mode = LCK_PW;
2986 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Non-blocking requests map to BLOCK_NOWAIT, GETLK to TEST_LOCK. */
3001 flags = LDLM_FL_BLOCK_NOWAIT;
3007 flags = LDLM_FL_TEST_LOCK;
3010 CERROR("unknown fcntl lock command: %d\n", cmd);
3014 /* Save the old mode so that if the mode in the lock changes we
3015 * can decrement the appropriate reader or writer refcount. */
3016 file_lock->fl_type = einfo.ei_mode;
3018 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3019 LUSTRE_OPC_ANY, NULL);
3020 if (IS_ERR(op_data))
3021 RETURN(PTR_ERR(op_data));
3023 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3024 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3025 flock.l_flock.pid, flags, einfo.ei_mode,
3026 flock.l_flock.start, flock.l_flock.end);
/* Server-side enqueue: blocks here for F_SETLKW until granted. */
3028 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3031 /* Restore the file lock type if not TEST lock. */
3032 if (!(flags & LDLM_FL_TEST_LOCK))
3033 file_lock->fl_type = fl_type;
/* Mirror a granted (or unlock) result into the local VFS tables. */
3035 if ((file_lock->fl_flags & FL_FLOCK) &&
3036 (rc == 0 || file_lock->fl_type == F_UNLCK))
3037 rc2 = flock_lock_file_wait(file, file_lock);
3038 if ((file_lock->fl_flags & FL_POSIX) &&
3039 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3040 !(flags & LDLM_FL_TEST_LOCK))
3041 rc2 = posix_lock_file_wait(file, file_lock);
3043 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* Local bookkeeping failed: release the server lock (LCK_NL acts
 * as unlock, per the comment above) to stay consistent. */
3044 einfo.ei_mode = LCK_NL;
3045 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3050 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name (length @namelen) under directory @parent by
 * issuing a getattr-by-name RPC to the MDT.
 * \param[out] fid   filled with the child's FID on success
 * \retval 0 on success, negative errno on failure.
 */
3055 int ll_get_fid_by_name(struct inode *parent, const char *name,
3056 int namelen, struct lu_fid *fid)
3058 struct md_op_data *op_data = NULL;
3059 struct mdt_body *body;
3060 struct ptlrpc_request *req;
3064 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3065 LUSTRE_OPC_ANY, NULL);
3066 if (IS_ERR(op_data))
3067 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the server. */
3069 op_data->op_valid = OBD_MD_FLID;
3070 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3071 ll_finish_md_op_data(op_data);
3075 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3077 GOTO(out_req, rc = -EFAULT);
3079 *fid = body->mbo_fid1;
3081 ptlrpc_req_finished(req);
/*
 * Migrate the directory entry @name under @parent to MDT @mdtidx.
 * Resolves the child's FID (from the dcache if possible, else via
 * ll_get_fid_by_name()), skips the move when the entry already lives on
 * the target MDT, and otherwise issues a CLI_MIGRATE rename RPC.
 */
3085 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3086 const char *name, int namelen)
3088 struct dentry *dchild = NULL;
3089 struct inode *child_inode = NULL;
3090 struct md_op_data *op_data;
3091 struct ptlrpc_request *request = NULL;
3096 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3097 name, PFID(ll_inode2fid(parent)), mdtidx);
3099 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3100 0, LUSTRE_OPC_ANY, NULL);
3101 if (IS_ERR(op_data))
3102 RETURN(PTR_ERR(op_data));
3104 /* Get child FID first */
3105 qstr.hash = full_name_hash(name, namelen);
3108 dchild = d_lookup(file->f_dentry, &qstr);
3109 if (dchild != NULL && dchild->d_inode != NULL) {
3110 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
/* NOTE(review): this inner d_inode check is redundant — the outer
 * condition already guarantees it is non-NULL. Harmless as-is. */
3111 if (dchild->d_inode != NULL) {
/* Hold the child across the migration; aliases must be dropped
 * since the entry's MDT location is about to change. */
3112 child_inode = igrab(dchild->d_inode);
3113 ll_invalidate_aliases(child_inode);
3117 rc = ll_get_fid_by_name(parent, name, namelen,
3123 if (!fid_is_sane(&op_data->op_fid3)) {
3124 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3125 ll_get_fsname(parent->i_sb, NULL, 0), name,
3126 PFID(&op_data->op_fid3));
3127 GOTO(out_free, rc = -EINVAL);
3130 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* Already on the target MDT: nothing to do. */
3135 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3136 PFID(&op_data->op_fid3), mdtidx);
3137 GOTO(out_free, rc = 0);
3140 op_data->op_mds = mdtidx;
/* Migration is implemented as a special same-name rename. */
3141 op_data->op_cli_flags = CLI_MIGRATE;
3142 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3143 namelen, name, namelen, &request);
3145 ll_update_times(request, parent);
3147 ptlrpc_req_finished(request);
3152 if (child_inode != NULL) {
/* The old inode is stale after migration; force re-lookup. */
3153 clear_nlink(child_inode);
3157 ll_finish_md_op_data(op_data);
/*
 * Lock handler for "-o noflock" mounts: file locking is disabled.
 * NOTE(review): the body is not visible here; per the comment above
 * ll_file_operations_noflock it presumably returns -ENOSYS — confirm.
 */
3162 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3170 * test if some locks matching bits and l_req_mode are acquired
3171 * - bits can be in different locks
3172 * - if found clear the common lock bits in *bits
3173 * - the bits not found, are kept in *bits
3175 * \param bits [IN] searched lock bits [IN]
3176 * \param l_req_mode [IN] searched lock mode
3177 * \retval boolean, true iff all bits are found
3179 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3181 struct lustre_handle lockh;
3182 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against the full mode mask. */
3183 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3184 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3193 fid = &ll_i2info(inode)->lli_fid;
3194 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3195 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, never take a reference on a matched lock. */
3197 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit individually; stop early once all
 * requested bits have been accounted for. */
3198 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3199 policy.l_inodebits.bits = *bits & (1 << i);
3200 if (policy.l_inodebits.bits == 0)
3203 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3204 &policy, mode, &lockh)) {
3205 struct ldlm_lock *lock;
3207 lock = ldlm_handle2lock(&lockh);
/* Clear every bit covered by the matched lock, not just the
 * probed one — one lock may carry several bits. */
3210 ~(lock->l_policy_data.l_inodebits.bits);
3211 LDLM_LOCK_PUT(lock);
3213 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and, unlike ll_have_md_lock(), reference) a granted MD lock on
 * @inode covering inodebits @bits with one of the modes in @mode.
 * On success the matched mode is returned and @lockh holds a reference
 * the caller must drop with ldlm_lock_decref(); 0 means no match.
 */
3220 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3221 struct lustre_handle *lockh, __u64 flags,
3224 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3229 fid = &ll_i2info(inode)->lli_fid;
3230 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3232 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3233 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of a revalidation RPC: -ENOENT on a
 * non-regular, non-directory inode is treated as "already unlinked" and
 * normalized; other errors are logged (quietly for EACCES/EIDRM).
 */
3238 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3240 /* Already unlinked. Just update nlink and return success */
3241 if (rc == -ENOENT) {
3243 /* This path cannot be hit for regular files unless in
3244 * case of obscure races, so no need to to validate
3246 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3248 } else if (rc != 0) {
/* Permission/identity errors are expected noise: log at D_INFO. */
3249 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3250 "%s: revalidate FID "DFID" error: rc = %d\n",
3251 ll_get_fsname(inode->i_sb, NULL, 0),
3252 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate the MD attributes protected by inodebits @ibits.
 * Two strategies: with OBD_CONNECT_ATTRFID, an intent getattr/lookup by
 * FID (which also refreshes dcache state); otherwise, a plain
 * md_getattr() — but only if no covering MD lock is already held.
 */
3258 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3260 struct inode *inode = dentry->d_inode;
3261 struct ptlrpc_request *req = NULL;
3262 struct obd_export *exp;
3266 LASSERT(inode != NULL);
3268 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3269 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3271 exp = ll_i2mdexp(inode);
3273 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3274 * But under CMD case, it caused some lock issues, should be fixed
3275 * with new CMD ibits lock. See bug 12718 */
3276 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3277 struct lookup_intent oit = { .it_op = IT_GETATTR };
3278 struct md_op_data *op_data;
/* A pure LOOKUP-bit request downgrades the intent accordingly. */
3280 if (ibits == MDS_INODELOCK_LOOKUP)
3281 oit.it_op = IT_LOOKUP;
3283 /* Call getattr by fid, so do not provide name at all. */
3284 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3285 dentry->d_inode, NULL, 0, 0,
3286 LUSTRE_OPC_ANY, NULL);
3287 if (IS_ERR(op_data))
3288 RETURN(PTR_ERR(op_data));
3290 rc = md_intent_lock(exp, op_data, &oit, &req,
3291 &ll_md_blocking_ast, 0);
3292 ll_finish_md_op_data(op_data);
3294 rc = ll_inode_revalidate_fini(inode, rc);
3298 rc = ll_revalidate_it_finish(req, &oit, dentry);
3300 ll_intent_release(&oit);
3304 /* Unlinked? Unhash dentry, so it is not picked up later by
3305 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3306 here to preserve get_cwd functionality on 2.6.
3308 if (!dentry->d_inode->i_nlink)
3309 d_lustre_invalidate(dentry, 0);
3311 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID support: only issue an RPC if no cached MD lock
 * already covers the requested bits. */
3312 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3313 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3314 obd_valid valid = OBD_MD_FLGETATTR;
3315 struct md_op_data *op_data;
/* Regular files also need striping (EA) data sized via the
 * default MD size. */
3318 if (S_ISREG(inode->i_mode)) {
3319 rc = ll_get_default_mdsize(sbi, &ealen);
3322 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3325 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3326 0, ealen, LUSTRE_OPC_ANY,
3328 if (IS_ERR(op_data))
3329 RETURN(PTR_ERR(op_data));
3331 op_data->op_valid = valid;
3332 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3333 * capa for this inode. Because we only keep capas of dirs
3335 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3336 ll_finish_md_op_data(op_data);
3338 rc = ll_inode_revalidate_fini(inode, rc);
3342 rc = ll_prep_inode(&inode, req, NULL, NULL);
3345 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe MD attributes
 * (size, nlink, a/m/ctime) from all stripes into the local inode info.
 */
3349 static int ll_merge_md_attr(struct inode *inode)
3351 struct cl_attr attr = { 0 };
3354 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3355 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
/* Cache merged values; ll_getattr() reads them for striped dirs. */
3360 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3361 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3363 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3364 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3365 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MD attributes via __ll_inode_revalidate(),
 * then bring size/timestamps up to date — merged stripe attributes for
 * striped directories, a glimpse of the OSTs for regular files.
 */
3371 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3373 struct inode *inode = dentry->d_inode;
3377 rc = __ll_inode_revalidate(dentry, ibits);
3381 /* if object isn't regular file, don't validate size */
3382 if (!S_ISREG(inode->i_mode)) {
3383 if (S_ISDIR(inode->i_mode) &&
3384 ll_i2info(inode)->lli_lsm_md != NULL) {
3385 rc = ll_merge_md_attr(inode);
/* Non-regular objects take their times straight from the LVB. */
3390 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3391 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3392 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3394 /* In case of restore, the MDT has the right size and has
3395 * already send it back without granting the layout lock,
3396 * inode is up-to-date so glimpse is useless.
3397 * Also to glimpse we need the layout, in case of a running
3398 * restore the MDT holds the layout lock so the glimpse will
3399 * block up to the end of restore (getattr will block)
3401 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3402 rc = ll_glimpse_size(inode);
/*
 * ->getattr() handler: revalidate UPDATE|LOOKUP bits, then populate
 * *stat from the (now fresh) inode.  Striped directories report the
 * merged size/nlink cached by ll_merge_md_attr().
 */
3407 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3409 struct inode *inode = de->d_inode;
3410 struct ll_sb_info *sbi = ll_i2sbi(inode);
3411 struct ll_inode_info *lli = ll_i2info(inode);
3414 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3415 MDS_INODELOCK_LOOKUP);
3416 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3421 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland gets a squashed inode number built from the FID. */
3422 if (ll_need_32bit_api(sbi))
3423 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3425 stat->ino = inode->i_ino;
3426 stat->mode = inode->i_mode;
3427 stat->uid = inode->i_uid;
3428 stat->gid = inode->i_gid;
3429 stat->rdev = inode->i_rdev;
3430 stat->atime = inode->i_atime;
3431 stat->mtime = inode->i_mtime;
3432 stat->ctime = inode->i_ctime;
3433 stat->blksize = 1 << inode->i_blkbits;
3434 stat->blocks = inode->i_blocks;
/* Striped dir: use merged values cached on the inode info. */
3436 if (S_ISDIR(inode->i_mode) &&
3437 ll_i2info(inode)->lli_lsm_md != NULL) {
3438 stat->nlink = lli->lli_stripe_dir_nlink;
3439 stat->size = lli->lli_stripe_dir_size;
3441 stat->nlink = inode->i_nlink;
3442 stat->size = i_size_read(inode);
/*
 * ->fiemap() handler: bridge the kernel's fiemap_extent_info to
 * Lustre's ll_user_fiemap, run ll_do_fiemap(), and copy the mapped
 * extents back to the caller's buffer.
 */
3448 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3449 __u64 start, __u64 len)
3453 struct ll_user_fiemap *fiemap;
3454 unsigned int extent_count = fieinfo->fi_extents_max;
3456 num_bytes = sizeof(*fiemap) + (extent_count *
3457 sizeof(struct ll_fiemap_extent));
3458 OBD_ALLOC_LARGE(fiemap, num_bytes);
3463 fiemap->fm_flags = fieinfo->fi_flags;
3464 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3465 fiemap->fm_start = start;
3466 fiemap->fm_length = len;
/* Only the first extent is copied in: it may carry continuation
 * state (fe_logical of a prior call) for the mapping request. */
3467 if (extent_count > 0)
3468 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3469 sizeof(struct ll_fiemap_extent));
3471 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Copy results (flags, count, and all mapped extents) back out. */
3473 fieinfo->fi_flags = fiemap->fm_flags;
3474 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3475 if (extent_count > 0)
3476 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3477 fiemap->fm_mapped_extents *
3478 sizeof(struct ll_fiemap_extent));
3480 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl() handler: return a referenced copy of the cached POSIX
 * ACL.  Only ACL data cached in lli_posix_acl is served; @type is not
 * inspected here.
 */
3484 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3486 struct ll_inode_info *lli = ll_i2info(inode);
3487 struct posix_acl *acl = NULL;
3490 spin_lock(&lli->lli_lock);
3491 /* VFS' acl_permission_check->check_acl will release the refcount */
3492 acl = posix_acl_dup(lli->lli_posix_acl);
3493 spin_unlock(&lli->lli_lock);
3498 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ACL callback for generic_permission() on kernels without the 2-arg
 * variant.  Without CONFIG_FS_POSIX_ACL this compiles to a stub.
 */
3500 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3501 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3503 ll_check_acl(struct inode *inode, int mask)
3506 # ifdef CONFIG_FS_POSIX_ACL
3507 struct posix_acl *acl;
3511 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* RCU walk cannot block on ACL retrieval; bail out here. */
3512 if (flags & IPERM_FLAG_RCU)
3515 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3520 rc = posix_acl_permission(inode, acl, mask);
3521 posix_acl_release(acl);
3524 # else /* !CONFIG_FS_POSIX_ACL */
3526 # endif /* CONFIG_FS_POSIX_ACL */
3528 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
3530 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/*
 * ->permission() handler.  Revalidates the root inode on first access,
 * applies root-squash by temporarily overriding the task credentials,
 * and delegates to remote-permission checking or generic_permission().
 */
3531 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3533 # ifdef HAVE_INODE_PERMISION_2ARGS
3534 int ll_inode_permission(struct inode *inode, int mask)
3536 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3541 struct ll_sb_info *sbi;
3542 struct root_squash_info *squash;
3543 struct cred *cred = NULL;
3544 const struct cred *old_cred = NULL;
3546 bool squash_id = false;
/* RCU-walk lookups may not block; force a ref-walk retry instead. */
3549 #ifdef MAY_NOT_BLOCK
3550 if (mask & MAY_NOT_BLOCK)
3552 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3553 if (flags & IPERM_FLAG_RCU)
3557 /* as root inode are NOT getting validated in lookup operation,
3558 * need to do it before permission check. */
3560 if (inode == inode->i_sb->s_root->d_inode) {
3561 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3562 MDS_INODELOCK_LOOKUP);
3567 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3568 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3570 /* squash fsuid/fsgid if needed */
3571 sbi = ll_i2sbi(inode);
3572 squash = &sbi->ll_squash;
/* Squash only applies to root (fsuid 0) and only when the mount
 * was not flagged norootsquash. */
3573 if (unlikely(squash->rsi_uid != 0 &&
3574 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3575 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3579 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3580 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3581 squash->rsi_uid, squash->rsi_gid);
3583 /* update current process's credentials
3584 * and FS capability */
3585 cred = prepare_creds();
3589 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3590 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop all filesystem-related capabilities while squashed. */
3591 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3592 if ((1 << cap) & CFS_CAP_FS_MASK)
3593 cap_lower(cred->cap_effective, cap);
3595 old_cred = override_creds(cred);
3598 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3600 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3601 rc = lustre_check_remote_perm(inode, mask);
3603 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3605 /* restore current process's credentials and FS capability */
3607 revert_creds(old_cred);
3614 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock, so the kernel's local
 * (single-node) flock semantics apply. */
3615 struct file_operations ll_file_operations = {
3616 .read = ll_file_read,
3617 .aio_read = ll_file_aio_read,
3618 .write = ll_file_write,
3619 .aio_write = ll_file_aio_write,
3620 .unlocked_ioctl = ll_file_ioctl,
3621 .open = ll_file_open,
3622 .release = ll_file_release,
3623 .mmap = ll_file_mmap,
3624 .llseek = ll_file_seek,
3625 .splice_read = ll_file_splice_read,
/* "-o flock" variant: cluster-coherent locking via ll_file_flock for
 * both flock() (.flock) and fcntl/POSIX (.lock). */
3630 struct file_operations ll_file_operations_flock = {
3631 .read = ll_file_read,
3632 .aio_read = ll_file_aio_read,
3633 .write = ll_file_write,
3634 .aio_write = ll_file_aio_write,
3635 .unlocked_ioctl = ll_file_ioctl,
3636 .open = ll_file_open,
3637 .release = ll_file_release,
3638 .mmap = ll_file_mmap,
3639 .llseek = ll_file_seek,
3640 .splice_read = ll_file_splice_read,
3643 .flock = ll_file_flock,
3644 .lock = ll_file_flock
3647 /* These are for -o noflock - to return ENOSYS on flock calls */
3648 struct file_operations ll_file_operations_noflock = {
3649 .read = ll_file_read,
3650 .aio_read = ll_file_aio_read,
3651 .write = ll_file_write,
3652 .aio_write = ll_file_aio_write,
3653 .unlocked_ioctl = ll_file_ioctl,
3654 .open = ll_file_open,
3655 .release = ll_file_release,
3656 .mmap = ll_file_mmap,
3657 .llseek = ll_file_seek,
3658 .splice_read = ll_file_splice_read,
3661 .flock = ll_file_noflock,
3662 .lock = ll_file_noflock
/* inode_operations shared by regular Lustre files. */
3665 struct inode_operations ll_file_inode_operations = {
3666 .setattr = ll_setattr,
3667 .getattr = ll_getattr,
3668 .permission = ll_inode_permission,
3669 .setxattr = ll_setxattr,
3670 .getxattr = ll_getxattr,
3671 .listxattr = ll_listxattr,
3672 .removexattr = ll_removexattr,
3673 .fiemap = ll_fiemap,
3674 #ifdef HAVE_IOP_GET_ACL
3675 .get_acl = ll_get_acl,
3679 /* dynamic ioctl number support routins */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by a rw_semaphore (readers iterate in
 * ll_iocontrol_call, writers register/unregister). */
3680 static struct llioc_ctl_data {
3681 struct rw_semaphore ioc_sem;
3682 struct list_head ioc_head;
3684 __RWSEM_INITIALIZER(llioc.ioc_sem),
3685 LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback plus the ioctl command numbers it
 * handles, stored inline in the trailing iocd_cmd[] array. */
3690 struct list_head iocd_list;
3691 unsigned int iocd_size;
3692 llioc_callback_t iocd_cb;
3693 unsigned int iocd_count;
3694 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler @cb for @count command numbers in
 * @cmd.  Returns an opaque handle (the allocation itself) to pass to
 * ll_iocontrol_unregister(), or NULL on bad args / allocation failure.
 */
3697 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3700 struct llioc_data *in_data = NULL;
3703 if (cb == NULL || cmd == NULL ||
3704 count > LLIOC_MAX_CMD || count < 0)
/* Trailing flexible array sized for @count command numbers. */
3707 size = sizeof(*in_data) + count * sizeof(unsigned int);
3708 OBD_ALLOC(in_data, size);
3709 if (in_data == NULL)
3712 memset(in_data, 0, sizeof(*in_data));
3713 in_data->iocd_size = size;
3714 in_data->iocd_cb = cb;
3715 in_data->iocd_count = count;
3716 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3718 down_write(&llioc.ioc_sem);
3719 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3720 up_write(&llioc.ioc_sem);
/*
 * Remove a handler previously returned by ll_iocontrol_register().
 * @magic is compared against the registration pointers; a miss is only
 * logged (CWARN), not treated as an error.
 */
3725 void ll_iocontrol_unregister(void *magic)
3727 struct llioc_data *tmp;
3732 down_write(&llioc.ioc_sem);
3733 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Size is captured before the entry is freed below. */
3735 unsigned int size = tmp->iocd_size;
3737 list_del(&tmp->iocd_list);
3738 up_write(&llioc.ioc_sem);
3740 OBD_FREE(tmp, size);
3744 up_write(&llioc.ioc_sem);
3746 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3749 EXPORT_SYMBOL(ll_iocontrol_register);
3750 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl @cmd through the registered dynamic
 * handlers.  The first handler claiming the command runs; iteration
 * stops when a callback returns LLIOC_STOP.  *rcp receives the
 * handler's result (default -EINVAL when nothing matched).
 */
3752 static enum llioc_iter
3753 ll_iocontrol_call(struct inode *inode, struct file *file,
3754 unsigned int cmd, unsigned long arg, int *rcp)
3756 enum llioc_iter ret = LLIOC_CONT;
3757 struct llioc_data *data;
3758 int rc = -EINVAL, i;
3760 down_read(&llioc.ioc_sem);
3761 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3762 for (i = 0; i < data->iocd_count; i++) {
3763 if (cmd != data->iocd_cmd[i])
3766 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3770 if (ret == LLIOC_STOP)
3773 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack via
 * cl_conf_set().  For OBJECT_CONF_SET the layout lock is made
 * matchable only after the layout is applied, and the inode's layout
 * generation is updated from the new stripe md.
 */
3780 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3782 struct ll_inode_info *lli = ll_i2info(inode);
3783 struct cl_env_nest nest;
3788 if (lli->lli_clob == NULL)
3791 env = cl_env_nested_get(&nest);
3793 RETURN(PTR_ERR(env));
3795 result = cl_conf_set(env, lli->lli_clob, conf);
3796 cl_env_nested_put(&nest, env);
3798 if (conf->coc_opc == OBJECT_CONF_SET) {
3799 struct ldlm_lock *lock = conf->coc_lock;
3801 LASSERT(lock != NULL);
3802 LASSERT(ldlm_has_layout(lock));
3804 struct lustre_md *md = conf->u.coc_md;
/* An absent stripe md means an empty-layout generation. */
3805 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3807 /* it can only be allowed to match after layout is
3808 * applied to inode otherwise false layout would be
3809 * seen. Applying layout shoud happen before dropping
3810 * the intent lock. */
3811 ldlm_lock_allow_match(lock);
3813 lli->lli_has_smd = lsm_has_objects(md->lsm);
3814 if (md->lsm != NULL)
3815 gen = md->lsm->lsm_layout_gen;
3818 DFID ": layout version change: %u -> %u\n",
3819 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3821 ll_layout_version_set(lli, gen);
3827 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3828 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3831 struct ll_sb_info *sbi = ll_i2sbi(inode);
3832 struct obd_capa *oc;
3833 struct ptlrpc_request *req;
3834 struct mdt_body *body;
3841 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3842 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3843 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already delivered in the lock's LVB: nothing to fetch. */
3845 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3848 /* if layout lock was granted right away, the layout is returned
3849 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3850 * blocked and then granted via completion ast, we have to fetch
3851 * layout here. Please note that we can't use the LVB buffer in
3852 * completion AST because it doesn't have a large enough buffer */
3853 oc = ll_mdscapa_get(inode);
3854 rc = ll_get_default_mdsize(sbi, &lmmsize);
/* Layout travels as the LOV xattr of the inode. */
3856 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3857 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3863 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3865 GOTO(out, rc = -EPROTO);
3867 lmmsize = body->mbo_eadatasize;
3868 if (lmmsize == 0) /* empty layout */
3871 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3873 GOTO(out, rc = -EFAULT);
/* Copy out of the RPC buffer; the copy becomes the lock's LVB. */
3875 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3876 if (lvbdata == NULL)
3877 GOTO(out, rc = -ENOMEM);
3879 memcpy(lvbdata, lmm, lmmsize);
3880 lock_res_and_lock(lock);
/* Replace any stale LVB under the resource lock. */
3881 if (lock->l_lvb_data != NULL)
3882 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3884 lock->l_lvb_data = lvbdata;
3885 lock->l_lvb_len = lmmsize;
3886 unlock_res_and_lock(lock);
3891 ptlrpc_req_finished(req);
3896 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Unpack the layout carried in @lockh's LVB, configure the cl_object
 * with it, publish the new layout generation in *gen, and drop the
 * lock reference.  With @reconf set, an already-applied layout
 * (lvb_ready) short-circuits; otherwise a fetch+set cycle runs, and if
 * busy IO blocks the reconfiguration we wait (OBJECT_CONF_WAIT).
 */
3899 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3900 struct inode *inode, __u32 *gen, bool reconf)
3902 struct ll_inode_info *lli = ll_i2info(inode);
3903 struct ll_sb_info *sbi = ll_i2sbi(inode);
3904 struct ldlm_lock *lock;
3905 struct lustre_md md = { NULL };
3906 struct cl_object_conf conf;
3909 bool wait_layout = false;
3912 LASSERT(lustre_handle_is_used(lockh));
3914 lock = ldlm_handle2lock(lockh);
3915 LASSERT(lock != NULL);
3916 LASSERT(ldlm_has_layout(lock));
3918 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3919 PFID(&lli->lli_fid), inode, reconf);
3921 /* in case this is a caching lock and reinstate with new inode */
3922 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3924 lock_res_and_lock(lock);
3925 lvb_ready = ldlm_is_lvb_ready(lock);
3926 unlock_res_and_lock(lock);
3927 /* checking lvb_ready is racy but this is okay. The worst case is
3928 * that multi processes may configure the file on the same time. */
3930 if (lvb_ready || !reconf) {
3933 /* layout_gen must be valid if layout lock is not
3934 * cancelled and stripe has already set */
3935 *gen = ll_layout_version_get(lli);
3941 rc = ll_layout_fetch(inode, lock);
3945 /* for layout lock, lmm is returned in lock's lvb.
3946 * lvb_data is immutable if the lock is held so it's safe to access it
3947 * without res lock. See the description in ldlm_lock_decref_internal()
3948 * for the condition to free lvb_data of layout lock */
3949 if (lock->l_lvb_data != NULL) {
3950 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3951 lock->l_lvb_data, lock->l_lvb_len);
3953 *gen = LL_LAYOUT_GEN_EMPTY;
3955 *gen = md.lsm->lsm_layout_gen;
3958 CERROR("%s: file "DFID" unpackmd error: %d\n",
3959 ll_get_fsname(inode->i_sb, NULL, 0),
3960 PFID(&lli->lli_fid), rc);
3966 /* set layout to file. Unlikely this will fail as old layout was
3967 * surely eliminated */
3968 memset(&conf, 0, sizeof conf);
3969 conf.coc_opc = OBJECT_CONF_SET;
3970 conf.coc_inode = inode;
3971 conf.coc_lock = lock;
3972 conf.u.coc_md = &md;
3973 rc = ll_layout_conf(inode, &conf);
3976 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3978 /* refresh layout failed, need to wait */
3979 wait_layout = rc == -EBUSY;
/* Drop the reference taken by ldlm_handle2lock() and the enqueue
 * reference on the layout lock itself. */
3983 LDLM_LOCK_PUT(lock);
3984 ldlm_lock_decref(lockh, mode);
3986 /* wait for IO to complete if it's still being used. */
3988 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3989 ll_get_fsname(inode->i_sb, NULL, 0),
3990 PFID(&lli->lli_fid), inode);
3992 memset(&conf, 0, sizeof conf);
3993 conf.coc_opc = OBJECT_CONF_WAIT;
3994 conf.coc_inode = inode;
3995 rc = ll_layout_conf(inode, &conf);
3999 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4000 ll_get_fsname(inode->i_sb, NULL, 0),
4001 PFID(&lli->lli_fid), rc);
4007 * This function checks if there exists a LAYOUT lock on the client side,
4008 * or enqueues it if it doesn't have one in cache.
4010 * This function will not hold layout lock so it may be revoked any time after
4011 * this function returns. Any operation that depends on the layout should be redone
4014 * This function should be called before lov_io_init() to get an uptodate
4015 * layout version, the caller should save the version number and after IO
4016 * is finished, this function should be called again to verify that layout
4017 * is not changed during IO time.
4019 int ll_layout_refresh(struct inode *inode, __u32 *gen)
/* Refresh the layout generation of @inode into *gen. If no valid
 * generation is cached, a LAYOUT ibits lock is matched locally or
 * enqueued from the MDT and then applied via ll_layout_lock_set().
 * The lock reference is not held on return, so the layout may be
 * revoked at any time afterwards. */
4021 struct ll_inode_info *lli = ll_i2info(inode);
4022 struct ll_sb_info *sbi = ll_i2sbi(inode);
4023 struct md_op_data *op_data;
4024 struct lookup_intent it;
4025 struct lustre_handle lockh;
/* enqueue an inode-bits DLM lock using the standard llite blocking
 * and completion callbacks */
4027 struct ldlm_enqueue_info einfo = {
4028 .ei_type = LDLM_IBITS,
4030 .ei_cb_bl = &ll_md_blocking_ast,
4031 .ei_cb_cp = &ldlm_completion_ast,
/* fast path: generation already valid, or layout locks are disabled
 * for this mount (LL_SBI_LAYOUT_LOCK unset) */
4036 *gen = ll_layout_version_get(lli);
4037 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
/* layout locks only apply to regular files with a sane FID */
4041 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4042 LASSERT(S_ISREG(inode->i_mode));
4044 /* take layout lock mutex to enqueue layout lock exclusively. */
4045 mutex_lock(&lli->lli_layout_mutex);
4048 /* the layout lock is usually cached on the local side, so try to
4049 * match an existing lock before enqueueing a new one from the MDT. */
4050 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4051 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4052 if (mode != 0) { /* hit cached lock */
4053 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4057 mutex_unlock(&lli->lli_layout_mutex);
4061 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4062 0, 0, LUSTRE_OPC_ANY, NULL);
4063 if (IS_ERR(op_data)) {
/* drop the mutex before bailing out with the prep error */
4064 mutex_unlock(&lli->lli_layout_mutex);
4065 RETURN(PTR_ERR(op_data));
4068 /* have to enqueue one */
4069 memset(&it, 0, sizeof(it));
4070 it.it_op = IT_LAYOUT;
4071 lockh.cookie = 0ULL;
4073 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4074 ll_get_fsname(inode->i_sb, NULL, 0),
4075 PFID(&lli->lli_fid), inode);
/* enqueue the LAYOUT lock; the intent request is no longer needed
 * once the reply has been processed, so release it here */
4077 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
4078 if (it.d.lustre.it_data != NULL)
4079 ptlrpc_req_finished(it.d.lustre.it_data);
4080 it.d.lustre.it_data = NULL;
4082 ll_finish_md_op_data(op_data);
/* detach the granted mode from the intent so ll_intent_drop_lock()
 * does not release the reference we still need below */
4084 mode = it.d.lustre.it_lock_mode;
4085 it.d.lustre.it_lock_mode = 0;
4086 ll_intent_drop_lock(&it);
4089 /* set lock data in case this is a new lock */
4090 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4091 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4095 mutex_unlock(&lli->lli_layout_mutex);
4101 * This function sends a restore request to the MDT
4103 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4105 struct hsm_user_request *hur;
4109 len = sizeof(struct hsm_user_request) +
4110 sizeof(struct hsm_user_item);
4111 OBD_ALLOC(hur, len);
4115 hur->hur_request.hr_action = HUA_RESTORE;
4116 hur->hur_request.hr_archive_id = 0;
4117 hur->hur_request.hr_flags = 0;
4118 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4119 sizeof(hur->hur_user_item[0].hui_fid));
4120 hur->hur_user_item[0].hui_extent.offset = offset;
4121 hur->hur_user_item[0].hui_extent.length = length;
4122 hur->hur_request.hr_itemcount = 1;
4123 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,