4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include <linux/sched.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51 #include <lustre_ioctl.h>
53 #include "cl_object.h"
/*
 * Forward declarations for static helpers defined later in this file.
 * NOTE(review): the return type of ll_put_grouplock and the trailing
 * parameters of ll_lease_close are on lines elided from this view.
 */
56 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
58 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
61 static enum llioc_iter
62 ll_iocontrol_call(struct inode *inode, struct file *file,
63 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate and minimally initialize a per-open ll_file_data from the
 * dedicated slab cache (GFP_NOFS: may be called under fs reclaim).
 * NOTE(review): the NULL-check/return path is elided from this view —
 * presumably returns NULL on allocation failure; confirm in full source.
 */
65 static struct ll_file_data *ll_file_data_get(void)
67 struct ll_file_data *fd;
69 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
73 fd->fd_write_failed = false;
/* Release a ll_file_data back to its slab cache; pairs with
 * ll_file_data_get(). */
78 static void ll_file_data_put(struct ll_file_data *fd)
81 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes (mode, a/m/ctime, size, blocks,
 * flags), the current IO epoch and the MDS open handle @fh into
 * @op_data for a close/setattr request to the MDS.
 */
84 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
85 struct lustre_handle *fh)
87 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
88 op_data->op_attr.ia_mode = inode->i_mode;
89 op_data->op_attr.ia_atime = inode->i_atime;
90 op_data->op_attr.ia_mtime = inode->i_mtime;
91 op_data->op_attr.ia_ctime = inode->i_ctime;
92 op_data->op_attr.ia_size = i_size_read(inode);
93 op_data->op_attr_blocks = inode->i_blocks;
/* iattr has no flags field; Lustre overlays ll_iattr to carry the
 * ext2-style inode flags to the server. */
94 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
95 ll_inode_to_ext_flags(inode->i_flags);
96 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
98 op_data->op_handle = *fh;
99 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Tell the MDS there is dirty data so it refreshes ctime/mtime on close. */
101 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
102 op_data->op_bias |= MDS_DATA_MODIFIED;
106 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Prepare @op_data for the MDS close of @och: mark which attributes are
 * valid, close the IO epoch for write opens, and pack the open handle.
 */
109 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
110 struct obd_client_handle *och)
114 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
115 ATTR_MTIME | ATTR_MTIME_SET |
116 ATTR_CTIME | ATTR_CTIME_SET;
/* Size/blocks and epoch-close handling below apply to write opens only;
 * NOTE(review): the branch taken for read opens is elided from this view. */
118 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the client's
 * size/blocks are authoritative and can be sent directly. */
121 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
122 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
124 ll_ioepoch_close(inode, op_data, &och, 0);
127 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
128 ll_prep_md_op_data(op_data, inode, NULL, NULL,
129 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send the MDS close RPC for open handle @och on @inode and tear down
 * the client-side open state.
 *
 * If @data_version is non-NULL this close is an HSM release: the data
 * version is packed and MDS_HSM_RELEASE is set so the MDT can verify the
 * file was not modified before releasing it.
 *
 * NOTE(review): many error/exit lines are elided from this view; the
 * comments below describe only the visible paths.
 */
133 static int ll_close_inode_openhandle(struct obd_export *md_exp,
135 struct obd_client_handle *och,
136 const __u64 *data_version)
138 struct obd_export *exp = ll_i2mdexp(inode);
139 struct md_op_data *op_data;
140 struct ptlrpc_request *req = NULL;
141 struct obd_device *obd = class_exp2obd(exp);
148 * XXX: in case of LMV, is this correct to access
151 CERROR("Invalid MDC connection handle "LPX64"\n",
152 ll_i2mdexp(inode)->exp_handle.h_cookie);
156 OBD_ALLOC_PTR(op_data);
158 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
160 ll_prepare_close(inode, op_data, och);
161 if (data_version != NULL) {
162 /* Pass in data_version implies release. */
163 op_data->op_bias |= MDS_HSM_RELEASE;
164 op_data->op_data_version = *data_version;
165 op_data->op_lease_handle = och->och_lease_handle;
166 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* Remember whether ll_prepare_close() closed the epoch; checked below
 * and again on the DONE_WRITING path after the RPC. */
168 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
169 rc = md_close(md_exp, op_data, och->och_mod, &req);
171 /* This close must have the epoch closed. */
172 LASSERT(epoch_close);
173 /* MDS has instructed us to obtain Size-on-MDS attribute from
174 * OSTs and send setattr to back to MDS. */
175 rc = ll_som_update(inode, op_data);
177 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
178 " failed: rc = %d\n",
179 ll_i2mdexp(inode)->exp_obd->obd_name,
180 PFID(ll_inode2fid(inode)), rc);
184 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
185 ll_i2mdexp(inode)->exp_obd->obd_name,
186 PFID(ll_inode2fid(inode)), rc);
189 /* DATA_MODIFIED flag was successfully sent on close, cancel data
190 * modification flag. */
191 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
192 struct ll_inode_info *lli = ll_i2info(inode);
194 spin_lock(&lli->lli_lock);
195 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
196 spin_unlock(&lli->lli_lock);
/* Destroy OST objects the MDS told us to clean up in its close reply. */
200 rc = ll_objects_destroy(req, inode);
202 CERROR("%s: inode "DFID
203 " ll_objects destroy: rc = %d\n",
204 ll_i2mdexp(inode)->exp_obd->obd_name,
205 PFID(ll_inode2fid(inode)), rc);
/* For HSM release, check the MDT actually released the file. */
208 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
209 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
211 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
215 ll_finish_md_op_data(op_data);
/* SOM write close with the epoch still open: queue DONE_WRITING so the
 * size update reaches the MDS later. */
219 if (exp_connect_som(exp) && !epoch_close &&
220 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
221 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
223 md_clear_open_replay_data(md_exp, och);
224 /* Free @och if it is not waiting for DONE_WRITING. */
225 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
228 if (req) /* This is close request */
229 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle of the given open mode (write/exec/read) on
 * @inode, but only when no other local opens still reference it.
 * NOTE(review): the handle-swap under lli_och_mutex and the final return
 * are on lines elided from this view.
 */
233 int ll_md_real_close(struct inode *inode, fmode_t fmode)
235 struct ll_inode_info *lli = ll_i2info(inode);
236 struct obd_client_handle **och_p;
237 struct obd_client_handle *och;
/* Select the per-mode handle and its use count; exactly one of
 * FMODE_WRITE/FMODE_EXEC/FMODE_READ is expected. */
242 if (fmode & FMODE_WRITE) {
243 och_p = &lli->lli_mds_write_och;
244 och_usecount = &lli->lli_open_fd_write_count;
245 } else if (fmode & FMODE_EXEC) {
246 och_p = &lli->lli_mds_exec_och;
247 och_usecount = &lli->lli_open_fd_exec_count;
249 LASSERT(fmode & FMODE_READ);
250 och_p = &lli->lli_mds_read_och;
251 och_usecount = &lli->lli_open_fd_read_count;
254 mutex_lock(&lli->lli_och_mutex);
255 if (*och_usecount > 0) {
256 /* There are still users of this handle, so skip
258 mutex_unlock(&lli->lli_och_mutex);
264 mutex_unlock(&lli->lli_och_mutex);
267 /* There might be a race and this handle may already
269 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop group lock and lease state, decrement
 * the per-mode open count, and — when no matching OPEN DLM lock remains
 * cached — do the real MDS close via ll_md_real_close().
 */
276 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
279 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
280 struct ll_inode_info *lli = ll_i2info(inode);
284 /* clear group lock, if present */
285 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
286 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
288 if (fd->fd_lease_och != NULL) {
291 /* Usually the lease is not released when the
292 * application crashed, we need to release here. */
293 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
294 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
295 PFID(&lli->lli_fid), rc, lease_broken);
297 fd->fd_lease_och = NULL;
/* fd_och set means this fd owns a private open handle (lease path):
 * close it directly rather than through the shared per-mode handle. */
300 if (fd->fd_och != NULL) {
301 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
306 /* Let's see if we have good enough OPEN lock on the file and if
307 we can skip talking to MDS */
308 if (file->f_dentry->d_inode) { /* Can this ever be false? */
310 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
311 struct lustre_handle lockh;
312 struct inode *inode = file->f_dentry->d_inode;
313 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
315 mutex_lock(&lli->lli_och_mutex);
316 if (fd->fd_omode & FMODE_WRITE) {
318 LASSERT(lli->lli_open_fd_write_count);
319 lli->lli_open_fd_write_count--;
320 } else if (fd->fd_omode & FMODE_EXEC) {
322 LASSERT(lli->lli_open_fd_exec_count);
323 lli->lli_open_fd_exec_count--;
326 LASSERT(lli->lli_open_fd_read_count);
327 lli->lli_open_fd_read_count--;
329 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock (TEST_LOCK: match only, no references
 * taken): must close the MDS open handle now. */
331 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
332 LDLM_IBITS, &policy, lockmode,
334 rc = ll_md_real_close(file->f_dentry->d_inode,
338 CERROR("released file has negative dentry: file = %p, "
339 "dentry = %p, name = %s\n",
340 file, file->f_dentry, file->f_dentry->d_name.name);
344 LUSTRE_FPRIVATE(file) = NULL;
345 ll_file_data_put(fd);
346 ll_capa_close(inode);
351 /* While this returns an error code, fput() the caller does not, so we need
352 * to make every effort to clean up all of our state here. Also, applications
353 * rarely check close errors and even if an error is returned they will not
354 * re-try the close call.
/*
 * VFS ->release() entry point: per-fd teardown (remote ACL state,
 * statahead, async write errors), then the MDS close via ll_md_close().
 */
356 int ll_file_release(struct inode *inode, struct file *file)
358 struct ll_file_data *fd;
359 struct ll_sb_info *sbi = ll_i2sbi(inode);
360 struct ll_inode_info *lli = ll_i2info(inode);
364 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
365 PFID(ll_inode2fid(inode)), inode);
367 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping is only attached to the root inode. */
368 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
369 inode == inode->i_sb->s_root->d_inode) {
370 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
373 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
374 fd->fd_flags &= ~LL_FILE_RMTACL;
375 rct_del(&sbi->ll_rct, current_pid());
376 et_search_free(&sbi->ll_et, current_pid());
/* Don't count releases of the root dentry in /proc stats. */
381 if (inode->i_sb->s_root != file->f_dentry)
382 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
383 fd = LUSTRE_FPRIVATE(file);
386 /* The last ref on @file, maybe not the the owner pid of statahead.
387 * Different processes can open the same dir, "ll_opendir_key" means:
388 * it is me that should stop the statahead thread. */
389 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
390 lli->lli_opendir_pid != 0)
391 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root was opened without an MDS open handle (see ll_file_open),
 * so only local state needs freeing here. */
393 if (inode->i_sb->s_root == file->f_dentry) {
394 LUSTRE_FPRIVATE(file) = NULL;
395 ll_file_data_put(fd);
/* Surface any deferred async write error from the cl object. */
399 if (!S_ISDIR(inode->i_mode)) {
400 if (lli->lli_clob != NULL)
401 lov_read_and_clear_async_rc(lli->lli_clob);
402 lli->lli_async_rc = 0;
405 rc = ll_md_close(sbi->ll_md_exp, inode, file);
407 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
408 libcfs_debug_dumplog();
/*
 * Enqueue an OPEN intent lock with the MDS for @file (open-by-FID path;
 * the name is packed only when the server lacks OBD_CONNECT_OPEN_BY_FID).
 * On success the reply updates the inode via ll_prep_inode() and the
 * granted lock data is attached. @lmm/@lmmsize optionally carry striping.
 */
413 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
414 struct lookup_intent *itp)
416 struct dentry *de = file->f_dentry;
417 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
418 struct dentry *parent = de->d_parent;
419 const char *name = NULL;
421 struct md_op_data *op_data;
422 struct ptlrpc_request *req = NULL;
426 LASSERT(parent != NULL);
427 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
429 /* if server supports open-by-fid, or file name is invalid, don't pack
430 * name in open request */
431 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
432 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
433 name = de->d_name.name;
434 len = de->d_name.len;
437 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
438 name, len, 0, LUSTRE_OPC_ANY, NULL);
440 RETURN(PTR_ERR(op_data));
441 op_data->op_data = lmm;
442 op_data->op_data_size = lmmsize;
444 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
445 &ll_md_blocking_ast, 0);
446 ll_finish_md_op_data(op_data);
448 /* reason for keep own exit path - don`t flood log
449 * with messages with -ESTALE errors.
/* An open handle may exist even when the intent reports an error;
 * release it so it is not leaked on the MDS. */
451 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
452 it_open_error(DISP_OPEN_OPEN, itp))
454 ll_release_openhandle(de, itp);
458 if (it_disposition(itp, DISP_LOOKUP_NEG))
459 GOTO(out, rc = -ENOENT);
461 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
462 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
463 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
467 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
468 if (!rc && itp->d.lustre.it_lock_mode)
469 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
472 ptlrpc_req_finished(req);
473 ll_intent_drop_lock(itp);
479 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
480 * not believe attributes if a few ioepoch holders exist. Attributes for
481 * previous ioepoch if new one is opened are also skipped by MDS.
483 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a change: epoch 0 and repeats of the current epoch are
 * ignored. */
485 if (ioepoch && lli->lli_ioepoch != ioepoch) {
486 lli->lli_ioepoch = ioepoch;
487 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
488 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the MDS open reply carried in the
 * intent, then register it for open replay on recovery.
 * Returns the md_set_open_replay_data() result.
 */
492 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
493 struct obd_client_handle *och)
495 struct ptlrpc_request *req = it->d.lustre.it_data;
496 struct mdt_body *body;
498 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
499 och->och_fh = body->mbo_handle;
500 och->och_fid = body->mbo_fid1;
/* Cookie of the granted lock; used later if this open carries a lease. */
501 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
502 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
503 och->och_flags = it->it_flags;
505 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the local (client-side) part of an open: optionally fill @och
 * from the intent reply, record the new IO epoch, and attach @fd to the
 * file as its private data.
 */
508 static int ll_local_open(struct file *file, struct lookup_intent *it,
509 struct ll_file_data *fd, struct obd_client_handle *och)
511 struct inode *inode = file->f_dentry->d_inode;
512 struct ll_inode_info *lli = ll_i2info(inode);
515 LASSERT(!LUSTRE_FPRIVATE(file));
520 struct ptlrpc_request *req = it->d.lustre.it_data;
521 struct mdt_body *body;
524 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
528 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
529 ll_ioepoch_open(lli, body->mbo_ioepoch);
532 LUSTRE_FPRIVATE(file) = fd;
533 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the effective open mode for the close-side accounting in
 * ll_md_close(). */
534 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
536 /* ll_cl_context initialize */
537 rwlock_init(&fd->fd_lock);
538 INIT_LIST_HEAD(&fd->fd_lccs);
543 /* Open a file, and (for the very first open) create objects on the OSTs at
544 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
545 * creation or open until ll_lov_setstripe() ioctl is called.
547 * If we already have the stripe MD locally then we don't request it in
548 * md_open(), by passing a lmm_size = 0.
550 * It is up to the application to ensure no other processes open this file
551 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
552 * used. We might be able to avoid races of that sort by getting lli_open_sem
553 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
554 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() entry point. Reuses an existing per-mode MDS open handle
 * when one is cached, otherwise enqueues a fresh OPEN intent; also sets
 * up statahead ownership for directories.
 * NOTE(review): several exit/cleanup lines are elided from this view.
 */
556 int ll_file_open(struct inode *inode, struct file *file)
558 struct ll_inode_info *lli = ll_i2info(inode);
559 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
560 .it_flags = file->f_flags };
561 struct obd_client_handle **och_p = NULL;
562 __u64 *och_usecount = NULL;
563 struct ll_file_data *fd;
564 int rc = 0, opendir_set = 0;
567 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
568 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An atomic-open path may have stashed the lookup intent here. */
570 it = file->private_data; /* XXX: compat macro */
571 file->private_data = NULL; /* prevent ll_local_open assertion */
573 fd = ll_file_data_get();
575 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory claims statahead ownership. */
578 if (S_ISDIR(inode->i_mode)) {
579 spin_lock(&lli->lli_sa_lock);
580 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
581 lli->lli_opendir_pid == 0) {
582 lli->lli_opendir_key = fd;
583 lli->lli_opendir_pid = current_pid();
586 spin_unlock(&lli->lli_sa_lock);
/* The root is opened without talking to the MDS at all. */
589 if (inode->i_sb->s_root == file->f_dentry) {
590 LUSTRE_FPRIVATE(file) = fd;
594 if (!it || !it->d.lustre.it_disposition) {
595 /* Convert f_flags into access mode. We cannot use file->f_mode,
596 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: +1 maps O_RDONLY/O_WRONLY/O_RDWR onto
 * FMODE_READ/FMODE_WRITE bits. */
598 if ((oit.it_flags + 1) & O_ACCMODE)
600 if (file->f_flags & O_TRUNC)
601 oit.it_flags |= FMODE_WRITE;
603 /* kernel only call f_op->open in dentry_open. filp_open calls
604 * dentry_open after call to open_namei that checks permissions.
605 * Only nfsd_open call dentry_open directly without checking
606 * permissions and because of that this code below is safe. */
607 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
608 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
610 /* We do not want O_EXCL here, presumably we opened the file
611 * already? XXX - NFS implications? */
612 oit.it_flags &= ~O_EXCL;
614 /* bug20584, if "it_flags" contains O_CREAT, the file will be
615 * created if necessary, then "IT_CREAT" should be set to keep
616 * consistent with it */
617 if (oit.it_flags & O_CREAT)
618 oit.it_op |= IT_CREAT;
624 /* Let's see if we have file open on MDS already. */
625 if (it->it_flags & FMODE_WRITE) {
626 och_p = &lli->lli_mds_write_och;
627 och_usecount = &lli->lli_open_fd_write_count;
628 } else if (it->it_flags & FMODE_EXEC) {
629 och_p = &lli->lli_mds_exec_och;
630 och_usecount = &lli->lli_open_fd_exec_count;
632 och_p = &lli->lli_mds_read_och;
633 och_usecount = &lli->lli_open_fd_read_count;
636 mutex_lock(&lli->lli_och_mutex);
637 if (*och_p) { /* Open handle is present */
638 if (it_disposition(it, DISP_OPEN_OPEN)) {
639 /* Well, there's extra open request that we do not need,
640 let's close it somehow. This will decref request. */
641 rc = it_open_error(DISP_OPEN_OPEN, it);
643 mutex_unlock(&lli->lli_och_mutex);
644 GOTO(out_openerr, rc);
647 ll_release_openhandle(file->f_dentry, it);
/* Reuse the cached handle: local-only open, no new MDS state. */
651 rc = ll_local_open(file, it, fd, NULL);
654 mutex_unlock(&lli->lli_och_mutex);
655 GOTO(out_openerr, rc);
658 LASSERT(*och_usecount == 0);
659 if (!it->d.lustre.it_disposition) {
660 /* We cannot just request lock handle now, new ELC code
661 means that one of other OPEN locks for this file
662 could be cancelled, and since blocking ast handler
663 would attempt to grab och_mutex as well, that would
664 result in a deadlock */
665 mutex_unlock(&lli->lli_och_mutex);
667 * Normally called under two situations:
669 * 2. revalidate with IT_OPEN (revalidate doesn't
670 * execute this intent any more).
672 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
674 * Always specify MDS_OPEN_BY_FID because we don't want
675 * to get file with different fid.
677 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
678 rc = ll_intent_file_open(file, NULL, 0, it);
680 GOTO(out_openerr, rc);
684 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
686 GOTO(out_och_free, rc = -ENOMEM);
690 /* md_intent_lock() didn't get a request ref if there was an
691 * open error, so don't do cleanup on the request here
693 /* XXX (green): Should not we bail out on any error here, not
694 * just open error? */
695 rc = it_open_error(DISP_OPEN_OPEN, it);
697 GOTO(out_och_free, rc);
699 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
700 "inode %p: disposition %x, status %d\n", inode,
701 it_disposition(it, ~0), it->d.lustre.it_status);
703 rc = ll_local_open(file, it, fd, *och_p);
705 GOTO(out_och_free, rc);
707 mutex_unlock(&lli->lli_och_mutex);
710 /* Must do this outside lli_och_mutex lock to prevent deadlock where
711 different kind of OPEN lock for this same inode gets cancelled
712 by ldlm_cancel_lru */
713 if (!S_ISREG(inode->i_mode))
714 GOTO(out_och_free, rc);
/* No stripe MD yet: either object creation is deliberately delayed
 * (O_LOV_DELAY_CREATE) or this is a read-only open of a 0-stripe file. */
718 if (!lli->lli_has_smd &&
719 (cl_is_lov_delay_create(file->f_flags) ||
720 (file->f_mode & FMODE_WRITE) == 0)) {
721 CDEBUG(D_INODE, "object creation was delayed\n");
722 GOTO(out_och_free, rc);
724 cl_lov_delay_create_clear(&file->f_flags);
725 GOTO(out_och_free, rc);
729 if (och_p && *och_p) {
730 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
731 *och_p = NULL; /* OBD_FREE writes some magic there */
734 mutex_unlock(&lli->lli_och_mutex);
737 if (opendir_set != 0)
738 ll_stop_statahead(inode, lli->lli_opendir_key);
740 ll_file_data_put(fd);
742 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the open-reply request reference held by the intent. */
745 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
746 ptlrpc_req_finished(it->d.lustre.it_data);
747 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for a lease lock: on conflict, asynchronously cancel the
 * lease lock (which breaks the lease). CANCELING is a no-op here — open
 * handles are managed by the lease owner, not by this AST.
 */
753 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
754 struct ldlm_lock_desc *desc, void *data, int flag)
757 struct lustre_handle lockh;
761 case LDLM_CB_BLOCKING:
762 ldlm_lock2handle(lock, &lockh);
763 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
765 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
769 case LDLM_CB_CANCELING:
777 * Acquire a lease and open the file.
/*
 * Open @inode with a lease of mode @fmode (FMODE_READ or FMODE_WRITE
 * only). When @file is given, this fd must be the sole opener of that
 * mode so its open handle can be re-used as "old_handle" for the MDT.
 * Returns the new obd_client_handle, or an ERR_PTR on failure.
 */
779 static struct obd_client_handle *
780 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
783 struct lookup_intent it = { .it_op = IT_OPEN };
784 struct ll_sb_info *sbi = ll_i2sbi(inode);
785 struct md_op_data *op_data;
786 struct ptlrpc_request *req = NULL;
787 struct lustre_handle old_handle = { 0 };
788 struct obd_client_handle *och = NULL;
793 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
794 RETURN(ERR_PTR(-EINVAL));
797 struct ll_inode_info *lli = ll_i2info(inode);
798 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
799 struct obd_client_handle **och_p;
/* The fd must cover the requested mode and exec opens can't lease. */
802 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
803 RETURN(ERR_PTR(-EPERM));
805 /* Get the openhandle of the file */
807 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per fd. */
808 if (fd->fd_lease_och != NULL) {
809 mutex_unlock(&lli->lli_och_mutex);
813 if (fd->fd_och == NULL) {
814 if (file->f_mode & FMODE_WRITE) {
815 LASSERT(lli->lli_mds_write_och != NULL);
816 och_p = &lli->lli_mds_write_och;
817 och_usecount = &lli->lli_open_fd_write_count;
819 LASSERT(lli->lli_mds_read_och != NULL);
820 och_p = &lli->lli_mds_read_och;
821 och_usecount = &lli->lli_open_fd_read_count;
/* Take over the shared handle only when this fd is the single user;
 * NOTE(review): the actual handle transfer lines are elided here. */
823 if (*och_usecount == 1) {
830 mutex_unlock(&lli->lli_och_mutex);
831 if (rc < 0) /* more than 1 opener */
834 LASSERT(fd->fd_och != NULL);
835 old_handle = fd->fd_och->och_fh;
840 RETURN(ERR_PTR(-ENOMEM));
842 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
843 LUSTRE_OPC_ANY, NULL);
845 GOTO(out, rc = PTR_ERR(op_data));
847 /* To tell the MDT this openhandle is from the same owner */
848 op_data->op_handle = old_handle;
850 it.it_flags = fmode | open_flags;
851 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
852 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
853 &ll_md_blocking_lease_ast,
854 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
855 * it can be cancelled which may mislead applications that the lease is
857 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
858 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
859 * doesn't deal with openhandle, so normal openhandle will be leaked. */
860 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
861 ll_finish_md_op_data(op_data);
862 ptlrpc_req_finished(req);
864 GOTO(out_release_it, rc);
866 if (it_disposition(&it, DISP_LOOKUP_NEG))
867 GOTO(out_release_it, rc = -ENOENT);
869 rc = it_open_error(DISP_OPEN_OPEN, &it);
871 GOTO(out_release_it, rc);
873 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
874 ll_och_fill(sbi->ll_md_exp, &it, och);
876 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
877 GOTO(out_close, rc = -EOPNOTSUPP);
879 /* already get lease, handle lease lock */
880 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
881 if (it.d.lustre.it_lock_mode == 0 ||
882 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
883 /* open lock must return for lease */
884 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
885 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
886 it.d.lustre.it_lock_bits);
887 GOTO(out_close, rc = -EPROTO);
890 ll_intent_release(&it);
894 /* Cancel open lock */
895 if (it.d.lustre.it_lock_mode != 0) {
896 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
897 it.d.lustre.it_lock_mode);
898 it.d.lustre.it_lock_mode = 0;
899 och->och_lease_handle.cookie = 0ULL;
901 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
903 CERROR("%s: error closing file "DFID": %d\n",
904 ll_get_fsname(inode->i_sb, NULL, 0),
905 PFID(&ll_i2info(inode)->lli_fid), rc2);
906 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
908 ll_intent_release(&it);
916 * Release lease and close the file.
917 * It will check if the lease has ever broken.
919 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
922 struct ldlm_lock *lock;
/* Default to "broken" when the lock is no longer findable by handle. */
923 bool cancelled = true;
927 lock = ldlm_handle2lock(&och->och_lease_handle);
929 lock_res_and_lock(lock);
930 cancelled = ldlm_is_cancel(lock);
931 unlock_res_and_lock(lock);
935 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
936 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Lease still intact: cancel its lock ourselves before closing. */
939 ldlm_cli_cancel(&och->och_lease_handle, 0);
940 if (lease_broken != NULL)
941 *lease_broken = cancelled;
943 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
948 /* Fills the obdo with the attributes for the lsm */
/*
 * Fetch merged OST attributes for @lsm into @obdo via an async getattr
 * on a private request set. @dv_flags (LL_DV_RD/WR_FLUSH) request a
 * server-side lock so the returned data version is stable.
 */
949 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
950 struct obd_capa *capa, struct obdo *obdo,
951 __u64 ioepoch, int dv_flags)
953 struct ptlrpc_request_set *set;
954 struct obd_info oinfo = { { { 0 } } };
959 LASSERT(lsm != NULL);
963 oinfo.oi_oa->o_oi = lsm->lsm_oi;
964 oinfo.oi_oa->o_mode = S_IFREG;
965 oinfo.oi_oa->o_ioepoch = ioepoch;
966 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
967 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
968 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
969 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
970 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
971 OBD_MD_FLDATAVERSION;
972 oinfo.oi_capa = capa;
973 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
974 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
975 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
976 if (dv_flags & LL_DV_WR_FLUSH)
977 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
980 set = ptlrpc_prep_set();
982 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
985 rc = obd_getattr_async(exp, &oinfo, set);
987 rc = ptlrpc_set_wait(set);
988 ptlrpc_set_destroy(set);
/* Keep only the attribute bits the OSTs can legitimately report. */
991 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
992 OBD_MD_FLATIME | OBD_MD_FLMTIME |
993 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
994 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* A requested write flush must be confirmed by OBD_FL_FLUSH in the
 * reply; NOTE(review): the error taken otherwise is elided here. */
995 if (dv_flags & LL_DV_WR_FLUSH &&
996 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
997 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
1004 * Performs the getattr on the inode and updates its fields.
1005 * If @sync != 0, perform the getattr under the server-side lock.
1007 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
1008 __u64 ioepoch, int sync)
1010 struct obd_capa *capa = ll_mdscapa_get(inode);
1011 struct lov_stripe_md *lsm;
1015 lsm = ccc_inode_lsm_get(inode);
1016 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1017 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1020 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* Merge the OST attributes we just fetched into the VFS inode. */
1022 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1023 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1024 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1025 (unsigned long long)inode->i_blocks,
1026 (unsigned long)ll_inode_blksize(inode));
1028 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (cached in lli_lvb) with the attributes
 * currently held by the cl object (from the OSTs), taking the newest of
 * each, and update the inode's size/blocks/times under the size lock.
 */
1032 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1034 struct ll_inode_info *lli = ll_i2info(inode);
1035 struct cl_object *obj = lli->lli_clob;
1036 struct cl_attr *attr = ccc_env_thread_attr(env);
1042 ll_inode_size_lock(inode);
1043 /* merge timestamps the most recently obtained from mds with
1044 timestamps obtained from osts */
1045 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1046 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1047 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1048 inode_init_lvb(inode, &lvb);
1050 cl_object_attr_lock(obj);
1051 rc = cl_object_attr_get(env, obj, attr);
1052 cl_object_attr_unlock(obj);
/* Keep the maximum of MDS and OST timestamps for each field. */
1055 if (lvb.lvb_atime < attr->cat_atime)
1056 lvb.lvb_atime = attr->cat_atime;
1057 if (lvb.lvb_ctime < attr->cat_ctime)
1058 lvb.lvb_ctime = attr->cat_ctime;
1059 if (lvb.lvb_mtime < attr->cat_mtime)
1060 lvb.lvb_mtime = attr->cat_mtime;
1062 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1063 PFID(&lli->lli_fid), attr->cat_size);
1064 cl_isize_write_nolock(inode, attr->cat_size);
1066 inode->i_blocks = attr->cat_blocks;
1068 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1069 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1070 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1072 ll_inode_size_unlock(inode);
/*
 * ioctl helper: fetch OST attributes for @lsm and copy size/blocks/times
 * into the user-visible stat structure @st.
 */
1077 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1080 struct obdo obdo = { 0 };
1083 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1085 st->st_size = obdo.o_size;
1086 st->st_blocks = obdo.o_blocks;
1087 st->st_mtime = obdo.o_mtime;
1088 st->st_atime = obdo.o_atime;
1089 st->st_ctime = obdo.o_ctime;
/*
 * Return true if atime updates should be suppressed for this open,
 * mirroring the kernel's own checks in file_accessed()/touch_atime()
 * (O_NOATIME, S_NOATIME, mount-level noatime/ro, nodiratime on dirs).
 */
1094 static bool file_is_noatime(const struct file *file)
1096 const struct vfsmount *mnt = file->f_path.mnt;
1097 const struct inode *inode = file->f_path.dentry->d_inode;
1099 /* Adapted from file_accessed() and touch_atime().*/
1100 if (file->f_flags & O_NOATIME)
1103 if (inode->i_flags & S_NOATIME)
1106 if (IS_NOATIME(inode))
1109 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1112 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1115 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: nonblock/append/sync flags, lock requirements and noatime.
 */
1121 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1123 struct inode *inode = file->f_dentry->d_inode;
1125 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1127 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1128 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1129 file->f_flags & O_DIRECT ||
1132 io->ci_obj = ll_i2info(inode)->lli_clob;
1133 io->ci_lockreq = CILR_MAYBE;
/* nolock mount/group-lock: skip DLM extent locking entirely; O_APPEND
 * always needs a mandatory lock since the write offset is global. */
1134 if (ll_file_nolock(file)) {
1135 io->ci_lockreq = CILR_NEVER;
1136 io->ci_no_srvlock = 1;
1137 } else if (file->f_flags & O_APPEND) {
1138 io->ci_lockreq = CILR_MANDATORY;
1141 io->ci_noatime = file_is_noatime(file);
/*
 * Common driver for file reads and writes: build and run a cl_io loop
 * for @iot (CIT_READ/CIT_WRITE) over [*ppos, *ppos + count), handling
 * both normal iovec IO and splice, restart on -ENODATA, and statistics.
 * Returns bytes transferred or a negative error.
 */
1145 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1146 struct file *file, enum cl_io_type iot,
1147 loff_t *ppos, size_t count)
1149 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1150 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1155 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1156 file->f_dentry->d_name.name, iot, *ppos, count);
1159 io = ccc_env_thread_io(env);
1160 ll_io_init(io, file, iot == CIT_WRITE);
1162 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1163 struct vvp_io *vio = vvp_env_io(env);
1164 struct ccc_io *cio = ccc_env_io(env);
1165 int write_mutex_locked = 0;
1167 cio->cui_fd = LUSTRE_FPRIVATE(file);
1168 vio->cui_io_subtype = args->via_io_subtype;
1170 switch (vio->cui_io_subtype) {
1172 cio->cui_iov = args->u.normal.via_iov;
1173 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1174 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1175 cio->cui_iocb = args->u.normal.via_iocb;
/* Serialize non-group-locked writes against each other with
 * lli_write_mutex; reads/writes also take lli_trunc_sem (read side)
 * to exclude concurrent truncate. */
1176 if ((iot == CIT_WRITE) &&
1177 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1178 if (mutex_lock_interruptible(&lli->
1180 GOTO(out, result = -ERESTARTSYS);
1181 write_mutex_locked = 1;
1183 down_read(&lli->lli_trunc_sem);
1186 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1187 vio->u.splice.cui_flags = args->u.splice.via_flags;
1190 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1194 ll_cl_add(file, env, io);
1195 result = cl_io_loop(env, io);
1196 ll_cl_remove(file, env);
1198 if (args->via_io_subtype == IO_NORMAL)
1199 up_read(&lli->lli_trunc_sem);
1200 if (write_mutex_locked)
1201 mutex_unlock(&lli->lli_write_mutex);
1203 /* cl_io_rw_init() handled IO */
1204 result = io->ci_result;
1207 if (io->ci_nob > 0) {
1208 result = io->ci_nob;
1209 *ppos = io->u.ci_wr.wr.crw_pos;
1213 cl_io_fini(env, io);
1214 /* If any bit been read/written (result != 0), we just return
1215 * short read/write instead of restart io. */
1216 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1217 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1218 iot == CIT_READ ? "read" : "write",
1219 file->f_dentry->d_name.name, *ppos, count);
1220 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1224 if (iot == CIT_READ) {
1226 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1227 LPROC_LL_READ_BYTES, result);
1228 } else if (iot == CIT_WRITE) {
1230 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1231 LPROC_LL_WRITE_BYTES, result);
1232 fd->fd_write_failed = false;
1233 } else if (result != -ERESTARTSYS) {
1234 fd->fd_write_failed = true;
1237 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1244 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate a user iovec array and compute the total byte count.
 * Mirrors the kernel's segment checks: rejects negative lengths and
 * cumulative wraparound, and truncates *nr_segs at the first segment
 * that fails access_ok().  On success *count holds the usable total.
 * NOTE(review): the declaration of the accumulator (cnt) and the final
 * store/return are missing from this chunk — confirm in the full file.
 */
1246 static int ll_file_get_iov_count(const struct iovec *iov,
1247 unsigned long *nr_segs, size_t *count)
1252 for (seg = 0; seg < *nr_segs; seg++) {
1253 const struct iovec *iv = &iov[seg];
1256 * If any segment has a negative length, or the cumulative
1257 * length ever wraps negative then return -EINVAL.
/* (cnt | iov_len) < 0 catches both a negative segment and a wrapped sum. */
1260 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1262 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1267 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio_read entry point: validates the iovec, packs it into vvp_io_args
 * and dispatches a CIT_READ through ll_file_io_generic() using the
 * iocb's file and position.
 */
1274 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1275 unsigned long nr_segs, loff_t pos)
1278 struct vvp_io_args *args;
1284 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1288 env = cl_env_get(&refcheck);
1290 RETURN(PTR_ERR(env));
1292 args = vvp_env_args(env, IO_NORMAL);
/* Cast drops const: the args structure stores a mutable iovec pointer. */
1293 args->u.normal.via_iov = (struct iovec *)iov;
1294 args->u.normal.via_nrsegs = nr_segs;
1295 args->u.normal.via_iocb = iocb;
1297 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1298 &iocb->ki_pos, count);
1299 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) entry point: builds a one-segment iovec and a
 * sync kiocb in thread-local (env) storage, then reuses the aio path.
 * *ppos is updated from the kiocb position on return.
 */
1303 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1307 struct iovec *local_iov;
1308 struct kiocb *kiocb;
1313 env = cl_env_get(&refcheck);
1315 RETURN(PTR_ERR(env));
1317 local_iov = &vvp_env_info(env)->vti_local_iov;
1318 kiocb = &vvp_env_info(env)->vti_kiocb;
1319 local_iov->iov_base = (void __user *)buf;
1320 local_iov->iov_len = count;
1321 init_sync_kiocb(kiocb, file);
1322 kiocb->ki_pos = *ppos;
/* Kernel ABI difference: the byte count field was renamed ki_left ->
 * ki_nbytes across kernel versions; HAVE_KIOCB_KI_LEFT selects it. */
1323 #ifdef HAVE_KIOCB_KI_LEFT
1324 kiocb->ki_left = count;
1326 kiocb->ki_nbytes = count;
1329 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1330 *ppos = kiocb->ki_pos;
1332 cl_env_put(env, &refcheck);
1337 * Write to a file (through the page cache).
/*
 * aio_write entry point: mirror image of ll_file_aio_read() — validates
 * the iovec and dispatches CIT_WRITE through ll_file_io_generic().
 */
1340 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1341 unsigned long nr_segs, loff_t pos)
1344 struct vvp_io_args *args;
1350 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1354 env = cl_env_get(&refcheck);
1356 RETURN(PTR_ERR(env));
1358 args = vvp_env_args(env, IO_NORMAL);
/* Cast drops const: the args structure stores a mutable iovec pointer. */
1359 args->u.normal.via_iov = (struct iovec *)iov;
1360 args->u.normal.via_nrsegs = nr_segs;
1361 args->u.normal.via_iocb = iocb;
1363 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1364 &iocb->ki_pos, count);
1365 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) entry point: builds a one-segment iovec and a
 * sync kiocb, then reuses the aio write path; *ppos is updated from
 * the kiocb position afterwards.
 */
1369 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1373 struct iovec *local_iov;
1374 struct kiocb *kiocb;
1379 env = cl_env_get(&refcheck);
1381 RETURN(PTR_ERR(env));
1383 local_iov = &vvp_env_info(env)->vti_local_iov;
1384 kiocb = &vvp_env_info(env)->vti_kiocb;
1385 local_iov->iov_base = (void __user *)buf;
1386 local_iov->iov_len = count;
1387 init_sync_kiocb(kiocb, file);
1388 kiocb->ki_pos = *ppos;
/* ki_left vs ki_nbytes: kernel-version-dependent field name. */
1389 #ifdef HAVE_KIOCB_KI_LEFT
1390 kiocb->ki_left = count;
1392 kiocb->ki_nbytes = count;
1395 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1396 *ppos = kiocb->ki_pos;
1398 cl_env_put(env, &refcheck);
1403 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: packs the destination pipe and flags into
 * IO_SPLICE args and runs a CIT_READ through the common IO driver.
 */
1405 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1406 struct pipe_inode_info *pipe, size_t count,
1410 struct vvp_io_args *args;
1415 env = cl_env_get(&refcheck);
1417 RETURN(PTR_ERR(env));
1419 args = vvp_env_args(env, IO_SPLICE);
1420 args->u.splice.via_pipe = pipe;
1421 args->u.splice.via_flags = flags;
1423 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1424 cl_env_put(env, &refcheck);
/*
 * Ask the data (OST) export to recreate the objects of this inode's
 * layout on a given OST index.  Builds an obdo carrying the object id,
 * flags OBD_FL_RECREATE_OBJS, duplicates the stripe md, and issues
 * obd_create() under the inode size lock.
 * NOTE(review): obdo allocation and several error-path lines are
 * missing from this chunk — confirm cleanup of 'oa' in the full file.
 */
1428 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1431 struct obd_export *exp = ll_i2dtexp(inode);
1432 struct obd_trans_info oti = { 0 };
1433 struct obdo *oa = NULL;
1436 struct lov_stripe_md *lsm = NULL, *lsm2;
1443 lsm = ccc_inode_lsm_get(inode);
1444 if (!lsm_has_objects(lsm))
1445 GOTO(out, rc = -ENOENT);
/* Size of the lsm copy: header plus one lov_oinfo per stripe. */
1447 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1448 (lsm->lsm_stripe_count));
1450 OBD_ALLOC_LARGE(lsm2, lsm_size);
1452 GOTO(out, rc = -ENOMEM);
/* o_nlink is overloaded here to carry the target OST index. */
1455 oa->o_nlink = ost_idx;
1456 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1457 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1458 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1459 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1460 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1461 memcpy(lsm2, lsm, lsm_size);
1462 ll_inode_size_lock(inode);
1463 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1464 ll_inode_size_unlock(inode);
1466 OBD_FREE_LARGE(lsm2, lsm_size);
1469 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: admin-only.  Copies the user's
 * ll_recreate_obj request and recreates the object by (mdt0) object id
 * on the requested OST index.
 */
1474 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1476 struct ll_recreate_obj ucreat;
1480 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1483 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1487 ostid_set_seq_mdt0(&oi);
1488 ostid_set_id(&oi, ucreat.lrc_id);
1489 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: admin-only.  Converts a user-supplied
 * FID to an ost_id; the OST index is recovered from bits 16-31 of the
 * FID sequence (matching how the fid was composed).
 */
1492 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1499 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1502 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1505 fid_to_ostid(&fid, &oi);
1506 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1507 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply a user-specified striping EA (lov_user_md) to an inode by
 * re-opening it by FID with the layout attached to the open intent.
 * Fails with -EEXIST if the inode already has a layout; on any outcome
 * the delayed-create flag is cleared from the file flags.
 */
1510 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1511 __u64 flags, struct lov_user_md *lum,
1514 struct lov_stripe_md *lsm = NULL;
1515 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1519 lsm = ccc_inode_lsm_get(inode);
1521 ccc_inode_lsm_put(inode, lsm);
1522 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1523 PFID(ll_inode2fid(inode)));
1524 GOTO(out, rc = -EEXIST);
1527 ll_inode_size_lock(inode);
/* Open by FID so the MDT applies the layout to this exact inode. */
1528 oit.it_flags |= MDS_OPEN_BY_FID;
1529 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1531 GOTO(out_unlock, rc);
1532 rc = oit.d.lustre.it_status;
1534 GOTO(out_req_free, rc);
/* The open handle from the intent is not needed; close it again. */
1536 ll_release_openhandle(file->f_dentry, &oit);
1539 ll_inode_size_unlock(inode);
1540 ll_intent_release(&oit);
1541 ccc_inode_lsm_put(inode, lsm);
1543 cl_lov_delay_create_clear(&file->f_flags);
1546 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping info) of @filename (relative to @inode)
 * from the MDS via md_getattr_name().  On success *lmmp points into the
 * reply buffer of *request (caller must keep/finish the request) and
 * *lmm_size is the EA size.  The EA arrives little-endian from the MDS
 * and is byte-swapped to host order for userspace if needed.
 */
1550 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1551 struct lov_mds_md **lmmp, int *lmm_size,
1552 struct ptlrpc_request **request)
1554 struct ll_sb_info *sbi = ll_i2sbi(inode);
1555 struct mdt_body *body;
1556 struct lov_mds_md *lmm = NULL;
1557 struct ptlrpc_request *req = NULL;
1558 struct md_op_data *op_data;
1561 rc = ll_get_default_mdsize(sbi, &lmmsize);
1565 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1566 strlen(filename), lmmsize,
1567 LUSTRE_OPC_ANY, NULL);
1568 if (IS_ERR(op_data))
1569 RETURN(PTR_ERR(op_data));
1571 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1572 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1573 ll_finish_md_op_data(op_data);
1575 CDEBUG(D_INFO, "md_getattr_name failed "
1576 "on %s: rc %d\n", filename, rc);
1580 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1581 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1583 lmmsize = body->mbo_eadatasize;
/* No EA bits valid or empty EA means the object has no striping. */
1585 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1587 GOTO(out, rc = -ENODATA);
1590 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1591 LASSERT(lmm != NULL);
1593 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1594 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1595 GOTO(out, rc = -EPROTO);
1599 * This is coming from the MDS, so is probably in
1600 * little endian. We convert it to host endian before
1601 * passing it to userspace.
/* Only swab on big-endian hosts (where LOV_MAGIC != its LE encoding). */
1603 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1606 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1607 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1610 /* if function called for directory - we should
1611 * avoid swab not existent lsm objects */
1612 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1613 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1614 if (S_ISREG(body->mbo_mode))
1615 lustre_swab_lov_user_md_objects(
1616 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1618 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1619 lustre_swab_lov_user_md_v3(
1620 (struct lov_user_md_v3 *)lmm);
1621 if (S_ISREG(body->mbo_mode))
1622 lustre_swab_lov_user_md_objects(
1623 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1630 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: admin-only.  Copies a lov_user_md (with one
 * trailing ost_data entry) from userspace and applies it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS, i.e. the objects
 * already exist.  The temporary buffer is freed on all paths.
 */
1635 static int ll_lov_setea(struct inode *inode, struct file *file,
1638 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1639 struct lov_user_md *lump;
1640 int lum_size = sizeof(struct lov_user_md) +
1641 sizeof(struct lov_user_ost_data);
1645 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1648 OBD_ALLOC_LARGE(lump, lum_size);
1652 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1653 OBD_FREE_LARGE(lump, lum_size);
1657 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1659 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler.  Reads a v1 lov_user_md first (it is a
 * prefix of v3), upgrades to a full v3 copy if the magic says so, sets
 * the layout, and on success echoes the resulting stripe info back to
 * the user buffer via the OBD getstripe iocontrol.
 */
1663 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1666 struct lov_user_md_v3 lumv3;
1667 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1668 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1669 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1671 __u64 flags = FMODE_WRITE;
1674 /* first try with v1 which is smaller than v3 */
1675 lum_size = sizeof(struct lov_user_md_v1);
1676 if (copy_from_user(lumv1, lumv1p, lum_size))
1679 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1680 lum_size = sizeof(struct lov_user_md_v3);
1681 if (copy_from_user(&lumv3, lumv3p, lum_size))
1685 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1687 struct lov_stripe_md *lsm;
/* Zero the user's stripe_count so getstripe fills in the real layout. */
1690 put_user(0, &lumv1p->lmm_stripe_count);
1692 ll_layout_refresh(inode, &gen);
1693 lsm = ccc_inode_lsm_get(inode);
1694 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1695 0, lsm, (void *)arg);
1696 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: copy the inode's current striping info
 * to the user buffer through the OBD getstripe iocontrol.
 */
1701 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1703 struct lov_stripe_md *lsm;
1707 lsm = ccc_inode_lsm_get(inode);
1709 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1711 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: acquire a group lock with gid @arg on this
 * file.  fd_flags/fd_grouplock are published under lli_lock; because
 * cl_get_grouplock() can block, the flag is re-checked after it returns
 * to lose gracefully to a concurrent locker.
 */
1716 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1718 struct ll_inode_info *lli = ll_i2info(inode);
1719 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1720 struct ccc_grouplock grouplock;
1724 if (ll_file_nolock(file))
1725 RETURN(-EOPNOTSUPP);
1727 spin_lock(&lli->lli_lock);
/* Only one group lock per file descriptor is allowed. */
1728 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1729 CWARN("group lock already existed with gid %lu\n",
1730 fd->fd_grouplock.cg_gid);
1731 spin_unlock(&lli->lli_lock);
1734 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1735 spin_unlock(&lli->lli_lock);
/* May block if another gid holds the lock, unless O_NONBLOCK. */
1737 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1738 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1742 spin_lock(&lli->lli_lock);
1743 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1744 spin_unlock(&lli->lli_lock);
1745 CERROR("another thread just won the race\n");
1746 cl_put_grouplock(&grouplock);
1750 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1751 fd->fd_grouplock = grouplock;
1752 spin_unlock(&lli->lli_lock);
1754 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with gid @arg
 * held by this file descriptor.  The fd state is cleared under
 * lli_lock; the actual lock release happens outside the spinlock.
 */
1758 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1760 struct ll_inode_info *lli = ll_i2info(inode);
1761 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1762 struct ccc_grouplock grouplock;
1765 spin_lock(&lli->lli_lock);
1766 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1767 spin_unlock(&lli->lli_lock);
1768 CWARN("no group lock held\n");
1771 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* The caller must supply the gid it locked with. */
1773 if (fd->fd_grouplock.cg_gid != arg) {
1774 CWARN("group lock %lu doesn't match current id %lu\n",
1775 arg, fd->fd_grouplock.cg_gid);
1776 spin_unlock(&lli->lli_lock);
/* Detach from fd under the spinlock, release after dropping it. */
1780 grouplock = fd->fd_grouplock;
1781 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1782 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1783 spin_unlock(&lli->lli_lock);
1785 cl_put_grouplock(&grouplock);
1786 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1791 * Close inode open handle
1793 * \param dentry [in] dentry which contains the inode
1794 * \param it [in,out] intent which contains open info and result
1797 * \retval <0 failure
1799 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1801 struct inode *inode = dentry->d_inode;
1802 struct obd_client_handle *och;
1808 /* Root ? Do nothing. */
1809 if (dentry->d_inode->i_sb->s_root == dentry)
1812 /* No open handle to close? Move away */
1813 if (!it_disposition(it, DISP_OPEN_OPEN))
1816 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1818 OBD_ALLOC(och, sizeof(*och));
1820 GOTO(out, rc = -ENOMEM);
/* Fill a client handle from the intent, then close it on the MDT. */
1822 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1824 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1827 /* this one is in place of ll_file_open */
1828 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1829 ptlrpc_req_finished(it->d.lustre.it_data);
1830 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1836 * Get size for inode for which FIEMAP mapping is requested.
1837 * Make the FIEMAP get_info call and returns the result.
1839 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1842 struct obd_export *exp = ll_i2dtexp(inode);
1843 struct lov_stripe_md *lsm = NULL;
1844 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1845 __u32 vallen = num_bytes;
1849 /* Checks for fiemap flags */
/* Unsupported flags are stripped into fm_flags and reported back. */
1850 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1851 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1855 /* Check for FIEMAP_FLAG_SYNC */
1856 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1857 rc = filemap_fdatawrite(inode->i_mapping);
1862 lsm = ccc_inode_lsm_get(inode);
1866 /* If the stripe_count > 1 and the application does not understand
1867 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1869 if (lsm->lsm_stripe_count > 1 &&
1870 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1871 GOTO(out, rc = -EOPNOTSUPP);
1873 fm_key.oa.o_oi = lsm->lsm_oi;
1874 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1876 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1877 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1878 /* If filesize is 0, then there would be no objects for mapping */
1879 if (fm_key.oa.o_size == 0) {
1880 fiemap->fm_mapped_extents = 0;
/* The user fiemap request rides inside the get_info key; the reply is
 * written back into the caller's fiemap buffer. */
1884 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1886 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1889 CERROR("obd_get_info failed: rc = %d\n", rc);
1892 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Requires CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 * Reads the fixed-size request header first to learn gf_pathlen, then
 * allocates an output buffer with that much room for the path.
 */
1896 int ll_fid2path(struct inode *inode, void *arg)
1898 struct obd_export *exp = ll_i2mdexp(inode);
1899 struct getinfo_fid2path *gfout, *gfin;
1903 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1904 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1907 /* Need to get the buflen */
1908 OBD_ALLOC_PTR(gfin);
1911 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1916 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1917 OBD_ALLOC(gfout, outsize);
1918 if (gfout == NULL) {
1922 memcpy(gfout, gfin, sizeof(*gfout));
1925 /* Call mdc_iocontrol */
1926 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1930 if (copy_to_user(arg, gfout, outsize))
1934 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size the kernel fiemap buffer from the
 * user-requested extent count, copy the request (and, when continuing a
 * previous mapping, its first extent) in, run ll_do_fiemap(), and copy
 * the header plus mapped extents back out.
 */
1938 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1940 struct ll_user_fiemap *fiemap_s;
1941 size_t num_bytes, ret_bytes;
1942 unsigned int extent_count;
1945 /* Get the extent count so we can calculate the size of
1946 * required fiemap buffer */
1947 if (get_user(extent_count,
1948 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count comes from userspace; the multiplication
 * below has no visible overflow check in this chunk — confirm one
 * exists in the full file. */
1950 num_bytes = sizeof(*fiemap_s) + (extent_count *
1951 sizeof(struct ll_fiemap_extent));
1953 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1954 if (fiemap_s == NULL)
1957 /* get the fiemap value */
1958 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1960 GOTO(error, rc = -EFAULT);
1962 /* If fm_extent_count is non-zero, read the first extent since
1963 * it is used to calculate end_offset and device from previous
1966 if (copy_from_user(&fiemap_s->fm_extents[0],
1967 (char __user *)arg + sizeof(*fiemap_s),
1968 sizeof(struct ll_fiemap_extent)))
1969 GOTO(error, rc = -EFAULT);
1972 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1976 ret_bytes = sizeof(struct ll_user_fiemap);
1978 if (extent_count != 0)
1979 ret_bytes += (fiemap_s->fm_mapped_extents *
1980 sizeof(struct ll_fiemap_extent));
1982 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1986 OBD_FREE_LARGE(fiemap_s, num_bytes);
1991 * Read the data_version for inode.
1993 * This value is computed using stripe object version on OST.
1994 * Version is computed using server side locking.
1996 * @param sync if do sync on the OST side;
1998 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1999 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2001 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2003 struct lov_stripe_md *lsm = NULL;
2004 struct ll_sb_info *sbi = ll_i2sbi(inode);
2005 struct obdo *obdo = NULL;
2009 /* If no stripe, we consider version is 0. */
2010 lsm = ccc_inode_lsm_get(inode);
2011 if (!lsm_has_objects(lsm)) {
2013 CDEBUG(D_INODE, "No object for inode\n");
2017 OBD_ALLOC_PTR(obdo);
2019 GOTO(out, rc = -ENOMEM);
/* Per-stripe getattr on the OSTs; flags select RD/WR flush semantics. */
2021 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
2023 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2026 *data_version = obdo->o_data_version;
2032 ccc_inode_lsm_put(inode, lsm);
2037 * Trigger a HSM release request for the provided inode.
2039 int ll_hsm_release(struct inode *inode)
2041 struct cl_env_nest nest;
2043 struct obd_client_handle *och = NULL;
2044 __u64 data_version = 0;
2048 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2049 ll_get_fsname(inode->i_sb, NULL, 0),
2050 PFID(&ll_i2info(inode)->lli_fid));
/* Take a write lease so no other client can modify the file while it
 * is being released to HSM. */
2052 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2054 GOTO(out, rc = PTR_ERR(och));
2056 /* Grab latest data_version and [am]time values */
2057 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2061 env = cl_env_nested_get(&nest);
2063 GOTO(out, rc = PTR_ERR(env));
2065 ll_merge_lvb(env, inode);
2066 cl_env_nested_put(&nest, env);
2068 /* Release the file.
2069 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2070 * we still need it to pack l_remote_handle to MDT. */
2071 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2077 if (och != NULL && !IS_ERR(och)) /* close the file */
2078 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): saved attrs, the two inodes
 * (possibly reordered by FID), and the per-file data-version checks. */
2083 struct ll_swap_stack {
2084 struct iattr ia1, ia2;
2086 struct inode *inode1, *inode2;
2087 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically swap the layouts
 * of two regular files on the same filesystem.  Orders the pair by FID
 * to avoid lock-ordering deadlocks, optionally takes group locks to
 * flush dirty cache, optionally verifies data versions have not moved,
 * sends the swap to the MDT, and finally restores mtime/atime if the
 * caller asked to keep them.
 */
2090 static int ll_swap_layouts(struct file *file1, struct file *file2,
2091 struct lustre_swap_layouts *lsl)
2093 struct mdc_swap_layouts msl;
2094 struct md_op_data *op_data;
2097 struct ll_swap_stack *llss = NULL;
2100 OBD_ALLOC_PTR(llss);
2104 llss->inode1 = file1->f_dentry->d_inode;
2105 llss->inode2 = file2->f_dentry->d_inode;
2107 if (!S_ISREG(llss->inode2->i_mode))
2108 GOTO(free, rc = -EINVAL);
2110 if (inode_permission(llss->inode1, MAY_WRITE) ||
2111 inode_permission(llss->inode2, MAY_WRITE))
2112 GOTO(free, rc = -EPERM);
2114 if (llss->inode2->i_sb != llss->inode1->i_sb)
2115 GOTO(free, rc = -EXDEV);
2117 /* we use 2 bool because it is easier to swap than 2 bits */
2118 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2119 llss->check_dv1 = true;
2121 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2122 llss->check_dv2 = true;
2124 /* we cannot use lsl->sl_dvX directly because we may swap them */
2125 llss->dv1 = lsl->sl_dv1;
2126 llss->dv2 = lsl->sl_dv2;
2128 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2129 if (rc == 0) /* same file, done! */
/* Canonical ordering by FID: swap the pair (and their dv/check state)
 * so locks are always taken in the same order. */
2132 if (rc < 0) { /* sequentialize it */
2133 swap(llss->inode1, llss->inode2);
2135 swap(llss->dv1, llss->dv2);
2136 swap(llss->check_dv1, llss->check_dv2);
2140 if (gid != 0) { /* application asks to flush dirty cache */
2141 rc = ll_get_grouplock(llss->inode1, file1, gid);
2145 rc = ll_get_grouplock(llss->inode2, file2, gid);
2147 ll_put_grouplock(llss->inode1, file1, gid);
2152 /* to be able to restore mtime and atime after swap
2153 * we need to first save them */
2155 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2156 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2157 llss->ia1.ia_atime = llss->inode1->i_atime;
2158 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2159 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2160 llss->ia2.ia_atime = llss->inode2->i_atime;
2161 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2164 /* ultimate check, before swaping the layouts we check if
2165 * dataversion has changed (if requested) */
2166 if (llss->check_dv1) {
2167 rc = ll_data_version(llss->inode1, &dv, 0);
2170 if (dv != llss->dv1)
2171 GOTO(putgl, rc = -EAGAIN);
2174 if (llss->check_dv2) {
2175 rc = ll_data_version(llss->inode2, &dv, 0);
2178 if (dv != llss->dv2)
2179 GOTO(putgl, rc = -EAGAIN);
2182 /* struct md_op_data is used to send the swap args to the mdt
2183 * only flags is missing, so we use struct mdc_swap_layouts
2184 * through the md_op_data->op_data */
2185 /* flags from user space have to be converted before they are send to
2186 * server, no flag is sent today, they are only used on the client */
2189 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2190 0, LUSTRE_OPC_ANY, &msl)
2191 if (IS_ERR(op_data))
2192 GOTO(free, rc = PTR_ERR(op_data));
2194 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2195 sizeof(*op_data), op_data, NULL);
2196 ll_finish_md_op_data(op_data);
2200 ll_put_grouplock(llss->inode2, file2, gid);
2201 ll_put_grouplock(llss->inode1, file1, gid);
2204 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2208 /* clear useless flags */
2209 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2210 llss->ia1.ia_valid &= ~ATTR_MTIME;
2211 llss->ia2.ia_valid &= ~ATTR_MTIME;
2214 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2215 llss->ia1.ia_valid &= ~ATTR_ATIME;
2216 llss->ia2.ia_valid &= ~ATTR_ATIME;
2219 /* update time if requested */
/* Attributes were swapped along with the layouts: ia2 (saved from
 * inode2) is applied to inode1 and vice versa. */
2221 if (llss->ia2.ia_valid != 0) {
2222 mutex_lock(&llss->inode1->i_mutex);
2223 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2224 mutex_unlock(&llss->inode1->i_mutex);
2227 if (llss->ia1.ia_valid != 0) {
2230 mutex_lock(&llss->inode2->i_mutex);
2231 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2232 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Send an HSM state set/clear request to the MDT for this inode.
 * Non-root callers may only touch flags inside HSM_USER_MASK.
 */
2244 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2246 struct md_op_data *op_data;
2249 /* Non-root users are forbidden to set or clear flags which are
2250 * NOT defined in HSM_USER_MASK. */
2251 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2252 !cfs_capable(CFS_CAP_SYS_ADMIN))
2255 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2256 LUSTRE_OPC_ANY, hss);
2257 if (IS_ERR(op_data))
2258 RETURN(PTR_ERR(op_data));
2260 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2261 sizeof(*op_data), op_data, NULL);
2263 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT: register a file that already exists in the HSM
 * archive.  Marks it ARCHIVED|EXISTS|RELEASED with the given archive
 * id, then forces mode/uid/gid/size/times from the user's
 * hsm_user_import record onto the (released) inode via setattr.
 */
2268 static int ll_hsm_import(struct inode *inode, struct file *file,
2269 struct hsm_user_import *hui)
2271 struct hsm_state_set *hss = NULL;
2272 struct iattr *attr = NULL;
2276 if (!S_ISREG(inode->i_mode))
2282 GOTO(out, rc = -ENOMEM);
2284 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2285 hss->hss_archive_id = hui->hui_archive_id;
2286 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2287 rc = ll_hsm_state_set(inode, hss);
2291 OBD_ALLOC_PTR(attr);
2293 GOTO(out, rc = -ENOMEM);
/* Force S_IFREG and strip everything but permission bits from the
 * user-supplied mode. */
2295 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2296 attr->ia_mode |= S_IFREG;
2297 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2298 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2299 attr->ia_size = hui->hui_size;
2300 attr->ia_mtime.tv_sec = hui->hui_mtime;
2301 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2302 attr->ia_atime.tv_sec = hui->hui_atime;
2303 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2305 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2306 ATTR_UID | ATTR_GID |
2307 ATTR_MTIME | ATTR_MTIME_SET |
2308 ATTR_ATIME | ATTR_ATIME_SET;
2310 rc = ll_setattr_raw(file->f_dentry, attr, true);
/*
 * Main ioctl dispatcher for regular files on a Lustre client.  Handles
 * flag get/set, striping (setstripe/getstripe/setea), layout swap,
 * object recreation, FIEMAP, group locks, FID/path translation, data
 * version, HSM state/action/import, and file leases; anything
 * unrecognized falls through to the dynamic ioctl handlers and finally
 * the data export's obd_iocontrol().
 * NOTE(review): many case labels, RETURNs and cleanup lines are missing
 * from this chunk — control flow below is partial.
 */
2325 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2327 struct inode *inode = file->f_dentry->d_inode;
2328 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2332 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2333 PFID(ll_inode2fid(inode)), inode, cmd);
2334 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2336 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2337 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2341 case LL_IOC_GETFLAGS:
2342 /* Get the current value of the file flags */
2343 return put_user(fd->fd_flags, (int *)arg);
2344 case LL_IOC_SETFLAGS:
2345 case LL_IOC_CLRFLAGS:
2346 /* Set or clear specific file flags */
2347 /* XXX This probably needs checks to ensure the flags are
2348 * not abused, and to handle any flag side effects.
2350 if (get_user(flags, (int *) arg))
2353 if (cmd == LL_IOC_SETFLAGS) {
2354 if ((flags & LL_FILE_IGNORE_LOCK) &&
2355 !(file->f_flags & O_DIRECT)) {
2356 CERROR("%s: unable to disable locking on "
2357 "non-O_DIRECT file\n", current->comm);
2361 fd->fd_flags |= flags;
2363 fd->fd_flags &= ~flags;
2366 case LL_IOC_LOV_SETSTRIPE:
2367 RETURN(ll_lov_setstripe(inode, file, arg));
2368 case LL_IOC_LOV_SETEA:
2369 RETURN(ll_lov_setea(inode, file, arg));
2370 case LL_IOC_LOV_SWAP_LAYOUTS: {
2372 struct lustre_swap_layouts lsl;
2374 if (copy_from_user(&lsl, (char *)arg,
2375 sizeof(struct lustre_swap_layouts)))
/* Both files must be open for writing for a layout swap. */
2378 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2381 file2 = fget(lsl.sl_fd);
2386 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2387 rc = ll_swap_layouts(file, file2, &lsl);
2391 case LL_IOC_LOV_GETSTRIPE:
2392 RETURN(ll_lov_getstripe(inode, arg));
2393 case LL_IOC_RECREATE_OBJ:
2394 RETURN(ll_lov_recreate_obj(inode, arg));
2395 case LL_IOC_RECREATE_FID:
2396 RETURN(ll_lov_recreate_fid(inode, arg));
2397 case FSFILT_IOC_FIEMAP:
2398 RETURN(ll_ioctl_fiemap(inode, arg));
2399 case FSFILT_IOC_GETFLAGS:
2400 case FSFILT_IOC_SETFLAGS:
2401 RETURN(ll_iocontrol(inode, file, cmd, arg));
2402 case FSFILT_IOC_GETVERSION_OLD:
2403 case FSFILT_IOC_GETVERSION:
2404 RETURN(put_user(inode->i_generation, (int *)arg));
2405 case LL_IOC_GROUP_LOCK:
2406 RETURN(ll_get_grouplock(inode, file, arg));
2407 case LL_IOC_GROUP_UNLOCK:
2408 RETURN(ll_put_grouplock(inode, file, arg));
2409 case IOC_OBD_STATFS:
2410 RETURN(ll_obd_statfs(inode, (void *)arg));
2412 /* We need to special case any other ioctls we want to handle,
2413 * to send them to the MDS/OST as appropriate and to properly
2414 * network encode the arg field.
2415 case FSFILT_IOC_SETVERSION_OLD:
2416 case FSFILT_IOC_SETVERSION:
2418 case LL_IOC_FLUSHCTX:
2419 RETURN(ll_flush_ctx(inode));
2420 case LL_IOC_PATH2FID: {
2421 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2422 sizeof(struct lu_fid)))
2427 case OBD_IOC_FID2PATH:
2428 RETURN(ll_fid2path(inode, (void *)arg));
2429 case LL_IOC_DATA_VERSION: {
2430 struct ioc_data_version idv;
2433 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* Only the RD/WR flush bits are honored from userspace. */
2436 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2437 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2439 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2445 case LL_IOC_GET_MDTIDX: {
2448 mdtidx = ll_get_mdt_idx(inode);
2452 if (put_user((int)mdtidx, (int*)arg))
2457 case OBD_IOC_GETDTNAME:
2458 case OBD_IOC_GETMDNAME:
2459 RETURN(ll_get_obd_name(inode, cmd, arg));
2460 case LL_IOC_HSM_STATE_GET: {
2461 struct md_op_data *op_data;
2462 struct hsm_user_state *hus;
2469 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2470 LUSTRE_OPC_ANY, hus);
2471 if (IS_ERR(op_data)) {
2473 RETURN(PTR_ERR(op_data));
2476 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2479 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2482 ll_finish_md_op_data(op_data);
2486 case LL_IOC_HSM_STATE_SET: {
2487 struct hsm_state_set *hss;
2494 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2499 rc = ll_hsm_state_set(inode, hss);
2504 case LL_IOC_HSM_ACTION: {
2505 struct md_op_data *op_data;
2506 struct hsm_current_action *hca;
2513 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2514 LUSTRE_OPC_ANY, hca);
2515 if (IS_ERR(op_data)) {
2517 RETURN(PTR_ERR(op_data));
2520 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2523 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2526 ll_finish_md_op_data(op_data);
2530 case LL_IOC_SET_LEASE: {
2531 struct ll_inode_info *lli = ll_i2info(inode);
2532 struct obd_client_handle *och = NULL;
/* The requested lease mode must be compatible with how the file
 * descriptor itself was opened. */
2538 if (!(file->f_mode & FMODE_WRITE))
2543 if (!(file->f_mode & FMODE_READ))
2548 mutex_lock(&lli->lli_och_mutex);
/* Mode 0: drop an existing lease on this fd, if any. */
2549 if (fd->fd_lease_och != NULL) {
2550 och = fd->fd_lease_och;
2551 fd->fd_lease_och = NULL;
2553 mutex_unlock(&lli->lli_och_mutex);
2556 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2557 rc = ll_lease_close(och, inode, &lease_broken);
2558 if (rc == 0 && lease_broken)
2564 /* return the type of lease or error */
2565 RETURN(rc < 0 ? rc : (int)mode);
2570 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2572 /* apply for lease */
2573 och = ll_lease_open(inode, file, mode, 0);
2575 RETURN(PTR_ERR(och));
2578 mutex_lock(&lli->lli_och_mutex);
2579 if (fd->fd_lease_och == NULL) {
2580 fd->fd_lease_och = och;
2583 mutex_unlock(&lli->lli_och_mutex);
2585 /* impossible now that only excl is supported for now */
2586 ll_lease_close(och, inode, &lease_broken);
2591 case LL_IOC_GET_LEASE: {
2592 struct ll_inode_info *lli = ll_i2info(inode);
2593 struct ldlm_lock *lock = NULL;
2596 mutex_lock(&lli->lli_och_mutex);
2597 if (fd->fd_lease_och != NULL) {
2598 struct obd_client_handle *och = fd->fd_lease_och;
2600 lock = ldlm_handle2lock(&och->och_lease_handle);
2602 lock_res_and_lock(lock);
/* A cancelled lease lock means the lease is gone; report 0. */
2603 if (!ldlm_is_cancel(lock))
2604 rc = och->och_flags &
2605 (FMODE_READ | FMODE_WRITE);
2606 unlock_res_and_lock(lock);
2607 LDLM_LOCK_PUT(lock);
2610 mutex_unlock(&lli->lli_och_mutex);
2613 case LL_IOC_HSM_IMPORT: {
2614 struct hsm_user_import *hui;
2620 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2625 rc = ll_hsm_import(inode, file, hui);
/* Unknown command: try dynamically-registered handlers first, then
 * pass through to the data export. */
2635 ll_iocontrol_call(inode, file, cmd, arg, &err))
2638 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2644 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Commit an lseek result: validate the offset against the file mode
 * and maxsize, then store it in f_pos (resetting f_version, which
 * readdir-style users rely on to detect seeks).
 * NOTE(review): the error RETURNs for the two checks are missing from
 * this chunk.
 */
2645 static inline loff_t
2646 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2648 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2650 if (offset > maxsize)
2653 if (offset != file->f_pos) {
2654 file->f_pos = offset;
2655 file->f_version = 0;
/*
 * Backport of the kernel's generic_file_llseek_size() for kernels
 * without HAVE_FILE_LLSEEK_SIZE: handles SEEK_SET/CUR/END plus
 * SEEK_DATA/SEEK_HOLE against a caller-supplied eof, clamped to
 * maxsize.  SEEK_CUR updates f_pos under i_mutex to avoid racing
 * other seekers.
 */
2661 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2662 loff_t maxsize, loff_t eof)
2664 struct inode *inode = file->f_dentry->d_inode;
2672 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2673 * position-querying operation. Avoid rewriting the "same"
2674 * f_pos value back to the file because a concurrent read(),
2675 * write() or lseek() might have altered it
2680 * f_lock protects against read/modify/write race with other
2681 * SEEK_CURs. Note that parallel writes and reads behave
2684 mutex_lock(&inode->i_mutex);
2685 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2686 mutex_unlock(&inode->i_mutex);
2690 * In the generic case the entire file is data, so as long as
2691 * offset isn't at the end of the file then the offset is data.
2698 * There is a virtual hole at the end of the file, so as long as
2699 * offset isn't i_size or larger, return i_size.
2707 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA the cluster-wide file
 * size must be fetched (glimpse) before seeking.  NOTE(review): this excerpt
 * elides some original lines; comments cover visible code only.
 */
2711 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2713 struct inode *inode = file->f_dentry->d_inode;
2714 loff_t retval, eof = 0;
/* Compute the target position only for the debug message below. */
2717 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2718 (origin == SEEK_CUR) ? file->f_pos : 0);
2719 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2720 PFID(ll_inode2fid(inode)), inode, retval, retval,
2722 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* These origins depend on up-to-date size: refresh it from the OSTs first. */
2724 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2725 retval = ll_glimpse_size(inode);
2728 eof = i_size_read(inode);
2731 retval = ll_generic_file_llseek_size(file, offset, origin,
2732 ll_file_maxbytes(inode), eof);
/*
 * flush(2) handler: report any asynchronous writeback error recorded for this
 * inode exactly once per file descriptor.  NOTE(review): this excerpt elides
 * some original lines; comments cover visible code only.
 */
2736 static int ll_flush(struct file *file, fl_owner_t id)
2738 struct inode *inode = file->f_dentry->d_inode;
2739 struct ll_inode_info *lli = ll_i2info(inode);
2740 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2743 LASSERT(!S_ISDIR(inode->i_mode));
2745 /* catch async errors that were recorded back when async writeback
2746 * failed for pages in this mapping. */
/* Consume (read-and-clear) the MDC-side async rc... */
2747 rc = lli->lli_async_rc;
2748 lli->lli_async_rc = 0;
/* ...and the OSC-side async rc stored in the cl_object, if any. */
2749 if (lli->lli_clob != NULL) {
2750 err = lov_read_and_clear_async_rc(lli->lli_clob);
2755 /* The application has been told write failure already.
2756 * Do not report failure again. */
2757 if (fd->fd_write_failed)
/* Any recorded error is collapsed to -EIO for the caller. */
2759 return rc ? -EIO : 0;
2763 * Called to make sure a portion of file has been written out.
2764 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2766 * Return how many pages have been written.
/* NOTE(review): this excerpt elides some original lines (declarations,
 * error paths, RETURN); comments below cover visible code only. */
2768 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2769 enum cl_fsync_mode mode, int ignore_layout)
2771 struct cl_env_nest nest;
2774 struct obd_capa *capa = NULL;
2775 struct cl_fsync_io *fio;
/* Reject any mode outside the four supported fsync modes. */
2779 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2780 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
/* Grab a nested cl environment (this path may be re-entered). */
2783 env = cl_env_nested_get(&nest);
2785 RETURN(PTR_ERR(env));
2787 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2789 io = ccc_env_thread_io(env);
2790 io->ci_obj = cl_i2info(inode)->lli_clob;
2791 io->ci_ignore_layout = ignore_layout;
2793 /* initialize parameters for sync */
2794 fio = &io->u.ci_fsync;
2795 fio->fi_capa = capa;
2796 fio->fi_start = start;
2798 fio->fi_fid = ll_inode2fid(inode);
2799 fio->fi_mode = mode;
2800 fio->fi_nr_written = 0;
/* Run the CIT_FSYNC io loop; on success report pages written. */
2802 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2803 result = cl_io_loop(env, io);
2805 result = io->ci_result;
2807 result = fio->fi_nr_written;
2808 cl_io_fini(env, io);
2809 cl_env_nested_put(&nest, env);
2817 * When dentry is provided (the 'else' case), *file->f_dentry may be
2818 * null and dentry must be used directly rather than pulled from
2819 * *file->f_dentry as is done otherwise.
/* fsync(2) handler; the #ifdef ladder adapts to the three historical kernel
 * fsync prototypes.  NOTE(review): this excerpt elides some original lines;
 * comments cover visible code only. */
2822 #ifdef HAVE_FILE_FSYNC_4ARGS
2823 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2825 struct dentry *dentry = file->f_dentry;
2826 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2827 int ll_fsync(struct file *file, int datasync)
2829 struct dentry *dentry = file->f_dentry;
/* Older prototypes sync the whole file. */
2831 loff_t end = LLONG_MAX;
2833 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2836 loff_t end = LLONG_MAX;
2838 struct inode *inode = dentry->d_inode;
2839 struct ll_inode_info *lli = ll_i2info(inode);
2840 struct ptlrpc_request *req;
2841 struct obd_capa *oc;
2845 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2846 PFID(ll_inode2fid(inode)), inode);
2847 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2849 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg kernels call us without i_mutex held: flush the range and lock. */
2850 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2851 mutex_lock(&inode->i_mutex);
2853 /* fsync's caller has already called _fdata{sync,write}, we want
2854 * that IO to finish before calling the osc and mdc sync methods */
2855 rc = filemap_fdatawait(inode->i_mapping);
2858 /* catch async errors that were recorded back when async writeback
2859 * failed for pages in this mapping. */
2860 if (!S_ISDIR(inode->i_mode)) {
2861 err = lli->lli_async_rc;
2862 lli->lli_async_rc = 0;
2865 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDT. */
2870 oc = ll_mdscapa_get(inode);
2871 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2877 ptlrpc_req_finished(req);
/* Sync data on the OSTs for regular files and track write failure state. */
2879 if (S_ISREG(inode->i_mode)) {
2880 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2882 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2883 if (rc == 0 && err < 0)
2886 fd->fd_write_failed = true;
2888 fd->fd_write_failed = false;
2891 #ifdef HAVE_FILE_FSYNC_4ARGS
2892 mutex_unlock(&inode->i_mutex);
/*
 * flock(2)/fcntl(2) byte-range lock handler: translate the VFS file_lock into
 * an LDLM flock enqueue on the MDS, then mirror the result into the local
 * lock tables.  NOTE(review): this excerpt elides some original lines
 * (switch labels, RETURNs); comments cover visible code only.
 */
2898 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2900 struct inode *inode = file->f_dentry->d_inode;
2901 struct ll_sb_info *sbi = ll_i2sbi(inode);
2902 struct ldlm_enqueue_info einfo = {
2903 .ei_type = LDLM_FLOCK,
2904 .ei_cb_cp = ldlm_flock_completion_ast,
2905 .ei_cbdata = file_lock,
2907 struct md_op_data *op_data;
2908 struct lustre_handle lockh = {0};
2909 ldlm_policy_data_t flock = {{0}};
/* Save caller's lock type: einfo.ei_mode is written over fl_type below. */
2910 int fl_type = file_lock->fl_type;
2916 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2917 PFID(ll_inode2fid(inode)), file_lock);
2919 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2921 if (file_lock->fl_flags & FL_FLOCK) {
2922 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2923 /* flocks are whole-file locks */
2924 flock.l_flock.end = OFFSET_MAX;
2925 /* For flocks owner is determined by the local file desctiptor*/
2926 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2927 } else if (file_lock->fl_flags & FL_POSIX) {
2928 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2929 flock.l_flock.start = file_lock->fl_start;
2930 flock.l_flock.end = file_lock->fl_end;
2934 flock.l_flock.pid = file_lock->fl_pid;
2936 /* Somewhat ugly workaround for svc lockd.
2937 * lockd installs custom fl_lmops->lm_compare_owner that checks
2938 * for the fl_owner to be the same (which it always is on local node
2939 * I guess between lockd processes) and then compares pid.
2940 * As such we assign pid to the owner field to make it all work,
2941 * conflict with normal locks is unlikely since pid space and
2942 * pointer space for current->files are not intersecting */
2943 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2944 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map fl_type to an LDLM mode: read lock -> PR... */
2948 einfo.ei_mode = LCK_PR;
2951 /* An unlock request may or may not have any relation to
2952 * existing locks so we may not be able to pass a lock handle
2953 * via a normal ldlm_lock_cancel() request. The request may even
2954 * unlock a byte range in the middle of an existing lock. In
2955 * order to process an unlock request we need all of the same
2956 * information that is given with a normal read or write record
2957 * lock request. To avoid creating another ldlm unlock (cancel)
2958 * message we'll treat a LCK_NL flock request as an unlock. */
2959 einfo.ei_mode = LCK_NL;
/* ...write lock -> PW. */
2962 einfo.ei_mode = LCK_PW;
2965 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map the command to enqueue flags: non-blocking set vs. test. */
2980 flags = LDLM_FL_BLOCK_NOWAIT;
2986 flags = LDLM_FL_TEST_LOCK;
2989 CERROR("unknown fcntl lock command: %d\n", cmd);
2993 /* Save the old mode so that if the mode in the lock changes we
2994 * can decrement the appropriate reader or writer refcount. */
2995 file_lock->fl_type = einfo.ei_mode;
2997 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2998 LUSTRE_OPC_ANY, NULL);
2999 if (IS_ERR(op_data))
3000 RETURN(PTR_ERR(op_data));
3002 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3003 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3004 flock.l_flock.pid, flags, einfo.ei_mode,
3005 flock.l_flock.start, flock.l_flock.end);
/* Enqueue the flock on the MDS. */
3007 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3010 /* Restore the file lock type if not TEST lock. */
3011 if (!(flags & LDLM_FL_TEST_LOCK))
3012 file_lock->fl_type = fl_type;
/* Mirror the server's decision into the kernel's local lock lists. */
3014 if ((file_lock->fl_flags & FL_FLOCK) &&
3015 (rc == 0 || file_lock->fl_type == F_UNLCK))
3016 rc2 = flock_lock_file_wait(file, file_lock);
3017 if ((file_lock->fl_flags & FL_POSIX) &&
3018 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3019 !(flags & LDLM_FL_TEST_LOCK))
3020 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed: undo the server-side lock with an NL enqueue. */
3022 if (rc2 && file_lock->fl_type != F_UNLCK) {
3023 einfo.ei_mode = LCK_NL;
3024 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3029 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via a getattr-by-name RPC to the
 * MDS.  NOTE(review): this excerpt elides some original lines (error checks,
 * RETURN); comments cover visible code only.
 */
3034 int ll_get_fid_by_name(struct inode *parent, const char *name,
3035 int namelen, struct lu_fid *fid)
3037 struct md_op_data *op_data = NULL;
3038 struct mdt_body *body;
3039 struct ptlrpc_request *req;
3043 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3044 LUSTRE_OPC_ANY, NULL);
3045 if (IS_ERR(op_data))
3046 RETURN(PTR_ERR(op_data));
/* Only the FID is wanted from this getattr. */
3048 op_data->op_valid = OBD_MD_FLID;
3049 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3050 ll_finish_md_op_data(op_data);
/* Extract the mdt_body from the reply and copy out its FID. */
3054 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3056 GOTO(out_req, rc = -EFAULT);
3058 *fid = body->mbo_fid1;
3060 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx, implemented as
 * a rename onto itself with CLI_MIGRATE set.  NOTE(review): this excerpt
 * elides some original lines (qstr setup, error paths, iput); comments cover
 * visible code only.
 */
3064 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3065 const char *name, int namelen)
3067 struct dentry *dchild = NULL;
3068 struct inode *child_inode = NULL;
3069 struct md_op_data *op_data;
3070 struct ptlrpc_request *request = NULL;
3075 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3076 name, PFID(ll_inode2fid(parent)), mdtidx);
3078 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3079 0, LUSTRE_OPC_ANY, NULL);
3080 if (IS_ERR(op_data))
3081 RETURN(PTR_ERR(op_data));
3083 /* Get child FID first */
3084 qstr.hash = full_name_hash(name, namelen);
/* Prefer a cached dentry to get the child FID without an RPC. */
3087 dchild = d_lookup(file->f_dentry, &qstr);
3088 if (dchild != NULL && dchild->d_inode != NULL) {
3089 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3090 if (dchild->d_inode != NULL) {
/* Hold the child inode and drop its aliases before migrating it. */
3091 child_inode = igrab(dchild->d_inode);
3092 ll_invalidate_aliases(child_inode);
/* Dcache miss: ask the MDS for the FID by name. */
3096 rc = ll_get_fid_by_name(parent, name, namelen,
3102 if (!fid_is_sane(&op_data->op_fid3)) {
3103 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3104 ll_get_fsname(parent->i_sb, NULL, 0), name,
3105 PFID(&op_data->op_fid3));
3106 GOTO(out_free, rc = -EINVAL);
/* Nothing to do if the entry already lives on the target MDT. */
3109 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3114 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3115 PFID(&op_data->op_fid3), mdtidx);
3116 GOTO(out_free, rc = 0);
/* The migration itself: a same-name rename flagged CLI_MIGRATE. */
3119 op_data->op_mds = mdtidx;
3120 op_data->op_cli_flags = CLI_MIGRATE;
3121 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3122 namelen, name, namelen, &request);
3124 ll_update_times(request, parent);
3126 ptlrpc_req_finished(request);
/* Drop the now-stale local child inode. */
3131 if (child_inode != NULL) {
3132 clear_nlink(child_inode);
3136 ll_finish_md_op_data(op_data);
3141 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3149 * test if some locks matching bits and l_req_mode are acquired
3150 * - bits can be in different locks
3151 * - if found clear the common lock bits in *bits
3152 * - the bits not found, are kept in *bits
3154 * \param bits [IN] searched lock bits [IN]
3155 * \param l_req_mode [IN] searched lock mode
3156 * \retval boolean, true iff all bits are found
/* NOTE(review): this excerpt elides some original lines; comments below
 * cover visible code only. */
3158 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3160 struct lustre_handle lockh;
3161 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match all four grantable modes. */
3162 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3163 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3172 fid = &ll_i2info(inode)->lli_fid;
3173 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3174 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the lock. */
3176 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit individually and clear the ones found. */
3177 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3178 policy.l_inodebits.bits = *bits & (1 << i);
3179 if (policy.l_inodebits.bits == 0)
3182 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3183 &policy, mode, &lockh)) {
3184 struct ldlm_lock *lock;
3186 lock = ldlm_handle2lock(&lockh);
3189 ~(lock->l_policy_data.l_inodebits.bits);
3190 LDLM_LOCK_PUT(lock);
3192 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and reference) an existing granted MD lock covering @bits;
 * returns the matched mode (0 if none) with the handle in @lockh.
 * NOTE(review): this excerpt elides some original lines; comments cover
 * visible code only.
 */
3199 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3200 struct lustre_handle *lockh, __u64 flags,
3203 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3208 fid = &ll_i2info(inode)->lli_fid;
3209 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3211 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3212 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process a revalidation RPC result: -ENOENT on a non-regular,
 * non-directory inode is treated as benign (object already unlinked);
 * other errors are logged.  NOTE(review): this excerpt elides some
 * original lines; comments cover visible code only.
 */
3217 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3219 /* Already unlinked. Just update nlink and return success */
3220 if (rc == -ENOENT) {
3222 /* This path cannot be hit for regular files unless in
3223 * case of obscure races, so no need to to validate
3225 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3227 } else if (rc != 0) {
/* Expected lock races get D_INFO; anything else is a real error. */
3228 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3229 "%s: revalidate FID "DFID" error: rc = %d\n",
3230 ll_get_fsname(inode->i_sb, NULL, 0),
3231 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate a dentry/inode against the MDS, either via an intent lock
 * (IT_GETATTR/IT_LOOKUP) when the server supports ATTRFID, or via a plain
 * getattr when no matching MD lock is cached.  NOTE(review): this excerpt
 * elides some original lines (error paths, RETURN); comments cover visible
 * code only.
 */
3237 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3239 struct inode *inode = dentry->d_inode;
3240 struct ptlrpc_request *req = NULL;
3241 struct obd_export *exp;
3245 LASSERT(inode != NULL);
3247 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3248 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3250 exp = ll_i2mdexp(inode);
3252 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3253 * But under CMD case, it caused some lock issues, should be fixed
3254 * with new CMD ibits lock. See bug 12718 */
3255 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3256 struct lookup_intent oit = { .it_op = IT_GETATTR };
3257 struct md_op_data *op_data;
/* A lookup-only revalidation can use the lighter IT_LOOKUP intent. */
3259 if (ibits == MDS_INODELOCK_LOOKUP)
3260 oit.it_op = IT_LOOKUP;
3262 /* Call getattr by fid, so do not provide name at all. */
3263 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3264 dentry->d_inode, NULL, 0, 0,
3265 LUSTRE_OPC_ANY, NULL);
3266 if (IS_ERR(op_data))
3267 RETURN(PTR_ERR(op_data));
3269 rc = md_intent_lock(exp, op_data, &oit, &req,
3270 &ll_md_blocking_ast, 0);
3271 ll_finish_md_op_data(op_data);
3273 rc = ll_inode_revalidate_fini(inode, rc);
3277 rc = ll_revalidate_it_finish(req, &oit, dentry);
3279 ll_intent_release(&oit);
3283 /* Unlinked? Unhash dentry, so it is not picked up later by
3284 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3285 here to preserve get_cwd functionality on 2.6.
3287 if (!dentry->d_inode->i_nlink)
3288 d_lustre_invalidate(dentry, 0);
3290 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID: only RPC if we do not already hold the needed MD lock. */
3291 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3292 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3293 obd_valid valid = OBD_MD_FLGETATTR;
3294 struct md_op_data *op_data;
/* Regular files also need the striping EA sized into the reply. */
3297 if (S_ISREG(inode->i_mode)) {
3298 rc = ll_get_default_mdsize(sbi, &ealen);
3301 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3304 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3305 0, ealen, LUSTRE_OPC_ANY,
3307 if (IS_ERR(op_data))
3308 RETURN(PTR_ERR(op_data));
3310 op_data->op_valid = valid;
3311 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3312 * capa for this inode. Because we only keep capas of dirs
3314 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3315 ll_finish_md_op_data(op_data);
3317 rc = ll_inode_revalidate_fini(inode, rc);
/* Refresh the in-core inode from the getattr reply. */
3321 rc = ll_prep_inode(&inode, req, NULL, NULL);
3324 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge per-stripe attributes from all MDTs into
 * the inode's cached size/nlink/times.  NOTE(review): this excerpt elides
 * some original lines; comments cover visible code only.
 */
3328 static int ll_merge_md_attr(struct inode *inode)
3330 struct cl_attr attr = { 0 };
3333 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3334 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
/* Cache merged results; ll_getattr reads these for striped dirs. */
3339 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3340 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3342 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3343 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3344 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MD attributes, then (for regular files) glimpse
 * the size from the OSTs; for striped directories merge per-stripe attrs.
 * NOTE(review): this excerpt elides some original lines; comments cover
 * visible code only.
 */
3350 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3352 struct inode *inode = dentry->d_inode;
3356 rc = __ll_inode_revalidate(dentry, ibits)
3360 /* if object isn't regular file, don't validate size */
3361 if (!S_ISREG(inode->i_mode)) {
3362 if (S_ISDIR(inode->i_mode) &&
3363 ll_i2info(inode)->lli_lsm_md != NULL) {
3364 rc = ll_merge_md_attr(inode);
/* Non-regular: copy cached LVB times into the inode and return. */
3369 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3370 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3371 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3373 /* In case of restore, the MDT has the right size and has
3374 * already send it back without granting the layout lock,
3375 * inode is up-to-date so glimpse is useless.
3376 * Also to glimpse we need the layout, in case of a running
3377 * restore the MDT holds the layout lock so the glimpse will
3378 * block up to the end of restore (getattr will block)
3380 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3381 rc = ll_glimpse_size(inode);
/*
 * getattr(2) handler: revalidate UPDATE|LOOKUP bits then fill *stat from the
 * in-core inode.  NOTE(review): this excerpt elides some original lines;
 * comments cover visible code only.
 */
3386 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3388 struct inode *inode = de->d_inode;
3389 struct ll_sb_info *sbi = ll_i2sbi(inode);
3390 struct ll_inode_info *lli = ll_i2info(inode);
3393 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3394 MDS_INODELOCK_LOOKUP);
3395 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3400 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace needs an ino that fits in 32 bits, built from the FID. */
3401 if (ll_need_32bit_api(sbi))
3402 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3404 stat->ino = inode->i_ino;
3405 stat->mode = inode->i_mode;
3406 stat->uid = inode->i_uid;
3407 stat->gid = inode->i_gid;
3408 stat->rdev = inode->i_rdev;
3409 stat->atime = inode->i_atime;
3410 stat->mtime = inode->i_mtime;
3411 stat->ctime = inode->i_ctime;
3412 stat->blksize = 1 << inode->i_blkbits;
3413 stat->blocks = inode->i_blocks;
/* Striped directories report the merged cross-MDT nlink/size. */
3415 if (S_ISDIR(inode->i_mode) &&
3416 ll_i2info(inode)->lli_lsm_md != NULL) {
3417 stat->nlink = lli->lli_stripe_dir_nlink;
3418 stat->size = lli->lli_stripe_dir_size;
3420 stat->nlink = inode->i_nlink;
3421 stat->size = i_size_read(inode);
/*
 * fiemap handler: marshal the kernel's fiemap_extent_info into a Lustre
 * ll_user_fiemap buffer, run ll_do_fiemap, and copy the extents back.
 * NOTE(review): this excerpt elides some original lines (alloc failure
 * check, RETURN); comments cover visible code only.
 */
3427 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3428 __u64 start, __u64 len)
3432 struct ll_user_fiemap *fiemap;
3433 unsigned int extent_count = fieinfo->fi_extents_max;
/* One header plus room for all requested extents. */
3435 num_bytes = sizeof(*fiemap) + (extent_count *
3436 sizeof(struct ll_fiemap_extent));
3437 OBD_ALLOC_LARGE(fiemap, num_bytes);
3442 fiemap->fm_flags = fieinfo->fi_flags;
3443 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3444 fiemap->fm_start = start;
3445 fiemap->fm_length = len;
/* Seed with the first user extent (used to resume a previous mapping). */
3446 if (extent_count > 0)
3447 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3448 sizeof(struct ll_fiemap_extent))
3450 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Copy results (flags, mapped count, extent array) back to the caller. */
3452 fieinfo->fi_flags = fiemap->fm_flags;
3453 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3454 if (extent_count > 0)
3455 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3456 fiemap->fm_mapped_extents *
3457 sizeof(struct ll_fiemap_extent));
3459 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode.
 * NOTE(review): this excerpt elides some original lines; comments cover
 * visible code only.
 */
3463 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3465 struct ll_inode_info *lli = ll_i2info(inode);
3466 struct posix_acl *acl = NULL;
/* lli_lock guards lli_posix_acl against concurrent update. */
3469 spin_lock(&lli->lli_lock);
3470 /* VFS' acl_permission_check->check_acl will release the refcount */
3471 acl = posix_acl_dup(lli->lli_posix_acl);
3472 spin_unlock(&lli->lli_lock);
/*
 * ACL permission callback for kernels whose generic_permission() takes a
 * check_acl hook.  NOTE(review): this excerpt elides some original lines
 * (returns, -EAGAIN paths); comments cover visible code only.
 */
3477 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3479 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3480 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3482 ll_check_acl(struct inode *inode, int mask)
3485 # ifdef CONFIG_FS_POSIX_ACL
3486 struct posix_acl *acl;
/* RCU-walk cannot sleep; bail out under IPERM_FLAG_RCU. */
3490 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3491 if (flags & IPERM_FLAG_RCU)
3494 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3499 rc = posix_acl_permission(inode, acl, mask);
3500 posix_acl_release(acl);
3503 # else /* !CONFIG_FS_POSIX_ACL */
3505 # endif /* CONFIG_FS_POSIX_ACL */
3507 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission(2) handler: revalidate the root inode if needed, apply root
 * squashing by temporarily overriding credentials, then delegate to remote
 * or generic permission checking.  NOTE(review): this excerpt elides some
 * original lines; comments cover visible code only.
 */
3509 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3510 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3512 # ifdef HAVE_INODE_PERMISION_2ARGS
3513 int ll_inode_permission(struct inode *inode, int mask)
3515 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3520 struct ll_sb_info *sbi;
3521 struct root_squash_info *squash;
3522 struct cred *cred = NULL;
3523 const struct cred *old_cred = NULL;
3525 bool squash_id = false;
/* RCU-walk path may not block; defer to ref-walk. */
3528 #ifdef MAY_NOT_BLOCK
3529 if (mask & MAY_NOT_BLOCK)
3531 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3532 if (flags & IPERM_FLAG_RCU)
3536 /* as root inode are NOT getting validated in lookup operation,
3537 * need to do it before permission check. */
3539 if (inode == inode->i_sb->s_root->d_inode) {
3540 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3541 MDS_INODELOCK_LOOKUP);
3546 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3547 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3549 /* squash fsuid/fsgid if needed */
3550 sbi = ll_i2sbi(inode);
3551 squash = &sbi->ll_squash;
/* Squash only when configured, caller is root, and nosquash is not set. */
3552 if (unlikely(squash->rsi_uid != 0 &&
3553 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3554 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3558 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3559 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3560 squash->rsi_uid, squash->rsi_gid);
3562 /* update current process's credentials
3563 * and FS capability */
3564 cred = prepare_creds();
3568 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3569 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop all filesystem-related capabilities for the squashed identity. */
3570 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3571 if ((1 << cap) & CFS_CAP_FS_MASK)
3572 cap_lower(cred->cap_effective, cap);
3574 old_cred = override_creds(cred);
3577 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3579 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3580 rc = lustre_check_remote_perm(inode, mask);
3582 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3584 /* restore current process's credentials and FS capability */
3586 revert_creds(old_cred);
3593 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock entries are visible here, so
 * flock falls back to the kernel's local implementation.  NOTE(review):
 * some original lines of this table are elided from this excerpt. */
3594 struct file_operations ll_file_operations = {
3595 .read = ll_file_read,
3596 .aio_read = ll_file_aio_read,
3597 .write = ll_file_write,
3598 .aio_write = ll_file_aio_write,
3599 .unlocked_ioctl = ll_file_ioctl,
3600 .open = ll_file_open,
3601 .release = ll_file_release,
3602 .mmap = ll_file_mmap,
3603 .llseek = ll_file_seek,
3604 .splice_read = ll_file_splice_read,
/* file_operations for -o flock mounts: identical to the default table but
 * routes flock(2)/fcntl(2) locks through ll_file_flock for cluster-wide
 * consistency.  NOTE(review): some lines of this table are elided. */
3609 struct file_operations ll_file_operations_flock = {
3610 .read = ll_file_read,
3611 .aio_read = ll_file_aio_read,
3612 .write = ll_file_write,
3613 .aio_write = ll_file_aio_write,
3614 .unlocked_ioctl = ll_file_ioctl,
3615 .open = ll_file_open,
3616 .release = ll_file_release,
3617 .mmap = ll_file_mmap,
3618 .llseek = ll_file_seek,
3619 .splice_read = ll_file_splice_read,
3622 .flock = ll_file_flock,
3623 .lock = ll_file_flock
3626 /* These are for -o noflock - to return ENOSYS on flock calls */
/* NOTE(review): some lines of this table are elided from this excerpt. */
3627 struct file_operations ll_file_operations_noflock = {
3628 .read = ll_file_read,
3629 .aio_read = ll_file_aio_read,
3630 .write = ll_file_write,
3631 .aio_write = ll_file_aio_write,
3632 .unlocked_ioctl = ll_file_ioctl,
3633 .open = ll_file_open,
3634 .release = ll_file_release,
3635 .mmap = ll_file_mmap,
3636 .llseek = ll_file_seek,
3637 .splice_read = ll_file_splice_read,
3640 .flock = ll_file_noflock,
3641 .lock = ll_file_noflock
/* inode_operations for regular files.  NOTE(review): some lines of this
 * table are elided from this excerpt. */
3644 struct inode_operations ll_file_inode_operations = {
3645 .setattr = ll_setattr,
3646 .getattr = ll_getattr,
3647 .permission = ll_inode_permission,
3648 .setxattr = ll_setxattr,
3649 .getxattr = ll_getxattr,
3650 .listxattr = ll_listxattr,
3651 .removexattr = ll_removexattr,
3652 .fiemap = ll_fiemap,
/* .get_acl only exists on kernels with the i_op->get_acl hook. */
3653 #ifdef HAVE_IOP_GET_ACL
3654 .get_acl = ll_get_acl,
3658 /* dynamic ioctl number support routins */
/* Registry of dynamically registered ioctl handlers: an rwsem-protected
 * list of llioc_data entries.  NOTE(review): some lines (the variable name
 * and initializer braces) are elided from this excerpt. */
3659 static struct llioc_ctl_data {
3660 struct rw_semaphore ioc_sem;
3661 struct list_head ioc_head;
3663 __RWSEM_INITIALIZER(llioc.ioc_sem),
3664 LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback plus a flexible array of handled cmd numbers. */
3669 struct list_head iocd_list;
3670 unsigned int iocd_size;
3671 llioc_callback_t iocd_cb;
3672 unsigned int iocd_count;
3673 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler for @count command numbers; the returned
 * pointer is the opaque handle passed to ll_iocontrol_unregister().
 * NOTE(review): this excerpt elides some original lines (NULL returns);
 * comments cover visible code only.
 */
3676 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3679 struct llioc_data *in_data = NULL;
3682 if (cb == NULL || cmd == NULL ||
3683 count > LLIOC_MAX_CMD || count < 0)
/* Size covers the struct plus the trailing cmd array. */
3686 size = sizeof(*in_data) + count * sizeof(unsigned int);
3687 OBD_ALLOC(in_data, size);
3688 if (in_data == NULL)
3691 memset(in_data, 0, sizeof(*in_data));
3692 in_data->iocd_size = size;
3693 in_data->iocd_cb = cb;
3694 in_data->iocd_count = count;
3695 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the write lock. */
3697 down_write(&llioc.ioc_sem);
3698 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3699 up_write(&llioc.ioc_sem);
/*
 * Remove and free a registration previously returned by
 * ll_iocontrol_register().  NOTE(review): this excerpt elides some original
 * lines (the magic comparison, early return); comments cover visible code
 * only.
 */
3704 void ll_iocontrol_unregister(void *magic)
3706 struct llioc_data *tmp;
3711 down_write(&llioc.ioc_sem);
3712 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3714 unsigned int size = tmp->iocd_size;
/* Unlink and free the matching entry, then return. */
3716 list_del(&tmp->iocd_list);
3717 up_write(&llioc.ioc_sem);
3719 OBD_FREE(tmp, size);
3723 up_write(&llioc.ioc_sem);
3725 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3728 EXPORT_SYMBOL(ll_iocontrol_register);
3729 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl to any registered dynamic handler; the
 * handler's verdict is returned and its rc stored via *rcp.  NOTE(review):
 * this excerpt elides some original lines; comments cover visible code only.
 */
3731 static enum llioc_iter
3732 ll_iocontrol_call(struct inode *inode, struct file *file,
3733 unsigned int cmd, unsigned long arg, int *rcp)
3735 enum llioc_iter ret = LLIOC_CONT;
3736 struct llioc_data *data;
3737 int rc = -EINVAL, i;
/* Walk all registrations under the read lock, matching cmd numbers. */
3739 down_read(&llioc.ioc_sem);
3740 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3741 for (i = 0; i < data->iocd_count; i++) {
3742 if (cmd != data->iocd_cmd[i])
3745 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
/* First handler that claims the ioctl terminates the search. */
3749 if (ret == LLIOC_STOP)
3752 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object and, for OBJECT_CONF_SET,
 * enable lock matching and record the new layout generation.  NOTE(review):
 * this excerpt elides some original lines (RETURNs, braces); comments cover
 * visible code only.
 */
3759 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3761 struct ll_inode_info *lli = ll_i2info(inode);
3762 struct cl_env_nest nest;
3767 if (lli->lli_clob == NULL)
3770 env = cl_env_nested_get(&nest);
3772 RETURN(PTR_ERR(env));
3774 result = cl_conf_set(env, lli->lli_clob, conf);
3775 cl_env_nested_put(&nest, env);
3777 if (conf->coc_opc == OBJECT_CONF_SET) {
3778 struct ldlm_lock *lock = conf->coc_lock;
3780 LASSERT(lock != NULL);
3781 LASSERT(ldlm_has_layout(lock));
3783 struct lustre_md *md = conf->u.coc_md;
3784 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3786 /* it can only be allowed to match after layout is
3787 * applied to inode otherwise false layout would be
3788 * seen. Applying layout shoud happen before dropping
3789 * the intent lock. */
3790 ldlm_lock_allow_match(lock);
/* Record whether a striping exists and the layout generation. */
3792 lli->lli_has_smd = lsm_has_objects(md->lsm);
3793 if (md->lsm != NULL)
3794 gen = md->lsm->lsm_layout_gen;
3797 DFID ": layout version change: %u -> %u\n",
3798 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3800 ll_layout_version_set(lli, gen);
3806 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/* NOTE(review): this excerpt elides some original lines (declarations,
 * error checks, RETURN); comments below cover visible code only. */
3807 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3810 struct ll_sb_info *sbi = ll_i2sbi(inode);
3811 struct obd_capa *oc;
3812 struct ptlrpc_request *req;
3813 struct mdt_body *body;
3820 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3821 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3822 lock->l_lvb_data, lock->l_lvb_len);
/* Nothing to do if the lock already carries a ready LVB. */
3824 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3827 /* if layout lock was granted right away, the layout is returned
3828 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3829 * blocked and then granted via completion ast, we have to fetch
3830 * layout here. Please note that we can't use the LVB buffer in
3831 * completion AST because it doesn't have a large enough buffer */
3832 oc = ll_mdscapa_get(inode);
3833 rc = ll_get_default_mdsize(sbi, &lmmsize);
/* Fetch the LOV EA from the MDS via getxattr. */
3835 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3836 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3842 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3844 GOTO(out, rc = -EPROTO);
3846 lmmsize = body->mbo_eadatasize;
3847 if (lmmsize == 0) /* empty layout */
3850 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3852 GOTO(out, rc = -EFAULT);
/* Stash a private copy of the layout into the lock's LVB, replacing any
 * previous buffer under the resource lock. */
3854 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3855 if (lvbdata == NULL)
3856 GOTO(out, rc = -ENOMEM);
3858 memcpy(lvbdata, lmm, lmmsize);
3859 lock_res_and_lock(lock);
3860 if (lock->l_lvb_data != NULL)
3861 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3863 lock->l_lvb_data = lvbdata;
3864 lock->l_lvb_len = lmmsize;
3865 unlock_res_and_lock(lock);
3870 ptlrpc_req_finished(req);
3875 * Apply the layout to the inode. Layout lock is held and will be released
/* NOTE(review): this excerpt elides some original lines (labels, GOTOs,
 * RETURN); comments below cover visible code only. */
3878 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3879 struct inode *inode, __u32 *gen, bool reconf)
3881 struct ll_inode_info *lli = ll_i2info(inode);
3882 struct ll_sb_info *sbi = ll_i2sbi(inode);
3883 struct ldlm_lock *lock;
3884 struct lustre_md md = { NULL };
3885 struct cl_object_conf conf;
3888 bool wait_layout = false;
3891 LASSERT(lustre_handle_is_used(lockh));
3893 lock = ldlm_handle2lock(lockh);
3894 LASSERT(lock != NULL);
3895 LASSERT(ldlm_has_layout(lock));
3897 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3898 PFID(&lli->lli_fid), inode, reconf);
3900 /* in case this is a caching lock and reinstate with new inode */
3901 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3903 lock_res_and_lock(lock);
3904 lvb_ready = ldlm_is_lvb_ready(lock);
3905 unlock_res_and_lock(lock);
3906 /* checking lvb_ready is racy but this is okay. The worst case is
3907 * that multi processes may configure the file on the same time. */
/* Layout already applied (or no reconfiguration wanted): just report gen. */
3909 if (lvb_ready || !reconf) {
3912 /* layout_gen must be valid if layout lock is not
3913 * cancelled and stripe has already set */
3914 *gen = ll_layout_version_get(lli);
/* Ensure the lock's LVB holds the layout before unpacking it. */
3920 rc = ll_layout_fetch(inode, lock);
3924 /* for layout lock, lmm is returned in lock's lvb.
3925 * lvb_data is immutable if the lock is held so it's safe to access it
3926 * without res lock. See the description in ldlm_lock_decref_internal()
3927 * for the condition to free lvb_data of layout lock */
3928 if (lock->l_lvb_data != NULL) {
3929 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3930 lock->l_lvb_data, lock->l_lvb_len);
3932 *gen = LL_LAYOUT_GEN_EMPTY;
3934 *gen = md.lsm->lsm_layout_gen;
3937 CERROR("%s: file "DFID" unpackmd error: %d\n",
3938 ll_get_fsname(inode->i_sb, NULL, 0),
3939 PFID(&lli->lli_fid), rc);
3945 /* set layout to file. Unlikely this will fail as old layout was
3946 * surely eliminated */
3947 memset(&conf, 0, sizeof conf);
3948 conf.coc_opc = OBJECT_CONF_SET;
3949 conf.coc_inode = inode;
3950 conf.coc_lock = lock;
3951 conf.u.coc_md = &md;
3952 rc = ll_layout_conf(inode, &conf);
3955 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3957 /* refresh layout failed, need to wait */
3958 wait_layout = rc == -EBUSY;
/* Drop our references; callers re-enqueue if needed. */
3962 LDLM_LOCK_PUT(lock);
3963 ldlm_lock_decref(lockh, mode);
3965 /* wait for IO to complete if it's still being used. */
3967 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3968 ll_get_fsname(inode->i_sb, NULL, 0),
3969 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until in-flight IO under the old layout ends. */
3971 memset(&conf, 0, sizeof conf);
3972 conf.coc_opc = OBJECT_CONF_WAIT;
3973 conf.coc_inode = inode;
3974 rc = ll_layout_conf(inode, &conf);
3978 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3979 ll_get_fsname(inode->i_sb, NULL, 0),
3980 PFID(&lli->lli_fid), rc);
3986 * This function checks if there exists a LAYOUT lock on the client side,
3987 * or enqueues it if it doesn't have one in cache.
3989 * This function will not hold the layout lock, so it may be revoked any time
3990 * after this function returns. Any operations that depend on the layout should be redone
3993 * This function should be called before lov_io_init() to get an uptodate
3994 * layout version, the caller should save the version number and after IO
3995 * is finished, this function should be called again to verify that layout
3996 * is not changed during IO time.
3998 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4000 struct ll_inode_info *lli = ll_i2info(inode);
4001 struct ll_sb_info *sbi = ll_i2sbi(inode);
4002 struct md_op_data *op_data;
4003 struct lookup_intent it;
4004 struct lustre_handle lockh;
/* enqueue info for the MDS layout intent lock; blocking and completion
 * callbacks are the standard llite MD handlers */
4006 struct ldlm_enqueue_info einfo = {
4007 .ei_type = LDLM_IBITS,
4009 .ei_cb_bl = &ll_md_blocking_ast,
4010 .ei_cb_cp = &ldlm_completion_ast,
/* fast path: nothing to do if layout locks are disabled for this mount
 * or a layout generation is already cached on the inode */
4015 *gen = ll_layout_version_get(lli);
4016 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
4020 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4021 LASSERT(S_ISREG(inode->i_mode));
4023 /* take layout lock mutex to enqueue layout lock exclusively. */
4024 mutex_lock(&lli->lli_layout_mutex);
4027 /* mostly layout lock is caching on the local side, so try to match
4028 * it before grabbing layout lock mutex. */
4029 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4030 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4031 if (mode != 0) { /* hit cached lock */
4032 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4036 mutex_unlock(&lli->lli_layout_mutex);
/* no cached lock: build op data for a fresh IT_LAYOUT intent enqueue */
4040 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4041 0, 0, LUSTRE_OPC_ANY, NULL);
4042 if (IS_ERR(op_data)) {
4043 mutex_unlock(&lli->lli_layout_mutex);
4044 RETURN(PTR_ERR(op_data));
4047 /* have to enqueue one */
4048 memset(&it, 0, sizeof(it));
4049 it.it_op = IT_LAYOUT;
4050 lockh.cookie = 0ULL;
4052 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4053 ll_get_fsname(inode->i_sb, NULL, 0),
4054 PFID(&lli->lli_fid), inode);
4056 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* the enqueue reply request is not needed once the lock handle is
 * filled in, so release it immediately */
4057 if (it.d.lustre.it_data != NULL)
4058 ptlrpc_req_finished(it.d.lustre.it_data);
4059 it.d.lustre.it_data = NULL;
4061 ll_finish_md_op_data(op_data);
/* take the lock mode out of the intent and clear it so that
 * ll_intent_drop_lock() below does not release the lock we still hold */
4063 mode = it.d.lustre.it_lock_mode;
4064 it.d.lustre.it_lock_mode = 0;
4065 ll_intent_drop_lock(&it);
4068 /* set lock data in case this is a new lock */
4069 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
/* apply the layout carried in the new lock's LVB and report the
 * resulting layout generation through *gen */
4070 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4074 mutex_unlock(&lli->lli_layout_mutex);
4080 * This function send a restore request to the MDT
4082 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4084 struct hsm_user_request *hur;
4088 len = sizeof(struct hsm_user_request) +
4089 sizeof(struct hsm_user_item);
4090 OBD_ALLOC(hur, len);
4094 hur->hur_request.hr_action = HUA_RESTORE;
4095 hur->hur_request.hr_archive_id = 0;
4096 hur->hur_request.hr_flags = 0;
4097 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4098 sizeof(hur->hur_user_item[0].hui_fid));
4099 hur->hur_user_item[0].hui_extent.offset = offset;
4100 hur->hur_user_item[0].hui_extent.length = length;
4101 hur->hur_request.hr_itemcount = 1;
4102 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,