4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
52 #include "cl_object.h"
/* Forward declarations for helpers defined later in this file. */
55 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
57 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
60 static enum llioc_iter
61 ll_iocontrol_call(struct inode *inode, struct file *file,
62 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open-file ll_file_data from its dedicated slab cache.
 * GFP_NOFS prevents re-entering the filesystem during reclaim.
 * NOTE(review): the NULL-check/return path is outside this excerpt —
 * verify callers handle allocation failure.
 */
64 static struct ll_file_data *ll_file_data_get(void)
66 struct ll_file_data *fd;
68 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Fresh open: no write failures recorded yet (consulted at close time). */
72 fd->fd_write_failed = false;
/* Release a ll_file_data back to the slab cache (counterpart of
 * ll_file_data_get()). */
77 static void ll_file_data_put(struct ll_file_data *fd)
80 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes (mode, a/m/ctime, size, blocks,
 * flags), the IO epoch, the open handle @fh and an MDS capability into
 * @op_data for a subsequent MDS request.
 */
83 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
84 struct lustre_handle *fh)
86 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
87 op_data->op_attr.ia_mode = inode->i_mode;
88 op_data->op_attr.ia_atime = inode->i_atime;
89 op_data->op_attr.ia_mtime = inode->i_mtime;
90 op_data->op_attr.ia_ctime = inode->i_ctime;
91 op_data->op_attr.ia_size = i_size_read(inode);
92 op_data->op_attr_blocks = inode->i_blocks;
/* Convert VFS inode flags to the on-wire (ext-style) representation. */
93 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
94 ll_inode_to_ext_flags(inode->i_flags);
95 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
97 op_data->op_handle = *fh;
98 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Propagate the locally-recorded "data modified" state to the MDS. */
100 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
101 op_data->op_bias |= MDS_DATA_MODIFIED;
105 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Prepare @op_data for an MDS close of @och: mark which attributes are
 * valid, close the IO epoch, and pack the current inode attributes.
 */
108 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
109 struct obd_client_handle *och)
113 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
114 ATTR_MTIME | ATTR_MTIME_SET |
115 ATTR_CTIME | ATTR_CTIME_SET;
/* Size/blocks are only meaningful for write opens. */
117 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the client
 * sends size/blocks itself. */
120 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
121 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
123 ll_ioepoch_close(inode, op_data, &och, 0);
126 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
127 ll_prep_md_op_data(op_data, inode, NULL, NULL,
128 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS close for open handle @och on @inode.
 *
 * If @data_version is non-NULL this close is an HSM release: the data
 * version and lease handle are packed so the MDS can verify the file
 * was not modified while being archived.
 *
 * NOTE(review): several branches (error paths, SOM epoch handling) are
 * elided in this excerpt; behavior claims below are limited to what is
 * visible.
 */
132 static int ll_close_inode_openhandle(struct obd_export *md_exp,
134 struct obd_client_handle *och,
135 const __u64 *data_version)
137 struct obd_export *exp = ll_i2mdexp(inode);
138 struct md_op_data *op_data;
139 struct ptlrpc_request *req = NULL;
140 struct obd_device *obd = class_exp2obd(exp);
147 * XXX: in case of LMV, is this correct to access
150 CERROR("Invalid MDC connection handle "LPX64"\n",
151 ll_i2mdexp(inode)->exp_handle.h_cookie);
155 OBD_ALLOC_PTR(op_data);
157 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
159 ll_prepare_close(inode, op_data, och);
160 if (data_version != NULL) {
161 /* Pass in data_version implies release. */
162 op_data->op_bias |= MDS_HSM_RELEASE;
163 op_data->op_data_version = *data_version;
164 op_data->op_lease_handle = och->och_lease_handle;
165 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* Remember whether ll_prepare_close() closed the IO epoch. */
167 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
168 rc = md_close(md_exp, op_data, och->och_mod, &req);
170 /* This close must have the epoch closed. */
171 LASSERT(epoch_close);
172 /* MDS has instructed us to obtain Size-on-MDS attribute from
173 * OSTs and send setattr to back to MDS. */
174 rc = ll_som_update(inode, op_data);
176 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
177 " failed: rc = %d\n",
178 ll_i2mdexp(inode)->exp_obd->obd_name,
179 PFID(ll_inode2fid(inode)), rc);
183 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
184 ll_i2mdexp(inode)->exp_obd->obd_name,
185 PFID(ll_inode2fid(inode)), rc);
188 /* DATA_MODIFIED flag was successfully sent on close, cancel data
189 * modification flag. */
190 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
191 struct ll_inode_info *lli = ll_i2info(inode);
/* lli_flags is protected by lli_lock. */
193 spin_lock(&lli->lli_lock);
194 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
195 spin_unlock(&lli->lli_lock);
199 rc = ll_objects_destroy(req, inode);
201 CERROR("%s: inode "DFID
202 " ll_objects destroy: rc = %d\n",
203 ll_i2mdexp(inode)->exp_obd->obd_name,
204 PFID(ll_inode2fid(inode)), rc);
/* For HSM release, check whether the MDS actually released the file. */
207 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
208 struct mdt_body *body;
209 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
210 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
214 ll_finish_md_op_data(op_data);
/* SOM write opens whose epoch is still open must queue DONE_WRITING. */
218 if (exp_connect_som(exp) && !epoch_close &&
219 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
220 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
222 md_clear_open_replay_data(md_exp, och);
223 /* Free @och if it is not waiting for DONE_WRITING. */
224 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
227 if (req) /* This is close request */
228 ptlrpc_req_finished(req);
/*
 * Drop the MDS open handle of the given @fmode class (write/exec/read)
 * if this was the last local user of it; otherwise just return.
 */
232 int ll_md_real_close(struct inode *inode, fmode_t fmode)
234 struct ll_inode_info *lli = ll_i2info(inode);
235 struct obd_client_handle **och_p;
236 struct obd_client_handle *och;
/* Select the handle slot and its use count matching the open mode. */
241 if (fmode & FMODE_WRITE) {
242 och_p = &lli->lli_mds_write_och;
243 och_usecount = &lli->lli_open_fd_write_count;
244 } else if (fmode & FMODE_EXEC) {
245 och_p = &lli->lli_mds_exec_och;
246 och_usecount = &lli->lli_open_fd_exec_count;
248 LASSERT(fmode & FMODE_READ);
249 och_p = &lli->lli_mds_read_och;
250 och_usecount = &lli->lli_open_fd_read_count;
253 mutex_lock(&lli->lli_och_mutex);
254 if (*och_usecount > 0) {
255 /* There are still users of this handle, so skip
257 mutex_unlock(&lli->lli_och_mutex);
263 mutex_unlock(&lli->lli_och_mutex);
266 /* There might be a race and this handle may already
268 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file close bookkeeping: release group lock / lease / extra open
 * handle held by this file descriptor, decrement the per-mode open
 * count, and call ll_md_real_close() unless a cached OPEN lock lets us
 * skip talking to the MDS.
 */
275 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
278 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
279 struct ll_inode_info *lli = ll_i2info(inode);
283 /* clear group lock, if present */
284 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
285 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
287 if (fd->fd_lease_och != NULL) {
290 /* Usually the lease is not released when the
291 * application crashed, we need to release here. */
292 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
293 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
294 PFID(&lli->lli_fid), rc, lease_broken);
296 fd->fd_lease_och = NULL;
/* A private open handle (e.g. from a lease swap) is closed directly. */
299 if (fd->fd_och != NULL) {
300 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
305 /* Let's see if we have good enough OPEN lock on the file and if
306 we can skip talking to MDS */
307 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* TEST_LOCK: match only, do not take a reference on the lock. */
309 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
310 struct lustre_handle lockh;
311 struct inode *inode = file->f_dentry->d_inode;
312 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
314 mutex_lock(&lli->lli_och_mutex);
315 if (fd->fd_omode & FMODE_WRITE) {
317 LASSERT(lli->lli_open_fd_write_count);
318 lli->lli_open_fd_write_count--;
319 } else if (fd->fd_omode & FMODE_EXEC) {
321 LASSERT(lli->lli_open_fd_exec_count);
322 lli->lli_open_fd_exec_count--;
325 LASSERT(lli->lli_open_fd_read_count);
326 lli->lli_open_fd_read_count--;
328 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> really close the MDS open handle. */
330 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
331 LDLM_IBITS, &policy, lockmode,
333 rc = ll_md_real_close(file->f_dentry->d_inode,
337 CERROR("released file has negative dentry: file = %p, "
338 "dentry = %p, name = %s\n",
339 file, file->f_dentry, file->f_dentry->d_name.name);
343 LUSTRE_FPRIVATE(file) = NULL;
344 ll_file_data_put(fd);
345 ll_capa_close(inode);
350 /* While this returns an error code, fput() the caller does not, so we need
351 * to make every effort to clean up all of our state here. Also, applications
352 * rarely check close errors and even if an error is returned they will not
353 * re-try the close call.
/* VFS ->release() entry point: tear down remote-ACL state, statahead
 * authorization and per-file data, then close via ll_md_close().
 * The root dentry is special-cased: it has no MDS open handle here. */
355 int ll_file_release(struct inode *inode, struct file *file)
357 struct ll_file_data *fd;
358 struct ll_sb_info *sbi = ll_i2sbi(inode);
359 struct ll_inode_info *lli = ll_i2info(inode);
363 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
364 PFID(ll_inode2fid(inode)), inode);
366 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL cleanup applies only to the filesystem root. */
367 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
368 inode == inode->i_sb->s_root->d_inode) {
369 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
372 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
373 fd->fd_flags &= ~LL_FILE_RMTACL;
374 rct_del(&sbi->ll_rct, current_pid());
375 et_search_free(&sbi->ll_et, current_pid());
/* Do not count releases of the root dentry in the stats. */
380 if (inode->i_sb->s_root != file->f_dentry)
381 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
382 fd = LUSTRE_FPRIVATE(file);
385 /* The last ref on @file, maybe not the owner pid of statahead,
386 * because parent and child process can share the same file handle. */
387 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
388 ll_deauthorize_statahead(inode, fd);
390 if (inode->i_sb->s_root == file->f_dentry) {
391 LUSTRE_FPRIVATE(file) = NULL;
392 ll_file_data_put(fd);
/* Fold any deferred async write error into this close's return code. */
396 if (!S_ISDIR(inode->i_mode)) {
397 if (lli->lli_clob != NULL)
398 lov_read_and_clear_async_rc(lli->lli_clob);
399 lli->lli_async_rc = 0;
402 rc = ll_md_close(sbi->ll_md_exp, inode, file);
404 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
405 libcfs_debug_dumplog();
/*
 * Issue an intent-OPEN enqueue to the MDS for @file (optionally packing
 * striping data @lmm/@lmmsize), then update the inode from the reply and
 * install the returned lock data.
 */
410 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
411 struct lookup_intent *itp)
413 struct dentry *de = file->f_dentry;
414 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
415 struct dentry *parent = de->d_parent;
416 const char *name = NULL;
418 struct md_op_data *op_data;
419 struct ptlrpc_request *req = NULL;
423 LASSERT(parent != NULL);
424 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
426 /* if server supports open-by-fid, or file name is invalid, don't pack
427 * name in open request */
428 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
429 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
430 name = de->d_name.name;
431 len = de->d_name.len;
434 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
435 name, len, 0, LUSTRE_OPC_ANY, NULL);
437 RETURN(PTR_ERR(op_data));
438 op_data->op_data = lmm;
439 op_data->op_data_size = lmmsize;
441 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
442 &ll_md_blocking_ast, 0);
443 ll_finish_md_op_data(op_data);
445 /* reason for keep own exit path - don't flood log
446 * with messages with -ESTALE errors.
/* An open that failed after DISP_OPEN_OPEN still holds a server-side
 * handle that must be released. */
448 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
449 it_open_error(DISP_OPEN_OPEN, itp))
451 ll_release_openhandle(de, itp);
455 if (it_disposition(itp, DISP_LOOKUP_NEG))
456 GOTO(out, rc = -ENOENT);
458 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
459 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
460 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the reply and attach the granted lock. */
464 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
465 if (!rc && itp->d.lustre.it_lock_mode)
466 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
469 ptlrpc_req_finished(req);
470 ll_intent_drop_lock(itp);
476 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
477 * not believe attributes if a few ioepoch holders exist. Attributes for
478 * previous ioepoch if new one is opened are also skipped by MDS.
/* Record the IO epoch returned by the MDS on this inode (only when it
 * actually changed). */
480 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
482 if (ioepoch && lli->lli_ioepoch != ioepoch) {
483 lli->lli_ioepoch = ioepoch;
484 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
485 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the MDS open reply carried by the
 * intent @it, then register the open for replay.
 */
489 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
490 struct obd_client_handle *och)
492 struct ptlrpc_request *req = it->d.lustre.it_data;
493 struct mdt_body *body;
495 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
496 och->och_fh = body->mbo_handle;
497 och->och_fid = body->mbo_fid1;
/* Lock handle doubles as the lease handle for lease opens. */
498 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
499 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
500 och->och_flags = it->it_flags;
502 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-side part of an open: fill @och from the intent (if
 * given), attach @fd to the file, and initialize readahead and the
 * cl_context bookkeeping.
 */
505 static int ll_local_open(struct file *file, struct lookup_intent *it,
506 struct ll_file_data *fd, struct obd_client_handle *och)
508 struct inode *inode = file->f_dentry->d_inode;
509 struct ll_inode_info *lli = ll_i2info(inode);
512 LASSERT(!LUSTRE_FPRIVATE(file));
517 struct ptlrpc_request *req = it->d.lustre.it_data;
518 struct mdt_body *body;
521 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
/* Adopt the IO epoch granted by the MDS for this open. */
525 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
526 ll_ioepoch_open(lli, body->mbo_ioepoch);
529 LUSTRE_FPRIVATE(file) = fd;
530 ll_readahead_init(inode, &fd->fd_ras);
531 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
533 /* ll_cl_context initialize */
534 rwlock_init(&fd->fd_lock);
535 INIT_LIST_HEAD(&fd->fd_lccs);
540 /* Open a file, and (for the very first open) create objects on the OSTs at
541 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
542 * creation or open until ll_lov_setstripe() ioctl is called.
544 * If we already have the stripe MD locally then we don't request it in
545 * md_open(), by passing a lmm_size = 0.
547 * It is up to the application to ensure no other processes open this file
548 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
549 * used. We might be able to avoid races of that sort by getting lli_open_sem
550 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
551 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point. Reuses a cached per-mode MDS open handle
 * when one exists; otherwise performs an intent open and caches the
 * resulting handle. NOTE(review): several error/cleanup paths are
 * elided in this excerpt. */
553 int ll_file_open(struct inode *inode, struct file *file)
555 struct ll_inode_info *lli = ll_i2info(inode);
556 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
557 .it_flags = file->f_flags };
558 struct obd_client_handle **och_p = NULL;
559 __u64 *och_usecount = NULL;
560 struct ll_file_data *fd;
564 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
565 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by the lookup path. */
567 it = file->private_data; /* XXX: compat macro */
568 file->private_data = NULL; /* prevent ll_local_open assertion */
570 fd = ll_file_data_get();
572 GOTO(out_openerr, rc = -ENOMEM);
575 if (S_ISDIR(inode->i_mode))
576 ll_authorize_statahead(inode, fd);
/* The root dentry needs no MDS open handle. */
578 if (inode->i_sb->s_root == file->f_dentry) {
579 LUSTRE_FPRIVATE(file) = fd;
583 if (!it || !it->d.lustre.it_disposition) {
584 /* Convert f_flags into access mode. We cannot use file->f_mode,
585 * because everything but O_ACCMODE mask was stripped from
587 if ((oit.it_flags + 1) & O_ACCMODE)
589 if (file->f_flags & O_TRUNC)
590 oit.it_flags |= FMODE_WRITE;
592 /* kernel only call f_op->open in dentry_open. filp_open calls
593 * dentry_open after call to open_namei that checks permissions.
594 * Only nfsd_open call dentry_open directly without checking
595 * permissions and because of that this code below is safe. */
596 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
597 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
599 /* We do not want O_EXCL here, presumably we opened the file
600 * already? XXX - NFS implications? */
601 oit.it_flags &= ~O_EXCL;
603 /* bug20584, if "it_flags" contains O_CREAT, the file will be
604 * created if necessary, then "IT_CREAT" should be set to keep
605 * consistent with it */
606 if (oit.it_flags & O_CREAT)
607 oit.it_op |= IT_CREAT;
613 /* Let's see if we have file open on MDS already. */
614 if (it->it_flags & FMODE_WRITE) {
615 och_p = &lli->lli_mds_write_och;
616 och_usecount = &lli->lli_open_fd_write_count;
617 } else if (it->it_flags & FMODE_EXEC) {
618 och_p = &lli->lli_mds_exec_och;
619 och_usecount = &lli->lli_open_fd_exec_count;
621 och_p = &lli->lli_mds_read_och;
622 och_usecount = &lli->lli_open_fd_read_count;
625 mutex_lock(&lli->lli_och_mutex);
626 if (*och_p) { /* Open handle is present */
627 if (it_disposition(it, DISP_OPEN_OPEN)) {
628 /* Well, there's extra open request that we do not need,
629 let's close it somehow. This will decref request. */
630 rc = it_open_error(DISP_OPEN_OPEN, it);
632 mutex_unlock(&lli->lli_och_mutex);
633 GOTO(out_openerr, rc);
636 ll_release_openhandle(file->f_dentry, it);
/* Reuse the cached handle: local open only, no new MDS request. */
640 rc = ll_local_open(file, it, fd, NULL);
643 mutex_unlock(&lli->lli_och_mutex);
644 GOTO(out_openerr, rc);
647 LASSERT(*och_usecount == 0);
648 if (!it->d.lustre.it_disposition) {
649 /* We cannot just request lock handle now, new ELC code
650 means that one of other OPEN locks for this file
651 could be cancelled, and since blocking ast handler
652 would attempt to grab och_mutex as well, that would
653 result in a deadlock */
654 mutex_unlock(&lli->lli_och_mutex);
656 * Normally called under two situations:
658 * 2. A race/condition on MDS resulting in no open
659 * handle to be returned from LOOKUP|OPEN request,
660 * for example if the target entry was a symlink.
662 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
664 * Always specify MDS_OPEN_BY_FID because we don't want
665 * to get file with different fid.
667 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
668 rc = ll_intent_file_open(file, NULL, 0, it);
670 GOTO(out_openerr, rc);
674 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
676 GOTO(out_och_free, rc = -ENOMEM);
680 /* md_intent_lock() didn't get a request ref if there was an
681 * open error, so don't do cleanup on the request here
683 /* XXX (green): Should not we bail out on any error here, not
684 * just open error? */
685 rc = it_open_error(DISP_OPEN_OPEN, it);
687 GOTO(out_och_free, rc);
689 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
690 "inode %p: disposition %x, status %d\n", inode,
691 it_disposition(it, ~0), it->d.lustre.it_status);
693 rc = ll_local_open(file, it, fd, *och_p);
695 GOTO(out_och_free, rc);
697 mutex_unlock(&lli->lli_och_mutex);
700 /* Must do this outside lli_och_mutex lock to prevent deadlock where
701 different kind of OPEN lock for this same inode gets cancelled
702 by ldlm_cancel_lru */
703 if (!S_ISREG(inode->i_mode))
704 GOTO(out_och_free, rc);
/* Delayed OST object creation: skip layout instantiation for
 * O_LOV_DELAY_CREATE or read-only opens of layout-less files. */
708 if (!lli->lli_has_smd &&
709 (cl_is_lov_delay_create(file->f_flags) ||
710 (file->f_mode & FMODE_WRITE) == 0)) {
711 CDEBUG(D_INODE, "object creation was delayed\n");
712 GOTO(out_och_free, rc);
714 cl_lov_delay_create_clear(&file->f_flags);
715 GOTO(out_och_free, rc);
719 if (och_p && *och_p) {
720 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
721 *och_p = NULL; /* OBD_FREE writes some magic there */
724 mutex_unlock(&lli->lli_och_mutex);
727 if (lli->lli_opendir_key == fd)
728 ll_deauthorize_statahead(inode, fd);
730 ll_file_data_put(fd);
732 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the open-reply request reference held via the intent. */
735 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
736 ptlrpc_req_finished(it->d.lustre.it_data);
737 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on a blocking callback, cancel the lease
 * lock asynchronously (which is how a lease gets "broken").
 */
743 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
744 struct ldlm_lock_desc *desc, void *data, int flag)
747 struct lustre_handle lockh;
751 case LDLM_CB_BLOCKING:
752 ldlm_lock2handle(lock, &lockh);
753 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
755 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
759 case LDLM_CB_CANCELING:
767 * Acquire a lease and open the file.
/* Open @inode with a lease of mode @fmode (FMODE_READ or FMODE_WRITE).
 * If @file is given, the existing openhandle is passed to the MDT so the
 * lease is tied to the same owner. Returns the new handle or ERR_PTR. */
769 static struct obd_client_handle *
770 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
773 struct lookup_intent it = { .it_op = IT_OPEN };
774 struct ll_sb_info *sbi = ll_i2sbi(inode);
775 struct md_op_data *op_data;
776 struct ptlrpc_request *req = NULL;
777 struct lustre_handle old_handle = { 0 };
778 struct obd_client_handle *och = NULL;
/* Leases are only defined for plain read or plain write opens. */
783 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
784 RETURN(ERR_PTR(-EINVAL));
787 struct ll_inode_info *lli = ll_i2info(inode);
788 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
789 struct obd_client_handle **och_p;
/* The lease mode must be covered by how the file was opened. */
792 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
793 RETURN(ERR_PTR(-EPERM));
795 /* Get the openhandle of the file */
797 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor. */
798 if (fd->fd_lease_och != NULL) {
799 mutex_unlock(&lli->lli_och_mutex);
803 if (fd->fd_och == NULL) {
804 if (file->f_mode & FMODE_WRITE) {
805 LASSERT(lli->lli_mds_write_och != NULL);
806 och_p = &lli->lli_mds_write_och;
807 och_usecount = &lli->lli_open_fd_write_count;
809 LASSERT(lli->lli_mds_read_och != NULL);
810 och_p = &lli->lli_mds_read_och;
811 och_usecount = &lli->lli_open_fd_read_count;
/* Can only take over the handle if we are its sole user. */
813 if (*och_usecount == 1) {
820 mutex_unlock(&lli->lli_och_mutex);
821 if (rc < 0) /* more than 1 opener */
824 LASSERT(fd->fd_och != NULL);
825 old_handle = fd->fd_och->och_fh;
830 RETURN(ERR_PTR(-ENOMEM));
832 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
833 LUSTRE_OPC_ANY, NULL);
835 GOTO(out, rc = PTR_ERR(op_data));
837 /* To tell the MDT this openhandle is from the same owner */
838 op_data->op_handle = old_handle;
840 it.it_flags = fmode | open_flags;
841 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
842 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
843 &ll_md_blocking_lease_ast,
844 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
845 * it can be cancelled which may mislead applications that the lease is
847 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
848 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
849 * doesn't deal with openhandle, so normal openhandle will be leaked. */
850 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
851 ll_finish_md_op_data(op_data);
852 ptlrpc_req_finished(req);
854 GOTO(out_release_it, rc);
856 if (it_disposition(&it, DISP_LOOKUP_NEG))
857 GOTO(out_release_it, rc = -ENOENT);
859 rc = it_open_error(DISP_OPEN_OPEN, &it);
861 GOTO(out_release_it, rc);
863 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
864 ll_och_fill(sbi->ll_md_exp, &it, och);
866 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
867 GOTO(out_close, rc = -EOPNOTSUPP);
869 /* already get lease, handle lease lock */
870 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
871 if (it.d.lustre.it_lock_mode == 0 ||
872 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
873 /* open lock must return for lease */
874 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
875 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
876 it.d.lustre.it_lock_bits);
877 GOTO(out_close, rc = -EPROTO);
880 ll_intent_release(&it);
884 /* Cancel open lock */
885 if (it.d.lustre.it_lock_mode != 0) {
886 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
887 it.d.lustre.it_lock_mode);
888 it.d.lustre.it_lock_mode = 0;
889 och->och_lease_handle.cookie = 0ULL;
891 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
893 CERROR("%s: error closing file "DFID": %d\n",
894 ll_get_fsname(inode->i_sb, NULL, 0),
895 PFID(&ll_i2info(inode)->lli_fid), rc2);
896 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
898 ll_intent_release(&it);
906 * Release lease and close the file.
907 * It will check if the lease has ever broken.
/* Cancel the lease lock (reporting via @lease_broken whether it was
 * already broken) and close the associated open handle on the MDS. */
909 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
912 struct ldlm_lock *lock;
913 bool cancelled = true;
917 lock = ldlm_handle2lock(&och->och_lease_handle);
/* A lock already flagged CANCEL means the lease was broken earlier. */
919 lock_res_and_lock(lock);
920 cancelled = ldlm_is_cancel(lock);
921 unlock_res_and_lock(lock);
925 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
926 PFID(&ll_i2info(inode)->lli_fid), cancelled);
929 ldlm_cli_cancel(&och->och_lease_handle, 0);
930 if (lease_broken != NULL)
931 *lease_broken = cancelled;
933 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
938 /* Fills the obdo with the attributes for the lsm */
/* Perform an async OST getattr across the stripes of @lsm and gather the
 * results into @obdo. @dv_flags can request server-side read/write flush
 * for an exact data version. */
939 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
940 struct obd_capa *capa, struct obdo *obdo,
941 __u64 ioepoch, int dv_flags)
943 struct ptlrpc_request_set *set;
944 struct obd_info oinfo = { { { 0 } } };
949 LASSERT(lsm != NULL);
953 oinfo.oi_oa->o_oi = lsm->lsm_oi;
954 oinfo.oi_oa->o_mode = S_IFREG;
955 oinfo.oi_oa->o_ioepoch = ioepoch;
/* Request the full set of attributes needed for stat/data-version. */
956 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
957 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
958 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
959 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
960 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
961 OBD_MD_FLDATAVERSION;
962 oinfo.oi_capa = capa;
/* SRVLOCK makes the OST take the lock; FLUSH forces dirty data out so
 * the returned data version is stable. */
963 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
964 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
965 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
966 if (dv_flags & LL_DV_WR_FLUSH)
967 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
970 set = ptlrpc_prep_set();
972 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
975 rc = obd_getattr_async(exp, &oinfo, set);
977 rc = ptlrpc_set_wait(set);
978 ptlrpc_set_destroy(set);
/* Keep only the attributes the OSTs are authoritative for. */
981 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
982 OBD_MD_FLATIME | OBD_MD_FLMTIME |
983 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
984 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* A write-flush that the server did not acknowledge is an error. */
985 if (dv_flags & LL_DV_WR_FLUSH &&
986 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
987 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
994 * Performs the getattr on the inode and updates its fields.
995 * If @sync != 0, perform the getattr under the server-side lock.
997 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
998 __u64 ioepoch, int sync)
1000 struct obd_capa *capa = ll_mdscapa_get(inode);
1001 struct lov_stripe_md *lsm;
1005 lsm = ccc_inode_lsm_get(inode);
1006 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1007 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1010 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* Merge OST attributes into the VFS inode. */
1012 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1013 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1014 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1015 (unsigned long long)inode->i_blocks,
1016 1UL << inode->i_blkbits);
1018 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps with the OST-side attributes cached in
 * the cl_object, taking the most recent of each, and update the inode's
 * size/blocks/times under the inode size lock.
 */
1022 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1024 struct ll_inode_info *lli = ll_i2info(inode);
1025 struct cl_object *obj = lli->lli_clob;
1026 struct cl_attr *attr = ccc_env_thread_attr(env);
1032 ll_inode_size_lock(inode);
1033 /* merge timestamps the most recently obtained from mds with
1034 timestamps obtained from osts */
1035 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1036 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1037 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1039 lvb.lvb_size = i_size_read(inode);
1040 lvb.lvb_blocks = inode->i_blocks;
1041 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1042 lvb.lvb_atime = LTIME_S(inode->i_atime);
1043 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1045 cl_object_attr_lock(obj);
1046 rc = cl_object_attr_get(env, obj, attr);
1047 cl_object_attr_unlock(obj);
/* Take the newer of the MDS and OST timestamps. */
1050 if (lvb.lvb_atime < attr->cat_atime)
1051 lvb.lvb_atime = attr->cat_atime;
1052 if (lvb.lvb_ctime < attr->cat_ctime)
1053 lvb.lvb_ctime = attr->cat_ctime;
1054 if (lvb.lvb_mtime < attr->cat_mtime)
1055 lvb.lvb_mtime = attr->cat_mtime;
1057 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1058 PFID(&lli->lli_fid), attr->cat_size);
1059 cl_isize_write_nolock(inode, attr->cat_size);
1061 inode->i_blocks = attr->cat_blocks;
1063 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1064 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1065 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1067 ll_inode_size_unlock(inode);
/* Fetch size/blocks/timestamps for @lsm from the OSTs and copy them into
 * the user-visible stat structure @st. */
1072 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1075 struct obdo obdo = { 0 };
1078 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1080 st->st_size = obdo.o_size;
1081 st->st_blocks = obdo.o_blocks;
1082 st->st_mtime = obdo.o_mtime;
1083 st->st_atime = obdo.o_atime;
1084 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether atime updates should be suppressed for this file,
 * checking the open flags, inode flags, mount options and superblock
 * options in the same order as the kernel's file_accessed()/touch_atime().
 */
1089 static bool file_is_noatime(const struct file *file)
1091 const struct vfsmount *mnt = file->f_path.mnt;
1092 const struct inode *inode = file->f_path.dentry->d_inode;
1094 /* Adapted from file_accessed() and touch_atime().*/
1095 if (file->f_flags & O_NOATIME)
1098 if (inode->i_flags & S_NOATIME)
1101 if (IS_NOATIME(inode))
1104 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1107 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1110 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: propagate O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT into the IO and
 * choose the DLM locking policy.
 */
1116 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1118 struct inode *inode = file->f_dentry->d_inode;
1120 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1122 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1123 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1124 file->f_flags & O_DIRECT ||
1127 io->ci_obj = ll_i2info(inode)->lli_clob;
1128 io->ci_lockreq = CILR_MAYBE;
/* No-lock files bypass DLM entirely; append requires a mandatory lock
 * because the write offset depends on the current file size. */
1129 if (ll_file_nolock(file)) {
1130 io->ci_lockreq = CILR_NEVER;
1131 io->ci_no_srvlock = 1;
1132 } else if (file->f_flags & O_APPEND) {
1133 io->ci_lockreq = CILR_MANDATORY;
1136 io->ci_noatime = file_is_noatime(file);
/*
 * Common engine for file reads and writes (normal and splice paths).
 * Sets up the cl_io, takes the per-file range lock for non-group-locked
 * writes, runs the cl_io loop, and restarts short IO when requested by
 * the lower layers. Updates *ppos and the read/write statistics.
 */
1140 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1141 struct file *file, enum cl_io_type iot,
1142 loff_t *ppos, size_t count)
1144 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1145 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1148 struct range_lock range;
1151 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1152 file->f_dentry->d_name.name, iot, *ppos, count);
1155 io = ccc_env_thread_io(env);
1156 ll_io_init(io, file, iot == CIT_WRITE);
1158 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1159 struct vvp_io *vio = vvp_env_io(env);
1160 struct ccc_io *cio = ccc_env_io(env);
1161 bool range_locked = false;
/* Append writes serialize against the whole file. */
1163 if (file->f_flags & O_APPEND)
1164 range_lock_init(&range, 0, LUSTRE_EOF);
1166 range_lock_init(&range, *ppos, *ppos + count - 1);
1167 cio->cui_fd = LUSTRE_FPRIVATE(file);
1168 vio->cui_io_subtype = args->via_io_subtype;
1170 switch (vio->cui_io_subtype) {
1172 cio->cui_iov = args->u.normal.via_iov;
1173 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1174 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1175 cio->cui_iocb = args->u.normal.via_iocb;
/* Group-locked writes already hold an exclusive lock, so the
 * range lock is skipped for them. */
1176 if ((iot == CIT_WRITE) &&
1177 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1178 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1180 result = range_lock(&lli->lli_write_tree,
1185 range_locked = true;
/* Block concurrent truncates for the duration of the IO. */
1187 down_read(&lli->lli_trunc_sem);
1190 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1191 vio->u.splice.cui_flags = args->u.splice.via_flags;
1194 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1198 ll_cl_add(file, env, io);
1199 result = cl_io_loop(env, io);
1200 ll_cl_remove(file, env);
1202 if (args->via_io_subtype == IO_NORMAL)
1203 up_read(&lli->lli_trunc_sem);
1205 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1207 range_unlock(&lli->lli_write_tree, &range);
1210 /* cl_io_rw_init() handled IO */
1211 result = io->ci_result;
1214 if (io->ci_nob > 0) {
1215 result = io->ci_nob;
1216 *ppos = io->u.ci_wr.wr.crw_pos;
1220 cl_io_fini(env, io);
1221 /* If any bit been read/written (result != 0), we just return
1222 * short read/write instead of restart io. */
1223 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1224 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1225 iot == CIT_READ ? "read" : "write",
1226 file->f_dentry->d_name.name, *ppos, count);
1227 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1231 if (iot == CIT_READ) {
1233 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1234 LPROC_LL_READ_BYTES, result);
1235 } else if (iot == CIT_WRITE) {
1237 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1238 LPROC_LL_WRITE_BYTES, result);
1239 fd->fd_write_failed = false;
/* -ERESTARTSYS is a retryable interruption, not a real failure. */
1240 } else if (result != -ERESTARTSYS) {
1241 fd->fd_write_failed = true;
1244 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
/*
 * Validate an iovec array and compute the total byte count, mirroring the
 * kernel's own segment checks (negative lengths, signed wrap, access_ok).
 * NOTE(review): elided listing — the zeroing/early-exit lines of the
 * original are not visible here.
 */
1251 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1253 static int ll_file_get_iov_count(const struct iovec *iov,
1254 unsigned long *nr_segs, size_t *count)
1259 for (seg = 0; seg < *nr_segs; seg++) {
1260 const struct iovec *iv = &iov[seg];
1263 * If any segment has a negative length, or the cumulative
1264 * length ever wraps negative then return -EINVAL.
1267 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1269 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
/* Unreadable user segment: truncate the request at this point. */
1274 cnt -= iv->iov_len; /* This segment is no good */
/*
 * AIO read entry point: validate the iovec, pack it into vvp_io_args and
 * dispatch through ll_file_io_generic() as CIT_READ.
 */
1281 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1282 unsigned long nr_segs, loff_t pos)
1285 struct vvp_io_args *args;
1291 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1295 env = cl_env_get(&refcheck);
1297 RETURN(PTR_ERR(env));
1299 args = vvp_env_args(env, IO_NORMAL);
/* Cast away const: the args struct stores a mutable iovec pointer. */
1300 args->u.normal.via_iov = (struct iovec *)iov;
1301 args->u.normal.via_nrsegs = nr_segs;
1302 args->u.normal.via_iocb = iocb;
1304 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1305 &iocb->ki_pos, count);
1306 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) path: build a one-segment iovec and a sync kiocb in
 * per-env scratch space, then reuse the AIO read path.
 */
1310 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1314 struct iovec *local_iov;
1315 struct kiocb *kiocb;
1320 env = cl_env_get(&refcheck);
1322 RETURN(PTR_ERR(env));
1324 local_iov = &vvp_env_info(env)->vti_local_iov;
1325 kiocb = &vvp_env_info(env)->vti_kiocb;
1326 local_iov->iov_base = (void __user *)buf;
1327 local_iov->iov_len = count;
1328 init_sync_kiocb(kiocb, file);
1329 kiocb->ki_pos = *ppos;
/* Kernel-version compat: the "remaining bytes" field was renamed. */
1330 #ifdef HAVE_KIOCB_KI_LEFT
1331 kiocb->ki_left = count;
1333 kiocb->ki_nbytes = count;
1336 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1337 *ppos = kiocb->ki_pos;
1339 cl_env_put(env, &refcheck);
1344 * Write to a file (through the page cache).
/*
 * AIO write entry point: validate the iovec, pack it into vvp_io_args and
 * dispatch through ll_file_io_generic() as CIT_WRITE.
 */
1347 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1348 unsigned long nr_segs, loff_t pos)
1351 struct vvp_io_args *args;
1357 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1361 env = cl_env_get(&refcheck);
1363 RETURN(PTR_ERR(env));
1365 args = vvp_env_args(env, IO_NORMAL);
1366 args->u.normal.via_iov = (struct iovec *)iov;
1367 args->u.normal.via_nrsegs = nr_segs;
1368 args->u.normal.via_iocb = iocb;
1370 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1371 &iocb->ki_pos, count);
1372 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) path: mirrors ll_file_read() — one-segment iovec
 * plus a sync kiocb, then the AIO write path does the work.
 */
1376 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1377 size_t count, loff_t *ppos)
1380 struct iovec *local_iov;
1381 struct kiocb *kiocb;
1386 env = cl_env_get(&refcheck);
1388 RETURN(PTR_ERR(env));
1390 local_iov = &vvp_env_info(env)->vti_local_iov;
1391 kiocb = &vvp_env_info(env)->vti_kiocb;
1392 local_iov->iov_base = (void __user *)buf;
1393 local_iov->iov_len = count;
1394 init_sync_kiocb(kiocb, file);
1395 kiocb->ki_pos = *ppos;
/* Kernel-version compat: the "remaining bytes" field was renamed. */
1396 #ifdef HAVE_KIOCB_KI_LEFT
1397 kiocb->ki_left = count;
1399 kiocb->ki_nbytes = count;
1402 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1403 *ppos = kiocb->ki_pos;
1405 cl_env_put(env, &refcheck);
1410 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: packs the pipe and flags into IO_SPLICE args
 * and runs a CIT_READ through ll_file_io_generic().
 */
1412 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1413 struct pipe_inode_info *pipe, size_t count,
1417 struct vvp_io_args *args;
1422 env = cl_env_get(&refcheck);
1424 RETURN(PTR_ERR(env));
1426 args = vvp_env_args(env, IO_SPLICE);
1427 args->u.splice.via_pipe = pipe;
1428 args->u.splice.via_flags = flags;
1430 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1431 cl_env_put(env, &refcheck);
/*
 * Ask the OST (via obd_create with OBD_FL_RECREATE_OBJS) to recreate a
 * destroyed stripe object of this inode on the given OST index.  Works on
 * a copy of the inode's stripe descriptor under the inode size lock.
 * NOTE(review): elided listing — allocation of 'oa' and some error
 * branches are not visible here.
 */
1435 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1438 struct obd_export *exp = ll_i2dtexp(inode);
1439 struct obd_trans_info oti = { 0 };
1440 struct obdo *oa = NULL;
1443 struct lov_stripe_md *lsm = NULL, *lsm2;
1450 lsm = ccc_inode_lsm_get(inode);
1451 if (!lsm_has_objects(lsm))
1452 GOTO(out, rc = -ENOENT);
1454 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1455 (lsm->lsm_stripe_count));
1457 OBD_ALLOC_LARGE(lsm2, lsm_size);
1459 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request. */
1462 oa->o_nlink = ost_idx;
1463 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1464 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1465 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1466 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1467 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1468 memcpy(lsm2, lsm, lsm_size);
1469 ll_inode_size_lock(inode);
1470 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1471 ll_inode_size_unlock(inode);
1473 OBD_FREE_LARGE(lsm2, lsm_size);
1476 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy the ll_recreate_obj request from
 * userspace and recreate the object by id.  Admin-only.
 */
1481 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1483 struct ll_recreate_obj ucreat;
1487 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1490 if (copy_from_user(&ucreat, (struct ll_recreate_obj __user *)arg,
/* Old-style (MDT0) object id: seq is implicit, only the id is given. */
1494 ostid_set_seq_mdt0(&oi);
1495 ostid_set_id(&oi, ucreat.lrc_id);
1496 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: recreate an object identified by FID.
 * The OST index is encoded in bits 16..31 of the FID sequence.
 * Admin-only.
 */
1499 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1506 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1509 if (copy_from_user(&fid, (struct lu_fid __user *)arg, sizeof(fid)))
1512 fid_to_ostid(&fid, &oi);
1513 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1514 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply a user-supplied striping EA (lov_user_md) to a file by re-opening
 * it through an IT_OPEN intent carrying the layout.  Fails with -EEXIST
 * if the file already has a stripe.
 * NOTE(review): elided listing — some error/cleanup lines are missing.
 */
1517 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1518 __u64 flags, struct lov_user_md *lum,
1521 struct lov_stripe_md *lsm = NULL;
1522 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1526 lsm = ccc_inode_lsm_get(inode);
1528 ccc_inode_lsm_put(inode, lsm);
1529 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1530 PFID(ll_inode2fid(inode)));
1531 GOTO(out, rc = -EEXIST);
1534 ll_inode_size_lock(inode);
1535 oit.it_flags |= MDS_OPEN_BY_FID;
1536 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1538 GOTO(out_unlock, rc);
1539 rc = oit.d.lustre.it_status;
1541 GOTO(out_req_free, rc);
/* Open succeeded only to set the layout; drop the handle right away. */
1543 ll_release_openhandle(file->f_dentry, &oit);
1546 ll_inode_size_unlock(inode);
1547 ll_intent_release(&oit);
1548 ccc_inode_lsm_put(inode, lsm);
1550 cl_lov_delay_create_clear(&file->f_flags);
1553 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA for 'filename' (child of 'inode') from the MDS via
 * md_getattr_name, validate its magic, and byte-swap it to host endian
 * for userspace if needed.  On success *lmmp/*lmm_size point into the
 * reply buffer, which the caller owns via *request.
 * NOTE(review): elided listing — some branches and the final out label
 * are not visible here.
 */
1557 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1558 struct lov_mds_md **lmmp, int *lmm_size,
1559 struct ptlrpc_request **request)
1561 struct ll_sb_info *sbi = ll_i2sbi(inode);
1562 struct mdt_body *body;
1563 struct lov_mds_md *lmm = NULL;
1564 struct ptlrpc_request *req = NULL;
1565 struct md_op_data *op_data;
1568 rc = ll_get_default_mdsize(sbi, &lmmsize);
1572 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1573 strlen(filename), lmmsize,
1574 LUSTRE_OPC_ANY, NULL);
1575 if (IS_ERR(op_data))
1576 RETURN(PTR_ERR(op_data));
1578 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1579 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1580 ll_finish_md_op_data(op_data);
1582 CDEBUG(D_INFO, "md_getattr_name failed "
1583 "on %s: rc %d\n", filename, rc);
1587 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1588 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1590 lmmsize = body->mbo_eadatasize;
1592 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1594 GOTO(out, rc = -ENODATA);
1597 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1598 LASSERT(lmm != NULL);
1600 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1601 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1602 GOTO(out, rc = -EPROTO);
1606 * This is coming from the MDS, so is probably in
1607 * little endian. We convert it to host endian before
1608 * passing it to userspace.
/* Swab only on big-endian hosts, where LOV_MAGIC != its LE encoding. */
1610 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1613 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1614 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1617 /* if function called for directory - we should
1618 * avoid swab not existent lsm objects */
1619 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1620 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1621 if (S_ISREG(body->mbo_mode))
1622 lustre_swab_lov_user_md_objects(
1623 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1625 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1626 lustre_swab_lov_user_md_v3(
1627 (struct lov_user_md_v3 *)lmm);
1628 if (S_ISREG(body->mbo_mode))
1629 lustre_swab_lov_user_md_objects(
1630 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1637 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one ost_data entry)
 * from userspace and apply it with MDS_OPEN_HAS_OBJS.  Admin-only.
 */
1642 static int ll_lov_setea(struct inode *inode, struct file *file,
1645 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1646 struct lov_user_md *lump;
1647 int lum_size = sizeof(struct lov_user_md) +
1648 sizeof(struct lov_user_ost_data);
1652 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1655 OBD_ALLOC_LARGE(lump, lum_size);
1659 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1660 OBD_FREE_LARGE(lump, lum_size);
1664 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1666 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it, and on
 * success refresh the layout and return the resulting stripe info back
 * to the caller's buffer.
 * NOTE(review): elided listing — the -EEXIST handling branch around the
 * put_user/GETSTRIPE block is not fully visible here.
 */
1670 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1673 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1674 struct lov_user_md *klum;
1676 __u64 flags = FMODE_WRITE;
1679 rc = ll_copy_user_md(lum, &klum);
1684 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1686 struct lov_stripe_md *lsm;
1689 put_user(0, &lum->lmm_stripe_count);
1691 ll_layout_refresh(inode, &gen);
1692 lsm = ccc_inode_lsm_get(inode);
1693 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1695 ccc_inode_lsm_put(inode, lsm);
1698 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_LOV_GETSTRIPE handler: hand the inode's stripe descriptor to
 * obd_iocontrol, which copies the layout to the user buffer.
 */
1702 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1704 struct lov_stripe_md *lsm;
1708 lsm = ccc_inode_lsm_get(inode);
1710 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1711 lsm, (void __user *)arg);
1712 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with gid
 * 'arg' on behalf of this open file.  Checks under lli_lock that no
 * group lock is already held, acquires it, then re-checks for a racing
 * thread before publishing it in the file descriptor.
 */
1717 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1719 struct ll_inode_info *lli = ll_i2info(inode);
1720 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1721 struct ccc_grouplock grouplock;
1725 if (ll_file_nolock(file))
1726 RETURN(-EOPNOTSUPP);
1728 spin_lock(&lli->lli_lock);
1729 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1730 CWARN("group lock already existed with gid %lu\n",
1731 fd->fd_grouplock.cg_gid);
1732 spin_unlock(&lli->lli_lock);
1735 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1736 spin_unlock(&lli->lli_lock);
/* Enqueue outside the spinlock: cl_get_grouplock may block. */
1738 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1739 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1743 spin_lock(&lli->lli_lock);
1744 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1745 spin_unlock(&lli->lli_lock);
1746 CERROR("another thread just won the race\n");
1747 cl_put_grouplock(&grouplock);
1751 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1752 fd->fd_grouplock = grouplock;
1753 spin_unlock(&lli->lli_lock);
1755 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with gid 'arg'
 * held by this open file.  Validates under lli_lock that a lock is held
 * and that the gid matches, clears the fd state, then drops the lock
 * outside the spinlock.
 */
1759 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1761 struct ll_inode_info *lli = ll_i2info(inode);
1762 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1763 struct ccc_grouplock grouplock;
1766 spin_lock(&lli->lli_lock);
1767 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1768 spin_unlock(&lli->lli_lock);
1769 CWARN("no group lock held\n");
1772 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1774 if (fd->fd_grouplock.cg_gid != arg) {
1775 CWARN("group lock %lu doesn't match current id %lu\n",
1776 arg, fd->fd_grouplock.cg_gid);
1777 spin_unlock(&lli->lli_lock);
/* Copy out and clear fd state first; release happens unlocked below. */
1781 grouplock = fd->fd_grouplock;
1782 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1783 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1784 spin_unlock(&lli->lli_lock);
1786 cl_put_grouplock(&grouplock);
1787 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1792 * Close inode open handle
1794 * \param dentry [in] dentry which contains the inode
1795 * \param it [in,out] intent which contains open info and result
1798 * \retval <0 failure
/*
 * Close the MDS open handle obtained through an open intent that is not
 * going to be kept (e.g. after a layout-setting open).  No-op for the
 * filesystem root or when the intent did not actually open anything.
 */
1800 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1802 struct inode *inode = dentry->d_inode;
1803 struct obd_client_handle *och;
1809 /* Root ? Do nothing. */
1810 if (dentry->d_inode->i_sb->s_root == dentry)
1813 /* No open handle to close? Move away */
1814 if (!it_disposition(it, DISP_OPEN_OPEN))
1817 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1819 OBD_ALLOC(och, sizeof(*och));
1821 GOTO(out, rc = -ENOMEM);
1823 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1825 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1828 /* this one is in place of ll_file_open */
1829 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1830 ptlrpc_req_finished(it->d.lustre.it_data);
1831 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1837 * Get size for inode for which FIEMAP mapping is requested.
1838 * Make the FIEMAP get_info call and returns the result.
/*
 * Run a FIEMAP request against the data export.  Rejects unsupported
 * flags (reporting the compat set back to the user), honours
 * FIEMAP_FLAG_SYNC by flushing dirty pages first, and short-circuits
 * zero-size files.
 */
1840 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1843 struct obd_export *exp = ll_i2dtexp(inode);
1844 struct lov_stripe_md *lsm = NULL;
1845 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1846 __u32 vallen = num_bytes;
1850 /* Checks for fiemap flags */
1851 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1852 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1856 /* Check for FIEMAP_FLAG_SYNC */
1857 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1858 rc = filemap_fdatawrite(inode->i_mapping);
1863 lsm = ccc_inode_lsm_get(inode);
1867 /* If the stripe_count > 1 and the application does not understand
1868 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1870 if (lsm->lsm_stripe_count > 1 &&
1871 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1872 GOTO(out, rc = -EOPNOTSUPP);
1874 fm_key.oa.o_oi = lsm->lsm_oi;
1875 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1877 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1878 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1879 /* If filesize is 0, then there would be no objects for mapping */
1880 if (fm_key.oa.o_size == 0) {
1881 fiemap->fm_mapped_extents = 0;
1885 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1887 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1890 CERROR("obd_get_info failed: rc = %d\n", rc);
1893 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: translate a FID to a path via the MDC.
 * Reads gf_pathlen from the user struct to size a bounce buffer, caps it
 * at PATH_MAX, round-trips the getinfo_fid2path struct through
 * obd_iocontrol, and copies the result back.
 */
1897 int ll_fid2path(struct inode *inode, void __user *arg)
1899 struct obd_export *exp = ll_i2mdexp(inode);
1900 const struct getinfo_fid2path __user *gfin = arg;
1902 struct getinfo_fid2path *gfout;
/* Permission: CAP_DAC_READ_SEARCH, or the user_fid2path mount option. */
1908 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1909 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1912 /* Only need to get the buflen */
1913 if (get_user(pathlen, &gfin->gf_pathlen))
1916 if (pathlen > PATH_MAX)
1919 outsize = sizeof(*gfout) + pathlen;
1920 OBD_ALLOC(gfout, outsize);
1924 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1925 GOTO(gf_free, rc = -EFAULT);
1927 /* Call mdc_iocontrol */
1928 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1932 if (copy_to_user(arg, gfout, outsize))
1936 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size and allocate a kernel fiemap buffer
 * from the user's fm_extent_count (with overflow check), copy the
 * request (plus the first extent, used as a continuation cookie), run
 * ll_do_fiemap(), and copy header plus mapped extents back.
 */
1940 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1942 struct ll_user_fiemap *fiemap_s;
1943 size_t num_bytes, ret_bytes;
1944 unsigned int extent_count;
1947 /* Get the extent count so we can calculate the size of
1948 * required fiemap buffer */
1949 if (get_user(extent_count,
1950 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* Guard against extent_count * sizeof(extent) overflowing size_t. */
1954 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1956 num_bytes = sizeof(*fiemap_s) + (extent_count *
1957 sizeof(struct ll_fiemap_extent));
1959 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1960 if (fiemap_s == NULL)
1963 /* get the fiemap value */
1964 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1966 GOTO(error, rc = -EFAULT);
1968 /* If fm_extent_count is non-zero, read the first extent since
1969 * it is used to calculate end_offset and device from previous
1972 if (copy_from_user(&fiemap_s->fm_extents[0],
1973 (char __user *)arg + sizeof(*fiemap_s),
1974 sizeof(struct ll_fiemap_extent)))
1975 GOTO(error, rc = -EFAULT);
1978 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1982 ret_bytes = sizeof(struct ll_user_fiemap);
1984 if (extent_count != 0)
1985 ret_bytes += (fiemap_s->fm_mapped_extents *
1986 sizeof(struct ll_fiemap_extent));
1988 if (copy_to_user((void __user *)arg, fiemap_s, ret_bytes))
1992 OBD_FREE_LARGE(fiemap_s, num_bytes);
1997 * Read the data_version for inode.
1999 * This value is computed using stripe object version on OST.
2000 * Version is computed using server side locking.
2002 * @param sync if do sync on the OST side;
2004 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2005 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Returns 0 with *data_version set; a stripe-less file reports version 0.
 */
2007 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2009 struct lov_stripe_md *lsm = NULL;
2010 struct ll_sb_info *sbi = ll_i2sbi(inode);
2011 struct obdo *obdo = NULL;
2015 /* If no stripe, we consider version is 0. */
2016 lsm = ccc_inode_lsm_get(inode);
2017 if (!lsm_has_objects(lsm)) {
2019 CDEBUG(D_INODE, "No object for inode\n");
2023 OBD_ALLOC_PTR(obdo);
2025 GOTO(out, rc = -ENOMEM);
2027 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
2029 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2032 *data_version = obdo->o_data_version;
2038 ccc_inode_lsm_put(inode, lsm);
2043 * Trigger a HSM release request for the provided inode.
/*
 * Takes a write lease (MDS_OPEN_RELEASE), flushes and records the latest
 * data_version, merges size/time attributes, then closes the handle with
 * the release request.  The lease is closed explicitly only on the error
 * path; the normal close consumes it.
 */
2045 int ll_hsm_release(struct inode *inode)
2047 struct cl_env_nest nest;
2049 struct obd_client_handle *och = NULL;
2050 __u64 data_version = 0;
2054 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2055 ll_get_fsname(inode->i_sb, NULL, 0),
2056 PFID(&ll_i2info(inode)->lli_fid));
2058 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2060 GOTO(out, rc = PTR_ERR(och));
2062 /* Grab latest data_version and [am]time values */
2063 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2067 env = cl_env_nested_get(&nest);
2069 GOTO(out, rc = PTR_ERR(env));
2071 ll_merge_lvb(env, inode);
2072 cl_env_nested_put(&nest, env);
2074 /* Release the file.
2075 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2076 * we still need it to pack l_remote_handle to MDT. */
2077 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2083 if (och != NULL && !IS_ERR(och)) /* close the file */
2084 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): both inodes, their saved
 * [am]times (for SWAP_LAYOUTS_KEEP_*), and per-side data-version
 * check flags.  Heap-allocated to keep it off the kernel stack.
 */
2089 struct ll_swap_stack {
2090 struct iattr ia1, ia2;
2092 struct inode *inode1, *inode2;
2093 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS worker: atomically exchange the layouts of two
 * regular files on the same filesystem.  Orders the pair by FID to avoid
 * deadlock, optionally group-locks both to flush dirty cache, verifies
 * requested data versions, sends the swap through the MDT, and restores
 * mtime/atime if the user asked to keep them.
 * NOTE(review): elided listing — some declarations, labels and the
 * final free path are not visible here.
 */
2096 static int ll_swap_layouts(struct file *file1, struct file *file2,
2097 struct lustre_swap_layouts *lsl)
2099 struct mdc_swap_layouts msl;
2100 struct md_op_data *op_data;
2103 struct ll_swap_stack *llss = NULL;
2106 OBD_ALLOC_PTR(llss);
2110 llss->inode1 = file1->f_dentry->d_inode;
2111 llss->inode2 = file2->f_dentry->d_inode;
2113 if (!S_ISREG(llss->inode2->i_mode))
2114 GOTO(free, rc = -EINVAL);
2116 if (inode_permission(llss->inode1, MAY_WRITE) ||
2117 inode_permission(llss->inode2, MAY_WRITE))
2118 GOTO(free, rc = -EPERM);
2120 if (llss->inode2->i_sb != llss->inode1->i_sb)
2121 GOTO(free, rc = -EXDEV);
2123 /* we use 2 bool because it is easier to swap than 2 bits */
2124 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2125 llss->check_dv1 = true;
2127 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2128 llss->check_dv2 = true;
2130 /* we cannot use lsl->sl_dvX directly because we may swap them */
2131 llss->dv1 = lsl->sl_dv1;
2132 llss->dv2 = lsl->sl_dv2;
2134 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2135 if (rc == 0) /* same file, done! */
/* Always operate in ascending FID order so concurrent swaps can't
 * deadlock on each other's locks. */
2138 if (rc < 0) { /* sequentialize it */
2139 swap(llss->inode1, llss->inode2);
2141 swap(llss->dv1, llss->dv2);
2142 swap(llss->check_dv1, llss->check_dv2);
2146 if (gid != 0) { /* application asks to flush dirty cache */
2147 rc = ll_get_grouplock(llss->inode1, file1, gid);
2151 rc = ll_get_grouplock(llss->inode2, file2, gid);
2153 ll_put_grouplock(llss->inode1, file1, gid);
2158 /* to be able to restore mtime and atime after swap
2159 * we need to first save them */
2161 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2162 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2163 llss->ia1.ia_atime = llss->inode1->i_atime;
2164 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2165 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2166 llss->ia2.ia_atime = llss->inode2->i_atime;
2167 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2170 /* ultimate check, before swaping the layouts we check if
2171 * dataversion has changed (if requested) */
2172 if (llss->check_dv1) {
2173 rc = ll_data_version(llss->inode1, &dv, 0);
2176 if (dv != llss->dv1)
2177 GOTO(putgl, rc = -EAGAIN);
2180 if (llss->check_dv2) {
2181 rc = ll_data_version(llss->inode2, &dv, 0);
2184 if (dv != llss->dv2)
2185 GOTO(putgl, rc = -EAGAIN);
2188 /* struct md_op_data is used to send the swap args to the mdt
2189 * only flags is missing, so we use struct mdc_swap_layouts
2190 * through the md_op_data->op_data */
2191 /* flags from user space have to be converted before they are send to
2192 * server, no flag is sent today, they are only used on the client */
2195 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2196 0, LUSTRE_OPC_ANY, &msl);
2197 if (IS_ERR(op_data))
2198 GOTO(free, rc = PTR_ERR(op_data));
2200 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2201 sizeof(*op_data), op_data, NULL);
2202 ll_finish_md_op_data(op_data);
2206 ll_put_grouplock(llss->inode2, file2, gid);
2207 ll_put_grouplock(llss->inode1, file1, gid);
2210 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2214 /* clear useless flags */
2215 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2216 llss->ia1.ia_valid &= ~ATTR_MTIME;
2217 llss->ia2.ia_valid &= ~ATTR_MTIME;
2220 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2221 llss->ia1.ia_valid &= ~ATTR_ATIME;
2222 llss->ia2.ia_valid &= ~ATTR_ATIME;
2225 /* update time if requested */
/* The layouts swapped, so ia2's saved times now belong to file1 and
 * vice versa. */
2227 if (llss->ia2.ia_valid != 0) {
2228 mutex_lock(&llss->inode1->i_mutex);
2229 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2230 mutex_unlock(&llss->inode1->i_mutex);
2233 if (llss->ia1.ia_valid != 0) {
2236 mutex_lock(&llss->inode2->i_mutex);
2237 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2238 mutex_unlock(&llss->inode2->i_mutex);
/*
 * LL_IOC_HSM_STATE_SET worker: forward an hsm_state_set request to the
 * MDT.  Flags outside HSM_USER_MASK require CAP_SYS_ADMIN.
 */
2250 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2252 struct md_op_data *op_data;
2255 /* Non-root users are forbidden to set or clear flags which are
2256 * NOT defined in HSM_USER_MASK. */
2257 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2258 !cfs_capable(CFS_CAP_SYS_ADMIN))
2261 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2262 LUSTRE_OPC_ANY, hss);
2263 if (IS_ERR(op_data))
2264 RETURN(PTR_ERR(op_data));
2266 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2267 sizeof(*op_data), op_data, NULL);
2269 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT worker: register a pre-existing archived copy as the
 * file's content.  Marks the file HS_ARCHIVED|HS_EXISTS|HS_RELEASED,
 * then forces the user-supplied mode/owner/size/times onto the inode via
 * ll_setattr_raw() under i_mutex.
 * NOTE(review): elided listing — the hss allocation and out/free paths
 * are not visible here.
 */
2274 static int ll_hsm_import(struct inode *inode, struct file *file,
2275 struct hsm_user_import *hui)
2277 struct hsm_state_set *hss = NULL;
2278 struct iattr *attr = NULL;
2282 if (!S_ISREG(inode->i_mode))
2288 GOTO(out, rc = -ENOMEM);
2290 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2291 hss->hss_archive_id = hui->hui_archive_id;
2292 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2293 rc = ll_hsm_state_set(inode, hss);
2297 OBD_ALLOC_PTR(attr);
2299 GOTO(out, rc = -ENOMEM);
/* Only permission bits are taken from the user; type is forced to
 * regular file. */
2301 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2302 attr->ia_mode |= S_IFREG;
2303 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2304 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2305 attr->ia_size = hui->hui_size;
2306 attr->ia_mtime.tv_sec = hui->hui_mtime;
2307 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2308 attr->ia_atime.tv_sec = hui->hui_atime;
2309 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2311 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2312 ATTR_UID | ATTR_GID |
2313 ATTR_MTIME | ATTR_MTIME_SET |
2314 ATTR_ATIME | ATTR_ATIME_SET;
2316 mutex_lock(&inode->i_mutex);
2318 rc = ll_setattr_raw(file->f_dentry, attr, true);
2322 mutex_unlock(&inode->i_mutex);
/* Map an open fmode to the LL_LEASE_{RD,WR}LCK bitmask returned to
 * userspace by the lease ioctls. */
2334 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2336 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2337 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * unlocked_ioctl handler for regular files: dispatches the full set of
 * Lustre file ioctls (flags, striping, swap-layouts, group locks,
 * fiemap, fid2path, data-version, HSM, leases, ...), falling through to
 * registered ioctl hooks and finally the data export.
 * NOTE(review): elided listing — many case labels, locals and RETURN
 * lines of the original switch are not visible here.
 */
2341 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2343 struct inode *inode = file->f_dentry->d_inode;
2344 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2348 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2349 PFID(ll_inode2fid(inode)), inode, cmd);
2350 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2352 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2353 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2357 case LL_IOC_GETFLAGS:
2358 /* Get the current value of the file flags */
2359 return put_user(fd->fd_flags, (int __user *)arg);
2360 case LL_IOC_SETFLAGS:
2361 case LL_IOC_CLRFLAGS:
2362 /* Set or clear specific file flags */
2363 /* XXX This probably needs checks to ensure the flags are
2364 * not abused, and to handle any flag side effects.
2366 if (get_user(flags, (int __user *) arg))
2369 if (cmd == LL_IOC_SETFLAGS) {
2370 if ((flags & LL_FILE_IGNORE_LOCK) &&
2371 !(file->f_flags & O_DIRECT)) {
2372 CERROR("%s: unable to disable locking on "
2373 "non-O_DIRECT file\n", current->comm);
2377 fd->fd_flags |= flags;
2379 fd->fd_flags &= ~flags;
2382 case LL_IOC_LOV_SETSTRIPE:
2383 RETURN(ll_lov_setstripe(inode, file, arg));
2384 case LL_IOC_LOV_SETEA:
2385 RETURN(ll_lov_setea(inode, file, arg));
2386 case LL_IOC_LOV_SWAP_LAYOUTS: {
2388 struct lustre_swap_layouts lsl;
2390 if (copy_from_user(&lsl, (char __user *)arg,
2391 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable for a layout swap. */
2394 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2397 file2 = fget(lsl.sl_fd);
2402 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2403 rc = ll_swap_layouts(file, file2, &lsl);
2407 case LL_IOC_LOV_GETSTRIPE:
2408 RETURN(ll_lov_getstripe(inode, arg));
2409 case LL_IOC_RECREATE_OBJ:
2410 RETURN(ll_lov_recreate_obj(inode, arg));
2411 case LL_IOC_RECREATE_FID:
2412 RETURN(ll_lov_recreate_fid(inode, arg));
2413 case FSFILT_IOC_FIEMAP:
2414 RETURN(ll_ioctl_fiemap(inode, arg));
2415 case FSFILT_IOC_GETFLAGS:
2416 case FSFILT_IOC_SETFLAGS:
2417 RETURN(ll_iocontrol(inode, file, cmd, arg));
2418 case FSFILT_IOC_GETVERSION_OLD:
2419 case FSFILT_IOC_GETVERSION:
2420 RETURN(put_user(inode->i_generation, (int __user *)arg));
2421 case LL_IOC_GROUP_LOCK:
2422 RETURN(ll_get_grouplock(inode, file, arg));
2423 case LL_IOC_GROUP_UNLOCK:
2424 RETURN(ll_put_grouplock(inode, file, arg));
2425 case IOC_OBD_STATFS:
2426 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2428 /* We need to special case any other ioctls we want to handle,
2429 * to send them to the MDS/OST as appropriate and to properly
2430 * network encode the arg field.
2431 case FSFILT_IOC_SETVERSION_OLD:
2432 case FSFILT_IOC_SETVERSION:
2434 case LL_IOC_FLUSHCTX:
2435 RETURN(ll_flush_ctx(inode));
2436 case LL_IOC_PATH2FID: {
2437 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2438 sizeof(struct lu_fid)))
2443 case LL_IOC_GETPARENT:
2444 RETURN(ll_getparent(file, (void __user *)arg));
2446 case OBD_IOC_FID2PATH:
2447 RETURN(ll_fid2path(inode, (void __user *)arg));
2448 case LL_IOC_DATA_VERSION: {
2449 struct ioc_data_version idv;
2452 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the two flush flags are meaningful from userspace. */
2455 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2456 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2459 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2465 case LL_IOC_GET_MDTIDX: {
2468 mdtidx = ll_get_mdt_idx(inode);
2472 if (put_user((int)mdtidx, (int __user *)arg))
2477 case OBD_IOC_GETDTNAME:
2478 case OBD_IOC_GETMDNAME:
2479 RETURN(ll_get_obd_name(inode, cmd, arg));
2480 case LL_IOC_HSM_STATE_GET: {
2481 struct md_op_data *op_data;
2482 struct hsm_user_state *hus;
2489 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2490 LUSTRE_OPC_ANY, hus);
2491 if (IS_ERR(op_data)) {
2493 RETURN(PTR_ERR(op_data));
2496 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2499 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2502 ll_finish_md_op_data(op_data);
2506 case LL_IOC_HSM_STATE_SET: {
2507 struct hsm_state_set *hss;
2514 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2519 rc = ll_hsm_state_set(inode, hss);
2524 case LL_IOC_HSM_ACTION: {
2525 struct md_op_data *op_data;
2526 struct hsm_current_action *hca;
2533 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2534 LUSTRE_OPC_ANY, hca);
2535 if (IS_ERR(op_data)) {
2537 RETURN(PTR_ERR(op_data));
2540 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2543 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2546 ll_finish_md_op_data(op_data);
2550 case LL_IOC_SET_LEASE: {
2551 struct ll_inode_info *lli = ll_i2info(inode);
2552 struct obd_client_handle *och = NULL;
2557 case LL_LEASE_WRLCK:
2558 if (!(file->f_mode & FMODE_WRITE))
2560 fmode = FMODE_WRITE;
2562 case LL_LEASE_RDLCK:
2563 if (!(file->f_mode & FMODE_READ))
2567 case LL_LEASE_UNLCK:
/* Detach the lease handle from the fd under the och mutex, then
 * close it outside. */
2568 mutex_lock(&lli->lli_och_mutex);
2569 if (fd->fd_lease_och != NULL) {
2570 och = fd->fd_lease_och;
2571 fd->fd_lease_och = NULL;
2573 mutex_unlock(&lli->lli_och_mutex);
2578 fmode = och->och_flags;
2579 rc = ll_lease_close(och, inode, &lease_broken);
2586 RETURN(ll_lease_type_from_fmode(fmode));
2591 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2593 /* apply for lease */
2594 och = ll_lease_open(inode, file, fmode, 0);
2596 RETURN(PTR_ERR(och));
2599 mutex_lock(&lli->lli_och_mutex);
2600 if (fd->fd_lease_och == NULL) {
2601 fd->fd_lease_och = och;
2604 mutex_unlock(&lli->lli_och_mutex);
2606 /* impossible now that only excl is supported for now */
2607 ll_lease_close(och, inode, &lease_broken);
2612 case LL_IOC_GET_LEASE: {
2613 struct ll_inode_info *lli = ll_i2info(inode);
2614 struct ldlm_lock *lock = NULL;
2617 mutex_lock(&lli->lli_och_mutex);
2618 if (fd->fd_lease_och != NULL) {
2619 struct obd_client_handle *och = fd->fd_lease_och;
2621 lock = ldlm_handle2lock(&och->och_lease_handle);
2623 lock_res_and_lock(lock);
/* A cancelled lease lock reports no lease mode. */
2624 if (!ldlm_is_cancel(lock))
2625 fmode = och->och_flags;
2627 unlock_res_and_lock(lock);
2628 LDLM_LOCK_PUT(lock);
2631 mutex_unlock(&lli->lli_och_mutex);
2633 RETURN(ll_lease_type_from_fmode(fmode));
2635 case LL_IOC_HSM_IMPORT: {
2636 struct hsm_user_import *hui;
2642 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2647 rc = ll_hsm_import(inode, file, hui);
2657 ll_iocontrol_call(inode, file, cmd, arg, &err))
2660 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2661 (void __user *)arg));
2666 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Compat helper (kernels without generic_file_llseek_size): validate the
 * new offset against [0, maxsize] and commit it to f_pos, resetting
 * f_version when the position actually changes.
 */
2667 static inline loff_t
2668 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2670 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2672 if (offset > maxsize)
2675 if (offset != file->f_pos) {
2676 file->f_pos = offset;
2677 file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size(): handles
 * SEEK_SET/CUR/END plus SEEK_DATA/SEEK_HOLE against a caller-supplied
 * maxsize and eof.
 * NOTE(review): elided listing — the switch labels and several branch
 * bodies of the original are not visible here.
 */
2683 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2684 loff_t maxsize, loff_t eof)
2686 struct inode *inode = file->f_dentry->d_inode;
2694 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2695 * position-querying operation. Avoid rewriting the "same"
2696 * f_pos value back to the file because a concurrent read(),
2697 * write() or lseek() might have altered it
2702 * f_lock protects against read/modify/write race with other
2703 * SEEK_CURs. Note that parallel writes and reads behave
2706 mutex_lock(&inode->i_mutex);
2707 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2708 mutex_unlock(&inode->i_mutex);
2712 * In the generic case the entire file is data, so as long as
2713 * offset isn't at the end of the file then the offset is data.
2720 * There is a virtual hole at the end of the file, so as long as
2721 * offset isn't i_size or larger, return i_size.
2729 return llseek_execute(file, offset, maxsize);
/*
 * llseek handler for Lustre files.  For size-relative origins
 * (SEEK_END/SEEK_HOLE/SEEK_DATA) the file size is refreshed from the
 * OSTs with ll_glimpse_size() before delegating to
 * ll_generic_file_llseek_size().
 */
2733 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2735 struct inode *inode = file->f_dentry->d_inode;
2736 loff_t retval, eof = 0;
/* retval here is only a provisional target used for the debug trace */
2739 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2740 (origin == SEEK_CUR) ? file->f_pos : 0);
2741 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2742 PFID(ll_inode2fid(inode)), inode, retval, retval,
2744 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* size-relative seek: fetch an up-to-date size from the OSTs first */
2746 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2747 retval = ll_glimpse_size(inode);
2750 eof = i_size_read(inode);
2753 retval = ll_generic_file_llseek_size(file, offset, origin,
2754 ll_file_maxbytes(inode), eof);
/*
 * ->flush() handler: surface any asynchronous writeback error recorded
 * against this inode to close(), returning -EIO once per fd (errors
 * already reported through write are not reported again).
 */
2758 static int ll_flush(struct file *file, fl_owner_t id)
2760 struct inode *inode = file->f_dentry->d_inode;
2761 struct ll_inode_info *lli = ll_i2info(inode);
2762 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2765 LASSERT(!S_ISDIR(inode->i_mode));
2767 /* catch async errors that were recorded back when async writeback
2768 * failed for pages in this mapping. */
2769 rc = lli->lli_async_rc;
2770 lli->lli_async_rc = 0;
2771 if (lli->lli_clob != NULL) {
2772 err = lov_read_and_clear_async_rc(lli->lli_clob);
2777 /* The application has been told write failure already.
2778 * Do not report failure again. */
2779 if (fd->fd_write_failed)
2781 return rc ? -EIO : 0;
2785 * Called to make sure a portion of file has been written out.
2786 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2788 * Return how many pages have been written.
2790 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2791 enum cl_fsync_mode mode, int ignore_layout)
2793 struct cl_env_nest nest;
2796 struct obd_capa *capa = NULL;
2797 struct cl_fsync_io *fio;
/* reject any mode outside the known cl_fsync_mode set */
2801 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2802 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2805 env = cl_env_nested_get(&nest);
2807 RETURN(PTR_ERR(env));
2809 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2811 io = ccc_env_thread_io(env);
2812 io->ci_obj = cl_i2info(inode)->lli_clob;
2813 io->ci_ignore_layout = ignore_layout;
2815 /* initialize parameters for sync */
2816 fio = &io->u.ci_fsync;
2817 fio->fi_capa = capa;
2818 fio->fi_start = start;
2820 fio->fi_fid = ll_inode2fid(inode);
2821 fio->fi_mode = mode;
2822 fio->fi_nr_written = 0;
/* run the CIT_FSYNC io loop; fi_nr_written accumulates written pages */
2824 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2825 result = cl_io_loop(env, io);
2827 result = io->ci_result;
2829 result = fio->fi_nr_written;
2830 cl_io_fini(env, io);
2831 cl_env_nested_put(&nest, env);
2839 * When dentry is provided (the 'else' case), *file->f_dentry may be
2840 * null and dentry must be used directly rather than pulled from
2841 * *file->f_dentry as is done otherwise.
2844 #ifdef HAVE_FILE_FSYNC_4ARGS
2845 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2847 struct dentry *dentry = file->f_dentry;
2848 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2849 int ll_fsync(struct file *file, int datasync)
2851 struct dentry *dentry = file->f_dentry;
2853 loff_t end = LLONG_MAX;
2855 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2858 loff_t end = LLONG_MAX;
2860 struct inode *inode = dentry->d_inode;
2861 struct ll_inode_info *lli = ll_i2info(inode);
2862 struct ptlrpc_request *req;
2863 struct obd_capa *oc;
2867 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2868 PFID(ll_inode2fid(inode)), inode);
2869 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
/* 4-arg variant must flush the byte range itself, under i_mutex */
2871 #ifdef HAVE_FILE_FSYNC_4ARGS
2872 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2873 mutex_lock(&inode->i_mutex);
2875 /* fsync's caller has already called _fdata{sync,write}, we want
2876 * that IO to finish before calling the osc and mdc sync methods */
2877 rc = filemap_fdatawait(inode->i_mapping);
2880 /* catch async errors that were recorded back when async writeback
2881 * failed for pages in this mapping. */
2882 if (!S_ISDIR(inode->i_mode)) {
2883 err = lli->lli_async_rc;
2884 lli->lli_async_rc = 0;
2887 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDS */
2892 oc = ll_mdscapa_get(inode);
2893 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2899 ptlrpc_req_finished(req);
/* sync data to the OSTs; record per-fd failure state for ll_flush() */
2901 if (S_ISREG(inode->i_mode)) {
2902 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2904 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2905 if (rc == 0 && err < 0)
2908 fd->fd_write_failed = true;
2910 fd->fd_write_failed = false;
2913 #ifdef HAVE_FILE_FSYNC_4ARGS
2914 mutex_unlock(&inode->i_mutex);
/*
 * fcntl()/flock() lock handler: translate the VFS file_lock into an
 * LDLM flock enqueue on the MDS, then mirror the granted result into
 * the local kernel lock lists so the VFS sees a consistent state.
 */
2920 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2922 struct inode *inode = file->f_dentry->d_inode;
2923 struct ll_sb_info *sbi = ll_i2sbi(inode);
2924 struct ldlm_enqueue_info einfo = {
2925 .ei_type = LDLM_FLOCK,
2926 .ei_cb_cp = ldlm_flock_completion_ast,
2927 .ei_cbdata = file_lock,
2929 struct md_op_data *op_data;
2930 struct lustre_handle lockh = {0};
2931 ldlm_policy_data_t flock = {{0}};
2932 int fl_type = file_lock->fl_type;
2938 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2939 PFID(ll_inode2fid(inode)), file_lock);
2941 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2943 if (file_lock->fl_flags & FL_FLOCK) {
2944 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2945 /* flocks are whole-file locks */
2946 flock.l_flock.end = OFFSET_MAX;
2947 /* For flocks owner is determined by the local file descriptor*/
2948 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2949 } else if (file_lock->fl_flags & FL_POSIX) {
2950 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2951 flock.l_flock.start = file_lock->fl_start;
2952 flock.l_flock.end = file_lock->fl_end;
2956 flock.l_flock.pid = file_lock->fl_pid;
2958 /* Somewhat ugly workaround for svc lockd.
2959 * lockd installs custom fl_lmops->lm_compare_owner that checks
2960 * for the fl_owner to be the same (which it always is on local node
2961 * I guess between lockd processes) and then compares pid.
2962 * As such we assign pid to the owner field to make it all work,
2963 * conflict with normal locks is unlikely since pid space and
2964 * pointer space for current->files are not intersecting */
2965 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2966 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map the fcntl lock type onto an LDLM lock mode */
2970 einfo.ei_mode = LCK_PR;
2973 /* An unlock request may or may not have any relation to
2974 * existing locks so we may not be able to pass a lock handle
2975 * via a normal ldlm_lock_cancel() request. The request may even
2976 * unlock a byte range in the middle of an existing lock. In
2977 * order to process an unlock request we need all of the same
2978 * information that is given with a normal read or write record
2979 * lock request. To avoid creating another ldlm unlock (cancel)
2980 * message we'll treat a LCK_NL flock request as an unlock. */
2981 einfo.ei_mode = LCK_NL;
2984 einfo.ei_mode = LCK_PW;
2987 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3002 flags = LDLM_FL_BLOCK_NOWAIT;
3008 flags = LDLM_FL_TEST_LOCK;
3011 CERROR("unknown fcntl lock command: %d\n", cmd);
3015 /* Save the old mode so that if the mode in the lock changes we
3016 * can decrement the appropriate reader or writer refcount. */
3017 file_lock->fl_type = einfo.ei_mode;
3019 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3020 LUSTRE_OPC_ANY, NULL);
3021 if (IS_ERR(op_data))
3022 RETURN(PTR_ERR(op_data));
3024 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3025 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3026 flock.l_flock.pid, flags, einfo.ei_mode,
3027 flock.l_flock.start, flock.l_flock.end);
/* enqueue the flock on the MDS */
3029 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3032 /* Restore the file lock type if not TEST lock. */
3033 if (!(flags & LDLM_FL_TEST_LOCK))
3034 file_lock->fl_type = fl_type;
/* mirror a granted (or unlock) result into the local lock lists */
3036 if ((file_lock->fl_flags & FL_FLOCK) &&
3037 (rc == 0 || file_lock->fl_type == F_UNLCK))
3038 rc2 = flock_lock_file_wait(file, file_lock);
3039 if ((file_lock->fl_flags & FL_POSIX) &&
3040 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3041 !(flags & LDLM_FL_TEST_LOCK))
3042 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed after the server granted: release the
 * server-side lock again by enqueueing an LCK_NL (unlock) request */
3044 if (rc2 && file_lock->fl_type != F_UNLCK) {
3045 einfo.ei_mode = LCK_NL;
3046 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3051 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of entry @name (length @namelen) under @parent with
 * an MDS getattr-by-name RPC.  On success *fid is filled from the
 * reply body.
 */
3056 int ll_get_fid_by_name(struct inode *parent, const char *name,
3057 int namelen, struct lu_fid *fid)
3059 struct md_op_data *op_data = NULL;
3060 struct mdt_body *body;
3061 struct ptlrpc_request *req;
3065 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3066 LUSTRE_OPC_ANY, NULL);
3067 if (IS_ERR(op_data))
3068 RETURN(PTR_ERR(op_data));
/* only the FID is needed from the getattr reply */
3070 op_data->op_valid = OBD_MD_FLID;
3071 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3072 ll_finish_md_op_data(op_data);
3076 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3078 GOTO(out_req, rc = -EFAULT);
3080 *fid = body->mbo_fid1;
3082 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx using a
 * CLI_MIGRATE rename RPC.  The child FID is taken from a cached dentry
 * when available, otherwise fetched via ll_get_fid_by_name().
 */
3086 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3087 const char *name, int namelen)
3089 struct dentry *dchild = NULL;
3090 struct inode *child_inode = NULL;
3091 struct md_op_data *op_data;
3092 struct ptlrpc_request *request = NULL;
3097 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3098 name, PFID(ll_inode2fid(parent)), mdtidx);
3100 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3101 0, LUSTRE_OPC_ANY, NULL);
3102 if (IS_ERR(op_data))
3103 RETURN(PTR_ERR(op_data));
3105 /* Get child FID first */
3106 qstr.hash = full_name_hash(name, namelen);
3109 dchild = d_lookup(file->f_dentry, &qstr);
3110 if (dchild != NULL && dchild->d_inode != NULL) {
3111 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
/* NOTE(review): d_inode was already checked non-NULL just above,
 * so this inner test looks redundant — confirm against full source */
3112 if (dchild->d_inode != NULL) {
3113 child_inode = igrab(dchild->d_inode);
3114 ll_invalidate_aliases(child_inode);
/* no usable dentry: ask the MDS for the FID */
3118 rc = ll_get_fid_by_name(parent, name, namelen,
3124 if (!fid_is_sane(&op_data->op_fid3)) {
3125 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3126 ll_get_fsname(parent->i_sb, NULL, 0), name,
3127 PFID(&op_data->op_fid3));
3128 GOTO(out_free, rc = -EINVAL);
/* skip the migration when the entry already lives on the target MDT */
3131 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3136 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3137 PFID(&op_data->op_fid3), mdtidx);
3138 GOTO(out_free, rc = 0);
3141 op_data->op_mds = mdtidx;
3142 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is implemented as a same-name rename to the target MDT */
3143 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3144 namelen, name, namelen, &request);
3146 ll_update_times(request, parent);
3148 ptlrpc_req_finished(request);
3153 if (child_inode != NULL) {
3154 clear_nlink(child_inode);
3158 ll_finish_md_op_data(op_data);
/* flock/fcntl handler installed by the -o noflock fops table below */
3163 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3171 * test if some locks matching bits and l_req_mode are acquired
3172 * - bits can be in different locks
3173 * - if found clear the common lock bits in *bits
3174 * - the bits not found, are kept in *bits
3176 * \param bits [IN] searched lock bits
3177 * \param l_req_mode [IN] searched lock mode
3178 * \retval boolean, true iff all bits are found
3180 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3182 struct lustre_handle lockh;
3183 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four modes */
3184 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3185 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3194 fid = &ll_i2info(inode)->lli_fid;
3195 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3196 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, do not take references on matched locks */
3198 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3199 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3200 policy.l_inodebits.bits = *bits & (1 << i);
3201 if (policy.l_inodebits.bits == 0)
3204 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3205 &policy, mode, &lockh)) {
3206 struct ldlm_lock *lock;
3208 lock = ldlm_handle2lock(&lockh);
3211 ~(lock->l_policy_data.l_inodebits.bits);
3212 LDLM_LOCK_PUT(lock);
3214 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match an already-granted MDS inodebits lock covering @bits.
 * Returns the matched lock mode (caller owns a reference via @lockh)
 * or 0 if no matching lock is cached.
 */
3221 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3222 struct lustre_handle *lockh, __u64 flags,
3225 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3230 fid = &ll_i2info(inode)->lli_fid;
3231 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3233 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3234 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process a revalidation RPC result: -ENOENT means the object was
 * already unlinked and is tolerated for non-regular/non-directory
 * inodes; other errors are logged (quietly for -EACCES/-EIDRM).
 */
3239 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3241 /* Already unlinked. Just update nlink and return success */
3242 if (rc == -ENOENT) {
3244 /* This path cannot be hit for regular files unless in
3245 * case of obscure races, so no need to validate
3247 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3249 } else if (rc != 0) {
3250 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3251 "%s: revalidate FID "DFID" error: rc = %d\n",
3252 ll_get_fsname(inode->i_sb, NULL, 0),
3253 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate inode attributes from the MDS.  Two paths:
 * - OBD_CONNECT_ATTRFID exporters: an IT_GETATTR/IT_LOOKUP intent lock
 *   (also refreshes the dentry and drops it if the file was unlinked);
 * - otherwise, a plain md_getattr() when no matching ibits lock is
 *   already cached locally.
 */
3259 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3261 struct inode *inode = dentry->d_inode;
3262 struct ptlrpc_request *req = NULL;
3263 struct obd_export *exp;
3267 LASSERT(inode != NULL);
3269 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3270 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3272 exp = ll_i2mdexp(inode);
3274 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3275 * But under CMD case, it caused some lock issues, should be fixed
3276 * with new CMD ibits lock. See bug 12718 */
3277 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3278 struct lookup_intent oit = { .it_op = IT_GETATTR };
3279 struct md_op_data *op_data;
3281 if (ibits == MDS_INODELOCK_LOOKUP)
3282 oit.it_op = IT_LOOKUP;
3284 /* Call getattr by fid, so do not provide name at all. */
3285 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3286 dentry->d_inode, NULL, 0, 0,
3287 LUSTRE_OPC_ANY, NULL);
3288 if (IS_ERR(op_data))
3289 RETURN(PTR_ERR(op_data));
3291 rc = md_intent_lock(exp, op_data, &oit, &req,
3292 &ll_md_blocking_ast, 0);
3293 ll_finish_md_op_data(op_data);
3295 rc = ll_inode_revalidate_fini(inode, rc);
3299 rc = ll_revalidate_it_finish(req, &oit, dentry);
3301 ll_intent_release(&oit);
3305 /* Unlinked? Unhash dentry, so it is not picked up later by
3306 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3307 here to preserve get_cwd functionality on 2.6.
3309 if (!dentry->d_inode->i_nlink)
3310 d_lustre_invalidate(dentry, 0);
3312 ll_lookup_finish_locks(&oit, dentry);
3313 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3314 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3315 obd_valid valid = OBD_MD_FLGETATTR;
3316 struct md_op_data *op_data;
/* for regular files also request the striping EA */
3319 if (S_ISREG(inode->i_mode)) {
3320 rc = ll_get_default_mdsize(sbi, &ealen);
3323 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3326 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3327 0, ealen, LUSTRE_OPC_ANY,
3329 if (IS_ERR(op_data))
3330 RETURN(PTR_ERR(op_data));
3332 op_data->op_valid = valid;
3333 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3334 * capa for this inode. Because we only keep capas of dirs
3336 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3337 ll_finish_md_op_data(op_data);
3339 rc = ll_inode_revalidate_fini(inode, rc);
3343 rc = ll_prep_inode(&inode, req, NULL, NULL);
3346 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge size/nlink/times over all stripes
 * (md_merge_attr) and store the result in the local ll_inode_info.
 */
3350 static int ll_merge_md_attr(struct inode *inode)
3352 struct cl_attr attr = { 0 };
3355 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3356 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3361 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3362 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3364 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3365 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3366 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MDS attributes via __ll_inode_revalidate(),
 * merge striped-directory attributes when applicable, and for regular
 * files glimpse the size from the OSTs (unless an HSM restore is in
 * progress, in which case the MDT-provided size is authoritative).
 */
3372 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3374 struct inode *inode = dentry->d_inode;
3378 rc = __ll_inode_revalidate(dentry, ibits);
3382 /* if object isn't regular file, don't validate size */
3383 if (!S_ISREG(inode->i_mode)) {
3384 if (S_ISDIR(inode->i_mode) &&
3385 ll_i2info(inode)->lli_lsm_md != NULL) {
3386 rc = ll_merge_md_attr(inode);
3391 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3392 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3393 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3395 /* In case of restore, the MDT has the right size and has
3396 * already send it back without granting the layout lock,
3397 * inode is up-to-date so glimpse is useless.
3398 * Also to glimpse we need the layout, in case of a running
3399 * restore the MDT holds the layout lock so the glimpse will
3400 * block up to the end of restore (getattr will block)
3402 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3403 rc = ll_glimpse_size(inode);
/*
 * ->getattr() handler: revalidate UPDATE|LOOKUP ibits from the MDS,
 * then fill *stat from the (now fresh) inode.  Striped directories
 * report the merged size/nlink kept in ll_inode_info.
 */
3408 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3410 struct inode *inode = de->d_inode;
3411 struct ll_sb_info *sbi = ll_i2sbi(inode);
3412 struct ll_inode_info *lli = ll_i2info(inode);
3415 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3416 MDS_INODELOCK_LOOKUP);
3417 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3422 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland needs an ino that fits in 32 bits */
3423 if (ll_need_32bit_api(sbi))
3424 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3426 stat->ino = inode->i_ino;
3427 stat->mode = inode->i_mode;
3428 stat->uid = inode->i_uid;
3429 stat->gid = inode->i_gid;
3430 stat->rdev = inode->i_rdev;
3431 stat->atime = inode->i_atime;
3432 stat->mtime = inode->i_mtime;
3433 stat->ctime = inode->i_ctime;
3434 stat->blksize = 1 << inode->i_blkbits;
3435 stat->blocks = inode->i_blocks;
/* striped dirs: use the attributes merged across all stripes */
3437 if (S_ISDIR(inode->i_mode) &&
3438 ll_i2info(inode)->lli_lsm_md != NULL) {
3439 stat->nlink = lli->lli_stripe_dir_nlink;
3440 stat->size = lli->lli_stripe_dir_size;
3442 stat->nlink = inode->i_nlink;
3443 stat->size = i_size_read(inode);
/*
 * ->fiemap() handler: marshal the VFS fiemap_extent_info into a Lustre
 * ll_user_fiemap request, run ll_do_fiemap(), and copy the mapped
 * extents back to the caller's buffer.
 */
3449 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3450 __u64 start, __u64 len)
3454 struct ll_user_fiemap *fiemap;
3455 unsigned int extent_count = fieinfo->fi_extents_max;
3457 num_bytes = sizeof(*fiemap) + (extent_count *
3458 sizeof(struct ll_fiemap_extent));
3459 OBD_ALLOC_LARGE(fiemap, num_bytes);
3464 fiemap->fm_flags = fieinfo->fi_flags;
3465 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3466 fiemap->fm_start = start;
3467 fiemap->fm_length = len;
/* only one extent is copied in here — presumably the fm_extents[0]
 * continuation/restart hint; confirm against the fiemap protocol */
3468 if (extent_count > 0)
3469 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3470 sizeof(struct ll_fiemap_extent));
3472 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3474 fieinfo->fi_flags = fiemap->fm_flags;
3475 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3476 if (extent_count > 0)
3477 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3478 fiemap->fm_mapped_extents *
3479 sizeof(struct ll_fiemap_extent));
3481 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl() handler: return a referenced copy of the POSIX ACL
 * cached in ll_inode_info (the VFS releases the reference).
 */
3485 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3487 struct ll_inode_info *lli = ll_i2info(inode);
3488 struct posix_acl *acl = NULL;
3491 spin_lock(&lli->lli_lock);
3492 /* VFS' acl_permission_check->check_acl will release the refcount */
3493 acl = posix_acl_dup(lli->lli_posix_acl);
3494 spin_unlock(&lli->lli_lock);
/*
 * ACL permission callback for kernels whose generic_permission() takes
 * a check_acl hook.  Evaluates the cached POSIX ACL against @mask.
 */
3499 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3501 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3502 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3504 ll_check_acl(struct inode *inode, int mask)
3507 # ifdef CONFIG_FS_POSIX_ACL
3508 struct posix_acl *acl;
/* RCU walk cannot sleep; bail out and let the caller retry */
3512 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3513 if (flags & IPERM_FLAG_RCU)
3516 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3521 rc = posix_acl_permission(inode, acl, mask);
3522 posix_acl_release(acl);
3525 # else /* !CONFIG_FS_POSIX_ACL */
3527 # endif /* CONFIG_FS_POSIX_ACL */
3529 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission() handler.  Revalidates the root inode on first access,
 * applies root-squash by temporarily overriding the task credentials,
 * then delegates to the remote-permission check or the generic VFS
 * permission logic with the ACL callback.
 */
3531 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3532 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3534 # ifdef HAVE_INODE_PERMISION_2ARGS
3535 int ll_inode_permission(struct inode *inode, int mask)
3537 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3542 struct ll_sb_info *sbi;
3543 struct root_squash_info *squash;
3544 struct cred *cred = NULL;
3545 const struct cred *old_cred = NULL;
3547 bool squash_id = false;
/* cannot block during an RCU path walk */
3550 #ifdef MAY_NOT_BLOCK
3551 if (mask & MAY_NOT_BLOCK)
3553 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3554 if (flags & IPERM_FLAG_RCU)
3558 /* as root inode are NOT getting validated in lookup operation,
3559 * need to do it before permission check. */
3561 if (inode == inode->i_sb->s_root->d_inode) {
3562 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3563 MDS_INODELOCK_LOOKUP);
3568 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3569 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3571 /* squash fsuid/fsgid if needed */
3572 sbi = ll_i2sbi(inode);
3573 squash = &sbi->ll_squash;
3574 if (unlikely(squash->rsi_uid != 0 &&
3575 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3576 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3580 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3581 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3582 squash->rsi_uid, squash->rsi_gid);
3584 /* update current process's credentials
3585 * and FS capability */
3586 cred = prepare_creds();
3590 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3591 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability from the squashed creds */
3592 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3593 if ((1 << cap) & CFS_CAP_FS_MASK)
3594 cap_lower(cred->cap_effective, cap);
3596 old_cred = override_creds(cred);
3599 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3601 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3602 rc = lustre_check_remote_perm(inode, mask);
3604 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3606 /* restore current process's credentials and FS capability */
3608 revert_creds(old_cred);
3615 /* -o localflock - only provides locally consistent flock locks */
3616 struct file_operations ll_file_operations = {
3617 .read = ll_file_read,
3618 .aio_read = ll_file_aio_read,
3619 .write = ll_file_write,
3620 .aio_write = ll_file_aio_write,
3621 .unlocked_ioctl = ll_file_ioctl,
3622 .open = ll_file_open,
3623 .release = ll_file_release,
3624 .mmap = ll_file_mmap,
3625 .llseek = ll_file_seek,
3626 .splice_read = ll_file_splice_read,
/* -o flock (default set): cluster-wide consistent flock/fcntl locks
 * routed through the MDS via ll_file_flock() */
3631 struct file_operations ll_file_operations_flock = {
3632 .read = ll_file_read,
3633 .aio_read = ll_file_aio_read,
3634 .write = ll_file_write,
3635 .aio_write = ll_file_aio_write,
3636 .unlocked_ioctl = ll_file_ioctl,
3637 .open = ll_file_open,
3638 .release = ll_file_release,
3639 .mmap = ll_file_mmap,
3640 .llseek = ll_file_seek,
3641 .splice_read = ll_file_splice_read,
3644 .flock = ll_file_flock,
3645 .lock = ll_file_flock
3648 /* These are for -o noflock - to return ENOSYS on flock calls */
3649 struct file_operations ll_file_operations_noflock = {
3650 .read = ll_file_read,
3651 .aio_read = ll_file_aio_read,
3652 .write = ll_file_write,
3653 .aio_write = ll_file_aio_write,
3654 .unlocked_ioctl = ll_file_ioctl,
3655 .open = ll_file_open,
3656 .release = ll_file_release,
3657 .mmap = ll_file_mmap,
3658 .llseek = ll_file_seek,
3659 .splice_read = ll_file_splice_read,
3662 .flock = ll_file_noflock,
3663 .lock = ll_file_noflock
/* inode operations for regular Lustre files */
3666 struct inode_operations ll_file_inode_operations = {
3667 .setattr = ll_setattr,
3668 .getattr = ll_getattr,
3669 .permission = ll_inode_permission,
3670 .setxattr = ll_setxattr,
3671 .getxattr = ll_getxattr,
3672 .listxattr = ll_listxattr,
3673 .removexattr = ll_removexattr,
3674 .fiemap = ll_fiemap,
3675 #ifdef HAVE_IOP_GET_ACL
3676 .get_acl = ll_get_acl,
3680 /* dynamic ioctl number support routines */
/* global registry of dynamically registered ioctl handlers, protected
 * by ioc_sem (readers iterate, writers register/unregister) */
3681 static struct llioc_ctl_data {
3682 struct rw_semaphore ioc_sem;
3683 struct list_head ioc_head;
3685 __RWSEM_INITIALIZER(llioc.ioc_sem),
3686 LIST_HEAD_INIT(llioc.ioc_head)
/* one registration: a callback plus the ioctl numbers it handles.
 * NOTE(review): iocd_cmd[0] is the old zero-length-array idiom; a C99
 * flexible array member (iocd_cmd[]) would be the modern spelling. */
3691 struct list_head iocd_list;
3692 unsigned int iocd_size;
3693 llioc_callback_t iocd_cb;
3694 unsigned int iocd_count;
3695 unsigned int iocd_cmd[0];
/*
 * Register callback @cb for @count dynamic ioctl numbers in @cmd.
 * Returns an opaque magic cookie for ll_iocontrol_unregister(), or
 * NULL on bad arguments / allocation failure.
 */
3698 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3701 struct llioc_data *in_data = NULL;
3704 if (cb == NULL || cmd == NULL ||
3705 count > LLIOC_MAX_CMD || count < 0)
3708 size = sizeof(*in_data) + count * sizeof(unsigned int);
3709 OBD_ALLOC(in_data, size);
3710 if (in_data == NULL)
3713 memset(in_data, 0, sizeof(*in_data));
3714 in_data->iocd_size = size;
3715 in_data->iocd_cb = cb;
3716 in_data->iocd_count = count;
3717 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3719 down_write(&llioc.ioc_sem);
3720 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3721 up_write(&llioc.ioc_sem);
/*
 * Remove and free the registration identified by @magic (the cookie
 * returned by ll_iocontrol_register()).  Warns if not found.
 */
3726 void ll_iocontrol_unregister(void *magic)
3728 struct llioc_data *tmp;
3733 down_write(&llioc.ioc_sem);
3734 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3736 unsigned int size = tmp->iocd_size;
3738 list_del(&tmp->iocd_list);
3739 up_write(&llioc.ioc_sem);
3741 OBD_FREE(tmp, size);
3745 up_write(&llioc.ioc_sem);
3747 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3750 EXPORT_SYMBOL(ll_iocontrol_register);
3751 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch ioctl @cmd to registered dynamic handlers in registration
 * order; stops at the first handler that returns LLIOC_STOP.  The
 * handler's result code is passed back through *rcp.
 */
3753 static enum llioc_iter
3754 ll_iocontrol_call(struct inode *inode, struct file *file,
3755 unsigned int cmd, unsigned long arg, int *rcp)
3757 enum llioc_iter ret = LLIOC_CONT;
3758 struct llioc_data *data;
3759 int rc = -EINVAL, i;
3761 down_read(&llioc.ioc_sem);
3762 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3763 for (i = 0; i < data->iocd_count; i++) {
3764 if (cmd != data->iocd_cmd[i])
3767 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3771 if (ret == LLIOC_STOP)
3774 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack for @inode.
 * For OBJECT_CONF_SET, also allow the layout lock to be matched and
 * record the new layout generation.
 */
3781 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3783 struct ll_inode_info *lli = ll_i2info(inode);
3784 struct cl_env_nest nest;
3789 if (lli->lli_clob == NULL)
3792 env = cl_env_nested_get(&nest);
3794 RETURN(PTR_ERR(env));
3796 result = cl_conf_set(env, lli->lli_clob, conf);
3797 cl_env_nested_put(&nest, env);
3799 if (conf->coc_opc == OBJECT_CONF_SET) {
3800 struct ldlm_lock *lock = conf->coc_lock;
3802 LASSERT(lock != NULL);
3803 LASSERT(ldlm_has_layout(lock));
3805 struct lustre_md *md = conf->u.coc_md;
3806 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3808 /* it can only be allowed to match after layout is
3809 * applied to inode otherwise false layout would be
3810 * seen. Applying layout should happen before dropping
3811 * the intent lock. */
3812 ldlm_lock_allow_match(lock);
3814 lli->lli_has_smd = lsm_has_objects(md->lsm);
3815 if (md->lsm != NULL)
3816 gen = md->lsm->lsm_layout_gen;
3819 DFID ": layout version change: %u -> %u\n",
3820 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3822 ll_layout_version_set(lli, gen);
3828 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3829 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3832 struct ll_sb_info *sbi = ll_i2sbi(inode);
3833 struct obd_capa *oc;
3834 struct ptlrpc_request *req;
3835 struct mdt_body *body;
3842 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3843 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3844 lock->l_lvb_data, lock->l_lvb_len);
/* nothing to do if the lock already carries a ready LVB */
3846 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3849 /* if layout lock was granted right away, the layout is returned
3850 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3851 * blocked and then granted via completion ast, we have to fetch
3852 * layout here. Please note that we can't use the LVB buffer in
3853 * completion AST because it doesn't have a large enough buffer */
3854 oc = ll_mdscapa_get(inode);
3855 rc = ll_get_default_mdsize(sbi, &lmmsize);
3857 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3858 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3864 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3866 GOTO(out, rc = -EPROTO);
3868 lmmsize = body->mbo_eadatasize;
3869 if (lmmsize == 0) /* empty layout */
3872 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3874 GOTO(out, rc = -EFAULT);
/* install a private copy of the LOV EA as the lock's LVB, replacing
 * any stale buffer under the resource lock */
3876 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3877 if (lvbdata == NULL)
3878 GOTO(out, rc = -ENOMEM);
3880 memcpy(lvbdata, lmm, lmmsize);
3881 lock_res_and_lock(lock);
3882 if (lock->l_lvb_data != NULL)
3883 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3885 lock->l_lvb_data = lvbdata;
3886 lock->l_lvb_len = lmmsize;
3887 unlock_res_and_lock(lock);
3892 ptlrpc_req_finished(req);
3897 * Apply the layout to the inode. Layout lock is held and will be released
3900 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3901 struct inode *inode, __u32 *gen, bool reconf)
3903 struct ll_inode_info *lli = ll_i2info(inode);
3904 struct ll_sb_info *sbi = ll_i2sbi(inode);
3905 struct ldlm_lock *lock;
3906 struct lustre_md md = { NULL };
3907 struct cl_object_conf conf;
3910 bool wait_layout = false;
3913 LASSERT(lustre_handle_is_used(lockh));
3915 lock = ldlm_handle2lock(lockh);
3916 LASSERT(lock != NULL);
3917 LASSERT(ldlm_has_layout(lock));
3919 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3920 PFID(&lli->lli_fid), inode, reconf);
3922 /* in case this is a caching lock and reinstate with new inode */
3923 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3925 lock_res_and_lock(lock);
3926 lvb_ready = ldlm_is_lvb_ready(lock);
3927 unlock_res_and_lock(lock);
3928 /* checking lvb_ready is racy but this is okay. The worst case is
3929 * that multi processes may configure the file on the same time. */
3931 if (lvb_ready || !reconf) {
3934 /* layout_gen must be valid if layout lock is not
3935 * cancelled and stripe has already set */
3936 *gen = ll_layout_version_get(lli);
/* ensure the layout EA is present in the lock's LVB */
3942 rc = ll_layout_fetch(inode, lock);
3946 /* for layout lock, lmm is returned in lock's lvb.
3947 * lvb_data is immutable if the lock is held so it's safe to access it
3948 * without res lock. See the description in ldlm_lock_decref_internal()
3949 * for the condition to free lvb_data of layout lock */
3950 if (lock->l_lvb_data != NULL) {
3951 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3952 lock->l_lvb_data, lock->l_lvb_len);
3954 *gen = LL_LAYOUT_GEN_EMPTY;
3956 *gen = md.lsm->lsm_layout_gen;
3959 CERROR("%s: file "DFID" unpackmd error: %d\n",
3960 ll_get_fsname(inode->i_sb, NULL, 0),
3961 PFID(&lli->lli_fid), rc);
3967 /* set layout to file. Unlikely this will fail as old layout was
3968 * surely eliminated */
3969 memset(&conf, 0, sizeof conf);
3970 conf.coc_opc = OBJECT_CONF_SET;
3971 conf.coc_inode = inode;
3972 conf.coc_lock = lock;
3973 conf.u.coc_md = &md;
3974 rc = ll_layout_conf(inode, &conf);
3977 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3979 /* refresh layout failed, need to wait */
3980 wait_layout = rc == -EBUSY;
3984 LDLM_LOCK_PUT(lock);
3985 ldlm_lock_decref(lockh, mode);
3987 /* wait for IO to complete if it's still being used. */
3989 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3990 ll_get_fsname(inode->i_sb, NULL, 0),
3991 PFID(&lli->lli_fid), inode);
3993 memset(&conf, 0, sizeof conf);
3994 conf.coc_opc = OBJECT_CONF_WAIT;
3995 conf.coc_inode = inode;
3996 rc = ll_layout_conf(inode, &conf);
4000 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4001 ll_get_fsname(inode->i_sb, NULL, 0),
4002 PFID(&lli->lli_fid), rc);
4008 * This function checks if there exists a LAYOUT lock on the client side,
4009 * or enqueues it if it doesn't have one in cache.
4011 * This function will not hold layout lock so it may be revoked any time after
4012 * this function returns. Any operations depend on layout should be redone
4015 * This function should be called before lov_io_init() to get an uptodate
4016 * layout version, the caller should save the version number and after IO
4017 * is finished, this function should be called again to verify that layout
4018 * is not changed during IO time.
/* NOTE(review): this excerpt is line-sampled; statements between the embedded
 * original line numbers (e.g. 4029->4031) are elided, so braces/returns that
 * appear missing below live in the gaps. Comments describe only what is
 * visible here. */
4020 int ll_layout_refresh(struct inode *inode, __u32 *gen)
/* Local state: inode/superblock info, MD op data for the enqueue RPC,
 * a lookup intent for IT_LAYOUT, and the lock handle we obtain. */
4022 struct ll_inode_info *lli = ll_i2info(inode);
4023 struct ll_sb_info *sbi = ll_i2sbi(inode);
4024 struct md_op_data *op_data;
4025 struct lookup_intent it;
4026 struct lustre_handle lockh;
/* Enqueue info for an inode-bits DLM lock; blocking and completion ASTs
 * are the standard MD callbacks. (Some initializers elided here.) */
4028 struct ldlm_enqueue_info einfo = {
4029 .ei_type = LDLM_IBITS,
4031 .ei_cb_bl = &ll_md_blocking_ast,
4032 .ei_cb_cp = &ldlm_completion_ast,
/* Fast path: report the currently cached layout generation. If layout
 * locking is disabled for this mount, or a valid generation is already
 * known, no refresh is needed (early-return body elided). */
4037 *gen = ll_layout_version_get(lli);
4038 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
/* Layout locks only make sense on sane FIDs of regular files. */
4042 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4043 LASSERT(S_ISREG(inode->i_mode));
4045 /* take layout lock mutex to enqueue layout lock exclusively. */
4046 mutex_lock(&lli->lli_layout_mutex);
4049 /* mostly layout lock is caching on the local side, so try to match
4050 * it before grabbing layout lock mutex. */
/* Try to reuse a cached LAYOUT ibits lock in any of the four modes. */
4051 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4052 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4053 if (mode != 0) { /* hit cached lock */
/* Cached lock found: apply the layout it protects and return its
 * generation via *gen (error/retry handling elided). */
4054 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4058 mutex_unlock(&lli->lli_layout_mutex);
/* No cached lock: build op_data for an MD enqueue against this inode. */
4062 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4063 0, 0, LUSTRE_OPC_ANY, NULL);
4064 if (IS_ERR(op_data)) {
/* Must drop the layout mutex on the error path before returning. */
4065 mutex_unlock(&lli->lli_layout_mutex);
4066 RETURN(PTR_ERR(op_data));
4069 /* have to enqueue one */
4070 memset(&it, 0, sizeof(it));
4071 it.it_op = IT_LAYOUT;
4072 lockh.cookie = 0ULL;
4074 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4075 ll_get_fsname(inode->i_sb, NULL, 0),
4076 PFID(&lli->lli_fid), inode);
/* Enqueue the layout lock via the MD export; the intent carries the
 * request so release it immediately — we only need the lock itself. */
4078 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
4079 if (it.d.lustre.it_data != NULL)
4080 ptlrpc_req_finished(it.d.lustre.it_data);
4081 it.d.lustre.it_data = NULL;
4083 ll_finish_md_op_data(op_data);
/* Take ownership of the lock mode out of the intent, then drop the
 * intent's reference so ll_layout_lock_set() holds the only ref. */
4085 mode = it.d.lustre.it_lock_mode;
4086 it.d.lustre.it_lock_mode = 0;
4087 ll_intent_drop_lock(&it);
4090 /* set lock data in case this is a new lock */
4091 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
/* Apply the freshly enqueued layout and fetch its generation; retry
 * logic on -EAGAIN, if any, is in elided lines. */
4092 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4096 mutex_unlock(&lli->lli_layout_mutex);
4102 * This function send a restore request to the MDT
4104 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4106 struct hsm_user_request *hur;
4110 len = sizeof(struct hsm_user_request) +
4111 sizeof(struct hsm_user_item);
4112 OBD_ALLOC(hur, len);
4116 hur->hur_request.hr_action = HUA_RESTORE;
4117 hur->hur_request.hr_archive_id = 0;
4118 hur->hur_request.hr_flags = 0;
4119 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4120 sizeof(hur->hur_user_item[0].hui_fid));
4121 hur->hur_user_item[0].hui_extent.offset = offset;
4122 hur->hur_user_item[0].hui_extent.length = length;
4123 hur->hur_request.hr_itemcount = 1;
4124 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,