1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <lustre_mdc.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #include "llite_internal.h"
33 /* also used by llite/special.c:ll_special_open() */
/* Allocate a per-open-file ll_file_data descriptor from its slab cache.
 * NOTE(review): this listing elides lines (braces/RETURN are not visible);
 * comments describe only the statements shown. */
34 struct ll_file_data *ll_file_data_get(void)
36 struct ll_file_data *fd;
/* slab-backed allocation; fd may be NULL if allocation fails */
38 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Return an ll_file_data obtained from ll_file_data_get() to the slab. */
42 static void ll_file_data_put(struct ll_file_data *fd)
45 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current attributes into @op_data for shipping to the MDS.
 * Copies fid, mode, a/m/ctime, size, block count, inode flags, the current
 * I/O epoch, the open file handle @fh (copied by value into op_handle), and
 * takes an MDS capability reference via ll_mdscapa_get().
 * NOTE(review): caller is presumably responsible for releasing op_capa1 —
 * confirm against ll_finish_md_op_data() in the full file. */
48 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
49 struct lustre_handle *fh)
51 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
52 op_data->op_attr.ia_mode = inode->i_mode;
53 op_data->op_attr.ia_atime = inode->i_atime;
54 op_data->op_attr.ia_mtime = inode->i_mtime;
55 op_data->op_attr.ia_ctime = inode->i_ctime;
56 op_data->op_attr.ia_size = i_size_read(inode);
57 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the ll_iattr wrapper, hence the cast */
58 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
59 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
60 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
61 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Prepare @op_data for an MDS close of open handle @och.
 * Marks mode and timestamps valid; size/blocks are additionally marked valid
 * when the target does not support Size-on-MDS (OBD_CONNECT_SOM) or the file
 * is not a regular file.  Finishes by packing inode attrs via
 * ll_pack_inode2opdata().
 * NOTE(review): lines are elided here (the body of the !FMODE_WRITE branch
 * is not visible) — comments cover only what is shown. */
64 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
65 struct obd_client_handle *och)
69 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
70 ATTR_MTIME_SET | ATTR_CTIME_SET;
/* read-only opens take the (elided) early path below */
72 if (!(och->och_flags & FMODE_WRITE))
/* no SOM support, or not a regular file: client must send size/blocks */
75 if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
76 !S_ISREG(inode->i_mode))
77 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
79 ll_epoch_close(inode, op_data, &och, 0);
82 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/* Send an MDS close for open handle @och and tear down its replay state.
 * On a SOM (Size-on-MDS) epoch close the MDS may ask the client to gather
 * size from the OSTs and send it back (ll_sizeonmds_update), or the client
 * queues DONE_WRITING if the epoch stayed open for a writer.
 * NOTE(review): this listing elides lines — notably the `struct inode *inode`
 * parameter (original line 87) and several branches; comments describe only
 * visible statements. */
86 static int ll_close_inode_openhandle(struct obd_export *md_exp,
88 struct obd_client_handle *och)
90 struct obd_export *exp = ll_i2mdexp(inode);
91 struct md_op_data *op_data;
92 struct ptlrpc_request *req = NULL;
93 struct obd_device *obd = class_exp2obd(exp);
100 * XXX: in case of LMV, is this correct to access
103 CERROR("Invalid MDC connection handle "LPX64"\n",
104 ll_i2mdexp(inode)->exp_handle.h_cookie);
109 * here we check if this is forced umount. If so this is called on
110 * canceling "open lock" and we do not call md_close() in this case, as
111 * it will not be successful, as import is already deactivated.
116 OBD_ALLOC_PTR(op_data);
118 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
120 ll_prepare_close(inode, op_data, och);
/* remember whether this close ends the I/O epoch before op_data is freed */
121 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
122 rc = md_close(md_exp, op_data, och->och_mod, &req);
127 /* This close must have the epoch closed. */
128 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
129 LASSERT(epoch_close);
130 /* MDS has instructed us to obtain Size-on-MDS attribute from
131 * OSTs and send setattr to back to MDS. */
132 rc = ll_sizeonmds_update(inode, och->och_mod,
133 &och->och_fh, op_data->op_ioepoch);
135 CERROR("inode %lu mdc Size-on-MDS update failed: "
136 "rc = %d\n", inode->i_ino, rc);
140 CERROR("inode %lu mdc close failed: rc = %d\n",
143 ll_finish_md_op_data(op_data);
146 rc = ll_objects_destroy(req, inode);
148 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* epoch still open for a SOM writer: defer via DONE_WRITING */
155 if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
156 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
157 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
160 ptlrpc_close_replay_seq(req);
161 md_clear_open_replay_data(md_exp, och);
162 /* Free @och if it is not waiting for DONE_WRITING. */
163 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
166 if (req) /* This is close request */
167 ptlrpc_req_finished(req);
/* Drop one MDS open handle of the kind selected by @flags
 * (FMODE_WRITE / FMODE_EXEC / FMODE_READ).  If other local users still hold
 * the handle (usecount non-zero) it is left alone; otherwise the handle is
 * detached under lli_och_sem and closed via ll_close_inode_openhandle().
 * NOTE(review): elided lines hide the och detach and usecount decrement;
 * comments reflect only visible code. */
171 int ll_md_real_close(struct inode *inode, int flags)
173 struct ll_inode_info *lli = ll_i2info(inode);
174 struct obd_client_handle **och_p;
175 struct obd_client_handle *och;
/* pick the per-mode handle slot and its reference counter */
180 if (flags & FMODE_WRITE) {
181 och_p = &lli->lli_mds_write_och;
182 och_usecount = &lli->lli_open_fd_write_count;
183 } else if (flags & FMODE_EXEC) {
184 och_p = &lli->lli_mds_exec_och;
185 och_usecount = &lli->lli_open_fd_exec_count;
187 LASSERT(flags & FMODE_READ);
188 och_p = &lli->lli_mds_read_och;
189 och_usecount = &lli->lli_open_fd_read_count;
192 down(&lli->lli_och_sem);
193 if (*och_usecount) { /* There are still users of this handle, so
195 up(&lli->lli_och_sem);
200 up(&lli->lli_och_sem);
202 if (och) { /* There might be a race and somebody have freed this och
204 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: release any group lock, decrement the per-mode
 * open counter under lli_och_sem, and — unless a matching cached OPEN DLM
 * lock lets us skip it — call ll_md_real_close() to close the MDS handle.
 * Always frees the ll_file_data and drops the OSS capability.
 * NOTE(review): elided lines hide `lockmode` selection and some branches. */
211 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
214 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
215 struct ll_inode_info *lli = ll_i2info(inode);
219 /* clear group lock, if present */
220 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
221 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
222 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
223 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
227 /* Let's see if we have good enough OPEN lock on the file and if
228 we can skip talking to MDS */
229 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* TEST_LOCK: match without taking a new reference on the lock */
231 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
232 struct lustre_handle lockh;
233 struct inode *inode = file->f_dentry->d_inode;
234 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
236 down(&lli->lli_och_sem);
237 if (fd->fd_omode & FMODE_WRITE) {
239 LASSERT(lli->lli_open_fd_write_count);
240 lli->lli_open_fd_write_count--;
241 } else if (fd->fd_omode & FMODE_EXEC) {
243 LASSERT(lli->lli_open_fd_exec_count);
244 lli->lli_open_fd_exec_count--;
247 LASSERT(lli->lli_open_fd_read_count);
248 lli->lli_open_fd_read_count--;
250 up(&lli->lli_och_sem);
/* no cached OPEN ibits lock -> must do the real MDS close */
252 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
253 LDLM_IBITS, &policy, lockmode,
255 rc = ll_md_real_close(file->f_dentry->d_inode,
259 CERROR("Releasing a file %p with negative dentry %p. Name %s",
260 file, file->f_dentry, file->f_dentry->d_name.name);
263 LUSTRE_FPRIVATE(file) = NULL;
264 ll_file_data_put(fd);
265 ll_capa_close(inode);
270 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
272 /* While this returns an error code, fput() the caller does not, so we need
273 * to make every effort to clean up all of our state here. Also, applications
274 * rarely check close errors and even if an error is returned they will not
275 * re-try the close call.
277 int ll_file_release(struct inode *inode, struct file *file)
279 struct ll_file_data *fd;
280 struct ll_sb_info *sbi = ll_i2sbi(inode);
281 struct ll_inode_info *lli = ll_i2info(inode);
282 struct lov_stripe_md *lsm = lli->lli_smd;
286 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
287 inode->i_generation, inode);
/* remote-client ACL bookkeeping applies only to the root inode */
289 #ifdef CONFIG_FS_POSIX_ACL
290 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
291 inode == inode->i_sb->s_root->d_inode) {
292 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
295 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
296 fd->fd_flags &= ~LL_FILE_RMTACL;
297 rct_del(&sbi->ll_rct, cfs_curproc_pid());
298 et_search_free(&sbi->ll_et, cfs_curproc_pid());
303 if (inode->i_sb->s_root != file->f_dentry)
304 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
305 fd = LUSTRE_FPRIVATE(file);
308 /* The last ref on @file, maybe not the the owner pid of statahead.
309 * Different processes can open the same dir, "ll_opendir_key" means:
310 * it is me that should stop the statahead thread. */
311 if (lli->lli_opendir_key == fd)
312 ll_stop_statahead(inode, fd);
/* root directory: no MDS close was taken at open, just free fd */
314 if (inode->i_sb->s_root == file->f_dentry) {
315 LUSTRE_FPRIVATE(file) = NULL;
316 ll_file_data_put(fd);
/* fold any deferred async write error into lli_async_rc (elided use) */
321 lov_test_and_clear_async_rc(lsm);
322 lli->lli_async_rc = 0;
324 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Re-enqueue an OPEN intent to the MDS for @file (NFS export path, stale
 * cached handle, or setstripe).  Requests an OPEN DLM lock unless lmm/lmmsize
 * indicate a pure stripe-setting call.  On success attaches the lock to the
 * inode and refreshes the inode from the reply; the ESTALE path releases the
 * open handle quietly instead of logging. */
328 static int ll_intent_file_open(struct file *file, void *lmm,
329 int lmmsize, struct lookup_intent *itp)
331 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
332 struct dentry *parent = file->f_dentry->d_parent;
333 const char *name = file->f_dentry->d_name.name;
334 const int len = file->f_dentry->d_name.len;
335 struct md_op_data *op_data;
336 struct ptlrpc_request *req;
343 /* Usually we come here only for NFSD, and we want open lock.
344 But we can also get here with pre 2.6.15 patchless kernels, and in
345 that case that lock is also ok */
346 /* We can also get here if there was cached open handle in revalidate_it
347 * but it disappeared while we were getting from there to ll_file_open.
348 * But this means this file was closed and immediatelly opened which
349 * makes a good candidate for using OPEN lock */
350 /* If lmmsize & lmm are not 0, we are just setting stripe info
351 * parameters. No need for the open lock */
352 if (!lmm && !lmmsize)
353 itp->it_flags |= MDS_OPEN_LOCK;
355 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
356 file->f_dentry->d_inode, name, len,
357 O_RDWR, LUSTRE_OPC_ANY, NULL);
359 RETURN(PTR_ERR(op_data));
361 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
362 0 /*unused */, &req, ll_md_blocking_ast, 0);
363 ll_finish_md_op_data(op_data);
365 /* reason for keep own exit path - don`t flood log
366 * with messages with -ESTALE errors.
368 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
369 it_open_error(DISP_OPEN_OPEN, itp))
371 ll_release_openhandle(file->f_dentry, itp);
375 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
376 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
377 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* bind the granted DLM lock to the inode so cancel finds it */
381 if (itp->d.lustre.it_lock_mode)
382 md_set_lock_data(sbi->ll_md_exp,
383 &itp->d.lustre.it_lock_handle,
384 file->f_dentry->d_inode);
386 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
388 ptlrpc_req_finished(itp->d.lustre.it_data);
391 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
392 ll_intent_drop_lock(itp);
/* Initialize an obd_client_handle from the MDS open reply carried in @it:
 * copies the server file handle, fid, open flags, and records the reply's
 * I/O epoch in the inode info, then registers the request for open replay.
 * Returns the md_set_open_replay_data() result. */
397 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
398 struct lookup_intent *it, struct obd_client_handle *och)
400 struct ptlrpc_request *req = it->d.lustre.it_data;
401 struct mdt_body *body;
405 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
406 LASSERT(body != NULL); /* reply already checked out */
408 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
409 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
410 och->och_fid = lli->lli_fid;
411 och->och_flags = it->it_flags;
412 lli->lli_ioepoch = body->ioepoch;
414 return md_set_open_replay_data(md_exp, och, req);
/* Finish the client-local part of an open: optionally fill @och from the
 * intent reply (ll_och_fill), then attach @fd as the file's private data
 * and initialize its readahead state and open mode.
 * NOTE(review): elided lines hide the condition guarding the och branch
 * (presumably `if (och)`) — confirm against the full file. */
417 int ll_local_open(struct file *file, struct lookup_intent *it,
418 struct ll_file_data *fd, struct obd_client_handle *och)
420 struct inode *inode = file->f_dentry->d_inode;
421 struct ll_inode_info *lli = ll_i2info(inode);
424 LASSERT(!LUSTRE_FPRIVATE(file));
429 struct ptlrpc_request *req = it->d.lustre.it_data;
430 struct mdt_body *body;
433 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
437 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
438 if ((it->it_flags & FMODE_WRITE) &&
439 (body->valid & OBD_MD_FLSIZE))
440 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
441 lli->lli_ioepoch, PFID(&lli->lli_fid));
444 LUSTRE_FPRIVATE(file) = fd;
445 ll_readahead_init(inode, &fd->fd_ras);
446 fd->fd_omode = it->it_flags;
450 /* Open a file, and (for the very first open) create objects on the OSTs at
451 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
452 * creation or open until ll_lov_setstripe() ioctl is called. We grab
453 * lli_open_sem to ensure no other process will create objects, send the
454 * stripe MD to the MDS, or try to destroy the objects if that fails.
456 * If we already have the stripe MD locally then we don't request it in
457 * md_open(), by passing a lmm_size = 0.
459 * It is up to the application to ensure no other processes open this file
460 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
461 * used. We might be able to avoid races of that sort by getting lli_open_sem
462 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
463 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
465 int ll_file_open(struct inode *inode, struct file *file)
467 struct ll_inode_info *lli = ll_i2info(inode);
468 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
469 .it_flags = file->f_flags };
470 struct lov_stripe_md *lsm;
471 struct ptlrpc_request *req = NULL;
472 struct obd_client_handle **och_p;
474 struct ll_file_data *fd;
475 int rc = 0, opendir_set = 0;
478 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
479 inode->i_generation, inode, file->f_flags);
/* with VFS intent patches the intent arrives via file->private_data */
481 #ifdef HAVE_VFS_INTENT_PATCHES
484 it = file->private_data; /* XXX: compat macro */
485 file->private_data = NULL; /* prevent ll_local_open assertion */
488 fd = ll_file_data_get();
/* directory opens: claim statahead ownership under lli_lock */
492 if (S_ISDIR(inode->i_mode)) {
493 spin_lock(&lli->lli_lock);
494 /* "lli->lli_opendir_pid != 0" means someone has set it.
495 * "lli->lli_sai != NULL" means the previous statahead has not
497 if (lli->lli_opendir_pid == 0 && lli->lli_sai == NULL) {
499 lli->lli_opendir_pid = cfs_curproc_pid();
500 lli->lli_opendir_key = fd;
501 } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid())) {
502 /* Two cases for this:
503 * (1) The same process open such directory many times.
504 * (2) The old process opened the directory, and exited
505 * before its children processes. Then new process
506 * with the same pid opens such directory before the
507 * old process's children processes exit.
508 * Change the owner to the latest one. */
510 lli->lli_opendir_key = fd;
512 spin_unlock(&lli->lli_lock);
/* root directory needs no MDS open; just install fd */
515 if (inode->i_sb->s_root == file->f_dentry) {
516 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent: build our own IT_OPEN (oit) from f_flags */
520 if (!it || !it->d.lustre.it_disposition) {
521 /* Convert f_flags into access mode. We cannot use file->f_mode,
522 * because everything but O_ACCMODE mask was stripped from
524 if ((oit.it_flags + 1) & O_ACCMODE)
526 if (file->f_flags & O_TRUNC)
527 oit.it_flags |= FMODE_WRITE;
529 /* kernel only call f_op->open in dentry_open. filp_open calls
530 * dentry_open after call to open_namei that checks permissions.
531 * Only nfsd_open call dentry_open directly without checking
532 * permissions and because of that this code below is safe. */
533 if (oit.it_flags & FMODE_WRITE)
534 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
536 /* We do not want O_EXCL here, presumably we opened the file
537 * already? XXX - NFS implications? */
538 oit.it_flags &= ~O_EXCL;
544 /* Let's see if we have file open on MDS already. */
545 if (it->it_flags & FMODE_WRITE) {
546 och_p = &lli->lli_mds_write_och;
547 och_usecount = &lli->lli_open_fd_write_count;
548 } else if (it->it_flags & FMODE_EXEC) {
549 och_p = &lli->lli_mds_exec_och;
550 och_usecount = &lli->lli_open_fd_exec_count;
552 och_p = &lli->lli_mds_read_och;
553 och_usecount = &lli->lli_open_fd_read_count;
556 down(&lli->lli_och_sem);
557 if (*och_p) { /* Open handle is present */
558 if (it_disposition(it, DISP_OPEN_OPEN)) {
559 /* Well, there's extra open request that we do not need,
560 let's close it somehow. This will decref request. */
561 rc = it_open_error(DISP_OPEN_OPEN, it);
563 ll_file_data_put(fd);
564 GOTO(out_och_free, rc);
566 ll_release_openhandle(file->f_dentry, it);
567 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* reuse the cached handle; och arg is NULL on this path */
572 rc = ll_local_open(file, it, fd, NULL);
574 up(&lli->lli_och_sem);
575 ll_file_data_put(fd);
579 LASSERT(*och_usecount == 0);
580 if (!it->d.lustre.it_disposition) {
581 /* We cannot just request lock handle now, new ELC code
582 means that one of other OPEN locks for this file
583 could be cancelled, and since blocking ast handler
584 would attempt to grab och_sem as well, that would
585 result in a deadlock */
586 up(&lli->lli_och_sem);
587 it->it_flags |= O_CHECK_STALE;
588 rc = ll_intent_file_open(file, NULL, 0, it);
589 it->it_flags &= ~O_CHECK_STALE;
591 ll_file_data_put(fd);
592 GOTO(out_openerr, rc);
595 /* Got some error? Release the request */
596 if (it->d.lustre.it_status < 0) {
597 req = it->d.lustre.it_data;
598 ptlrpc_req_finished(req);
600 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
601 &it->d.lustre.it_lock_handle,
602 file->f_dentry->d_inode);
605 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
607 ll_file_data_put(fd);
608 GOTO(out_och_free, rc = -ENOMEM);
611 req = it->d.lustre.it_data;
613 /* md_intent_lock() didn't get a request ref if there was an
614 * open error, so don't do cleanup on the request here
616 /* XXX (green): Should not we bail out on any error here, not
617 * just open error? */
618 rc = it_open_error(DISP_OPEN_OPEN, it);
620 ll_file_data_put(fd);
621 GOTO(out_och_free, rc);
624 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
625 rc = ll_local_open(file, it, fd, *och_p);
627 up(&lli->lli_och_sem);
628 ll_file_data_put(fd);
629 GOTO(out_och_free, rc);
632 up(&lli->lli_och_sem);
634 /* Must do this outside lli_och_sem lock to prevent deadlock where
635 different kind of OPEN lock for this same inode gets cancelled
636 by ldlm_cancel_lru */
637 if (!S_ISREG(inode->i_mode))
/* delayed object creation: skip OST work until setstripe or write */
644 if (file->f_flags & O_LOV_DELAY_CREATE ||
645 !(file->f_mode & FMODE_WRITE)) {
646 CDEBUG(D_INODE, "object creation was delayed\n");
650 file->f_flags &= ~O_LOV_DELAY_CREATE;
653 ptlrpc_req_finished(req);
655 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
659 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
660 *och_p = NULL; /* OBD_FREE writes some magic there */
663 up(&lli->lli_och_sem);
/* error path: undo any statahead ownership claimed above */
665 if (opendir_set == 1) {
666 lli->lli_opendir_key = NULL;
667 lli->lli_opendir_pid = 0;
668 } else if (unlikely(opendir_set == 2)) {
669 ll_stop_statahead(inode, fd);
676 /* Fills the obdo with the attributes for the inode defined by lsm */
677 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
679 struct ptlrpc_request_set *set;
680 struct ll_inode_info *lli = ll_i2info(inode);
681 struct lov_stripe_md *lsm = lli->lli_smd;
683 struct obd_info oinfo = { { { 0 } } };
687 LASSERT(lsm != NULL);
/* ask the OSTs (data export) for size/blocks/times of this object */
691 oinfo.oi_oa->o_id = lsm->lsm_object_id;
692 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
693 oinfo.oi_oa->o_mode = S_IFREG;
694 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
695 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
696 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
697 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
699 oinfo.oi_capa = ll_mdscapa_get(inode);
701 set = ptlrpc_prep_set();
703 CERROR("can't allocate ptlrpc set\n");
706 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
708 rc = ptlrpc_set_wait(set);
709 ptlrpc_set_destroy(set);
711 capa_put(oinfo.oi_capa);
/* keep only OST-authoritative attrs before refreshing the inode */
715 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
716 OBD_MD_FLATIME | OBD_MD_FLMTIME |
717 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
719 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
720 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
721 lli->lli_smd->lsm_object_id, i_size_read(inode),
722 (unsigned long long)inode->i_blocks,
723 (unsigned long)ll_inode_blksize(inode));
/* Strip setuid (and setgid-when-group-exec) bits from the inode mode when
 * the current task lacks CAP_FSETID — mirrors the kernel's remove_suid
 * behavior for writes by unprivileged users. */
727 static inline void ll_remove_suid(struct inode *inode)
731 /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
732 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
734 /* was any of the uid bits set? */
735 mode &= inode->i_mode;
736 if (mode && !capable(CAP_FSETID)) {
737 inode->i_mode &= ~mode;
738 // XXX careful here - we cannot change the size
/* Map an OST extent @lock back to the stripe index it covers within the
 * file's LOV layout.  Single-stripe files short-circuit to stripe 0; else
 * the LOV is queried via obd_get_info("lock_to_stripe").  The result is
 * sanity-checked against the lock's resource name (object id at name[0],
 * group at name[2]); mismatch returns -ELDLM_NO_LOCK_DATA.
 * NOTE(review): the successful-return line is elided from this listing. */
742 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
744 struct ll_inode_info *lli = ll_i2info(inode);
745 struct lov_stripe_md *lsm = lli->lli_smd;
746 struct obd_export *exp = ll_i2dtexp(inode);
749 struct ldlm_lock *lock;
750 struct lov_stripe_md *lsm;
751 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
752 __u32 stripe, vallen = sizeof(stripe);
756 if (lsm->lsm_stripe_count == 1)
757 GOTO(check, stripe = 0);
759 /* get our offset in the lov */
760 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
762 CERROR("obd_get_info: rc = %d\n", rc);
765 LASSERT(stripe < lsm->lsm_stripe_count);
768 if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
769 lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[2]){
770 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
771 lsm->lsm_oinfo[stripe]->loi_id,
772 lsm->lsm_oinfo[stripe]->loi_gr);
773 RETURN(-ELDLM_NO_LOCK_DATA);
779 /* Flush the page cache for an extent as its canceled. When we're on an LOV,
780 * we get a lock cancellation for each stripe, so we have to map the obd's
781 * region back onto the stripes in the file that it held.
783 * No one can dirty the extent until we've finished our work and they can
784 * enqueue another lock. The DLM protects us from ll_file_read/write here,
785 * but other kernel actors could have pages locked.
787 * Called with the DLM lock held. */
788 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
789 struct ldlm_lock *lock, __u32 stripe)
791 ldlm_policy_data_t tmpex;
792 unsigned long start, end, count, skip, i, j;
794 int rc, rc2, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
795 struct lustre_handle lockh;
796 struct address_space *mapping = inode->i_mapping;
799 tmpex = lock->l_policy_data;
800 CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
801 inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
804 /* our locks are page granular thanks to osc_enqueue, we invalidate the
806 if ((tmpex.l_extent.start & ~CFS_PAGE_MASK) != 0 ||
807 ((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) != 0)
808 LDLM_ERROR(lock, "lock not aligned on PAGE_SIZE %lu",
810 LASSERT((tmpex.l_extent.start & ~CFS_PAGE_MASK) == 0);
811 LASSERT(((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) == 0);
/* translate the per-stripe OST extent into file page indices:
 * pages of this stripe recur every `count + skip` file pages */
815 start = tmpex.l_extent.start >> CFS_PAGE_SHIFT;
816 end = tmpex.l_extent.end >> CFS_PAGE_SHIFT;
817 if (lsm->lsm_stripe_count > 1) {
818 count = lsm->lsm_stripe_size >> CFS_PAGE_SHIFT;
819 skip = (lsm->lsm_stripe_count - 1) * count;
820 start += start/count * skip + stripe * count;
822 end += end/count * skip + stripe * count;
824 if (end < tmpex.l_extent.end >> CFS_PAGE_SHIFT)
827 i = i_size_read(inode) ? (__u64)(i_size_read(inode) - 1) >>
832 CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
833 "count: %lu skip: %lu end: %lu%s\n", start, start % count,
834 count, skip, end, discard ? " (DISCARDING)" : "");
836 /* walk through the vmas on the inode and tear down mmaped pages that
837 * intersect with the lock. this stops immediately if there are no
838 * mmap()ed regions of the file. This is not efficient at all and
839 * should be short lived. We'll associate mmap()ed pages with the lock
840 * and will be able to find them directly */
841 for (i = start; i <= end; i += (j + skip)) {
842 j = min(count - (i % count), end - i + 1);
845 if (ll_teardown_mmaps(mapping,
846 (__u64)i << CFS_PAGE_SHIFT,
847 ((__u64)(i+j) << CFS_PAGE_SHIFT) - 1) )
851 /* this is the simplistic implementation of page eviction at
852 * cancelation. It is careful to get races with other page
853 * lockers handled correctly. fixes from bug 20 will make it
854 * more efficient by associating locks with pages and with
855 * batching writeback under the lock explicitly. */
856 for (i = start, j = start % count; i <= end;
857 j++, i++, tmpex.l_extent.start += CFS_PAGE_SIZE) {
859 CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
865 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
866 LPU64" >= "LPU64" start %lu i %lu end %lu\n",
867 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
870 if (!mapping_has_pages(mapping)) {
871 CDEBUG(D_INODE|D_PAGE, "nothing left\n");
877 page = find_lock_page(mapping, i);
880 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
881 i, tmpex.l_extent.start);
882 if (!discard && PageWriteback(page))
883 wait_on_page_writeback(page);
885 /* page->mapping to check with racing against teardown */
886 if (!discard && clear_page_dirty_for_io(page)) {
887 rc = ll_call_writepage(inode, page);
888 /* either waiting for io to complete or reacquiring
889 * the lock that the failed writepage released */
891 wait_on_page_writeback(page);
893 CERROR("writepage inode %lu(%p) of page %p "
894 "failed: %d\n", inode->i_ino, inode,
897 set_bit(AS_ENOSPC, &mapping->flags);
899 set_bit(AS_EIO, &mapping->flags);
903 tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1;
904 /* check to see if another DLM lock covers this page b=2765 */
905 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
906 LDLM_FL_BLOCK_GRANTED|LDLM_FL_CBPENDING |
908 &lock->l_resource->lr_name, LDLM_EXTENT,
909 &tmpex, LCK_PR | LCK_PW, &lockh);
/* no other lock protects the page: drop it from the cache */
911 if (rc2 <= 0 && page->mapping != NULL) {
912 struct ll_async_page *llap = llap_cast_private(page);
913 /* checking again to account for writeback's
915 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
917 ll_ra_accounting(llap, mapping);
918 ll_truncate_complete_page(page);
921 page_cache_release(page);
923 LASSERTF(tmpex.l_extent.start <=
924 (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
925 lock->l_policy_data.l_extent.end + 1),
926 "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
927 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
/* DLM blocking/cancel callback for OST extent locks.
 * LDLM_CB_BLOCKING: cancel our lock so the conflicting one can be granted.
 * LDLM_CB_CANCELING: for a granted lock, flush/evict the covered pages
 * (ll_pgcache_remove_extent) and shrink the stripe's known-minimum-size
 * (kms) under the stripe lock + lock resource lock.
 * NOTE(review): elided lines hide the switch header and return paths. */
932 static int ll_extent_lock_callback(struct ldlm_lock *lock,
933 struct ldlm_lock_desc *new, void *data,
936 struct lustre_handle lockh = { 0 };
/* small non-NULL data values indicate a corrupted cbdata pointer */
940 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
941 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
946 case LDLM_CB_BLOCKING:
947 ldlm_lock2handle(lock, &lockh);
948 rc = ldlm_cli_cancel(&lockh);
950 CERROR("ldlm_cli_cancel failed: %d\n", rc);
952 case LDLM_CB_CANCELING: {
954 struct ll_inode_info *lli;
955 struct lov_stripe_md *lsm;
959 /* This lock wasn't granted, don't try to evict pages */
960 if (lock->l_req_mode != lock->l_granted_mode)
963 inode = ll_inode_from_lock(lock);
966 lli = ll_i2info(inode);
969 if (lli->lli_smd == NULL)
973 stripe = ll_lock_to_stripe_offset(inode, lock);
977 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
979 lov_stripe_lock(lsm);
980 lock_res_and_lock(lock);
981 kms = ldlm_extent_shift_kms(lock,
982 lsm->lsm_oinfo[stripe]->loi_kms);
984 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
985 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
986 lsm->lsm_oinfo[stripe]->loi_kms, kms);
987 lsm->lsm_oinfo[stripe]->loi_kms = kms;
988 unlock_res_and_lock(lock);
989 lov_stripe_unlock(lsm);
/* Completion AST for client-side async extent enqueue.  Blocked states are
 * not expected (LBUG).  On grant/glimpse: update the stripe's rss from the
 * lock-value block and raise kms, then wake waiters and drop the PR ref.
 * NOTE(review): here lsm_oinfo[stripe] is accessed with '.' while other
 * functions in this file use '->' — one of the two reflects a different
 * lsm_oinfo layout; confirm against the full tree before touching. */
1002 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
1004 /* XXX ALLOCATE - 160 bytes */
1005 struct inode *inode = ll_inode_from_lock(lock);
1006 struct ll_inode_info *lli = ll_i2info(inode);
1007 struct lustre_handle lockh = { 0 };
1008 struct ost_lvb *lvb;
1012 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
1013 LDLM_FL_BLOCK_CONV)) {
1014 LBUG(); /* not expecting any blocked async locks yet */
1015 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
1017 ldlm_lock_dump(D_OTHER, lock, 0);
1018 ldlm_reprocess_all(lock->l_resource);
1022 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
1024 stripe = ll_lock_to_stripe_offset(inode, lock);
1028 if (lock->l_lvb_len) {
1029 struct lov_stripe_md *lsm = lli->lli_smd;
/* lvb carries the server's view of the object size */
1031 lvb = lock->l_lvb_data;
1032 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
1034 lock_res_and_lock(lock);
1035 ll_inode_size_lock(inode, 1);
1036 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
1037 kms = ldlm_extent_shift_kms(NULL, kms);
1038 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
1039 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
1040 lsm->lsm_oinfo[stripe].loi_kms, kms);
1041 lsm->lsm_oinfo[stripe].loi_kms = kms;
1042 ll_inode_size_unlock(inode, 1);
1043 unlock_res_and_lock(lock);
1048 wake_up(&lock->l_waitq);
1050 ldlm_lock2handle(lock, &lockh);
1051 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: another client wants this file's size.  Resolve the stripe
 * this lock covers and reply with an ost_lvb holding our kms and inode
 * times.  Races where the inode/lsm/stripe are gone answer with
 * -ELDLM_NO_LOCK_DATA (packed quietly to avoid console noise). */
1056 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
1058 struct ptlrpc_request *req = reqp;
1059 struct inode *inode = ll_inode_from_lock(lock);
1060 struct ll_inode_info *lli;
1061 struct lov_stripe_md *lsm;
1067 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
1068 lli = ll_i2info(inode);
1070 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1073 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1075 /* First, find out which stripe index this lock corresponds to. */
1076 stripe = ll_lock_to_stripe_offset(inode, lock);
1078 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1080 req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
1081 req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
1083 rc = req_capsule_server_pack(&req->rq_pill);
1085 CERROR("lustre_pack_reply: %d\n", rc);
/* fill the reply LVB with our known-minimum-size and inode times */
1089 lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB);
1090 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
1091 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1092 lvb->lvb_atime = LTIME_S(inode->i_atime);
1093 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1095 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1096 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1097 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
1098 lvb->lvb_atime, lvb->lvb_ctime);
1103 /* These errors are normal races, so we don't want to fill the console
1104 * with messages by calling ptlrpc_error() */
1105 if (rc == -ELDLM_NO_LOCK_DATA)
1106 lustre_pack_reply(req, 1, NULL, NULL);
1108 req->rq_status = rc;
/* Merge the per-stripe lock-value blocks into the inode: under the size
 * lock, combine stripe LVBs via obd_merge_lvb() and write the resulting
 * size, blocks, and m/a/ctime into the inode. */
1112 static int ll_merge_lvb(struct inode *inode)
1114 struct ll_inode_info *lli = ll_i2info(inode);
1115 struct ll_sb_info *sbi = ll_i2sbi(inode);
1121 ll_inode_size_lock(inode, 1);
1122 inode_init_lvb(inode, &lvb);
1123 rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1124 i_size_write(inode, lvb.lvb_size);
1125 inode->i_blocks = lvb.lvb_blocks;
1127 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1128 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1129 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1130 ll_inode_size_unlock(inode, 1);
/* Try to compute the file size from locally cached [0, EOF] PR extent
 * locks only: obd_match() looks for already-granted locks, ll_merge_lvb()
 * folds them into the inode, then the match reference is dropped via
 * obd_cancel().  Zero-stripe files take the (elided) early-out. */
1135 int ll_local_size(struct inode *inode)
1137 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1138 struct ll_inode_info *lli = ll_i2info(inode);
1139 struct ll_sb_info *sbi = ll_i2sbi(inode);
1140 struct lustre_handle lockh = { 0 };
1145 if (lli->lli_smd->lsm_stripe_count == 0)
1148 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1149 &policy, LCK_PR, &flags, inode, &lockh);
1155 rc = ll_merge_lvb(inode);
1156 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh);
/* Glimpse a file described by an explicit @lsm (ioctl path, no inode) and
 * fill @st with the merged size/blocks/times.  Uses an intent-only PR
 * extent enqueue (LDLM_FL_HAS_INTENT) so no conflicting locks are revoked,
 * then merges stripe LVBs under the stripe lock.
 * NOTE(review): the `st` parameter declaration line is elided here. */
1160 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1163 struct lustre_handle lockh = { 0 };
1164 struct ldlm_enqueue_info einfo = { 0 };
1165 struct obd_info oinfo = { { { 0 } } };
1171 einfo.ei_type = LDLM_EXTENT;
1172 einfo.ei_mode = LCK_PR;
1173 einfo.ei_cb_bl = ll_extent_lock_callback;
1174 einfo.ei_cb_cp = ldlm_completion_ast;
1175 einfo.ei_cb_gl = ll_glimpse_callback;
1176 einfo.ei_cbdata = NULL;
1178 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1179 oinfo.oi_lockh = &lockh;
1181 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
1183 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1187 CERROR("obd_enqueue returned rc %d, "
1188 "returning -EIO\n", rc);
1189 RETURN(rc > 0 ? -EIO : rc);
1192 lov_stripe_lock(lsm);
1193 memset(&lvb, 0, sizeof(lvb));
1194 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1195 st->st_size = lvb.lvb_size;
1196 st->st_blocks = lvb.lvb_blocks;
1197 st->st_mtime = lvb.lvb_mtime;
1198 st->st_atime = lvb.lvb_atime;
1199 st->st_ctime = lvb.lvb_ctime;
1200 lov_stripe_unlock(lsm);
/* NB: obd_merge_lvb will prefer locally cached writes if they extend the
 * file (because it prefers KMS over RSS when larger) */
/*
 * Obtain an up-to-date file size by glimpsing every client that holds a
 * DLM extent lock on this file, then merging the returned lvbs into the
 * inode via ll_merge_lvb().  'ast_flags' is OR-ed into the enqueue flags.
 */
int ll_glimpse_size(struct inode *inode, int ast_flags)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct lustre_handle lockh = { 0 };
        struct ldlm_enqueue_info einfo = { 0 };
        struct obd_info oinfo = { { { 0 } } };
        /* MDS holds an authoritative size lock; no OST glimpse needed */
        if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
        CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
        if (!lli->lli_smd) {
                CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
        /* NOTE: this looks like DLM lock request, but it may not be one. Due
         *       to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
         *       won't revoke any conflicting DLM locks held. Instead,
         *       ll_glimpse_callback() will be called on each client
         *       holding a DLM lock against this file, and resulting size
         *       will be returned for each stripe. DLM lock on [0, EOF] is
         *       acquired only if there were no conflicting locks. */
        einfo.ei_type = LDLM_EXTENT;
        einfo.ei_mode = LCK_PR;
        einfo.ei_cb_bl = ll_extent_lock_callback;
        einfo.ei_cb_cp = ldlm_completion_ast;
        einfo.ei_cb_gl = ll_glimpse_callback;
        einfo.ei_cbdata = inode;
        oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
        oinfo.oi_lockh = &lockh;
        oinfo.oi_md = lli->lli_smd;
        oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
        rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
                CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
                RETURN(rc > 0 ? -EIO : rc);
        rc = ll_merge_lvb(inode);
        CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
               i_size_read(inode), (unsigned long long)inode->i_blocks);
/*
 * Take a DLM extent lock of the given 'mode' over *policy on the file's
 * stripes, returning the granted handle in 'lockh' and the (possibly
 * widened) granted extent back in *policy.  After the grant, the inode
 * size/times are refreshed from the merged lvbs.
 *
 * Locking may be skipped entirely for O_DIRECT files with
 * LL_FILE_IGNORE_LOCK or for nolock mounts (LL_SBI_NOLCK).
 */
int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
                   struct lov_stripe_md *lsm, int mode,
                   ldlm_policy_data_t *policy, struct lustre_handle *lockh,
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ldlm_enqueue_info einfo = { 0 };
        struct obd_info oinfo = { { { 0 } } };
        LASSERT(!lustre_handle_is_used(lockh));
        LASSERT(lsm != NULL);
        /* don't drop the mmapped file to LRU */
        if (mapping_mapped(inode->i_mapping))
                ast_flags |= LDLM_FL_NO_LRU;
        /* XXX phil: can we do this? won't it screw the file size up? */
        if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
            (sbi->ll_flags & LL_SBI_NOLCK))
        CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
               inode->i_ino, policy->l_extent.start, policy->l_extent.end);
        einfo.ei_type = LDLM_EXTENT;
        einfo.ei_mode = mode;
        einfo.ei_cb_bl = ll_extent_lock_callback;
        einfo.ei_cb_cp = ldlm_completion_ast;
        einfo.ei_cb_gl = ll_glimpse_callback;
        einfo.ei_cbdata = inode;
        oinfo.oi_policy = *policy;
        oinfo.oi_lockh = lockh;
        oinfo.oi_flags = ast_flags;
        rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo, NULL);
        /* hand the granted (possibly enlarged) extent back to the caller */
        *policy = oinfo.oi_policy;
        ll_inode_size_lock(inode, 1);
        inode_init_lvb(inode, &lvb);
        obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
        if (policy->l_extent.start == 0 &&
            policy->l_extent.end == OBD_OBJECT_EOF) {
                /* vmtruncate()->ll_truncate() first sets the i_size and then
                 * the kms under both a DLM lock and the
                 * ll_inode_size_lock(). If we don't get the
                 * ll_inode_size_lock() here we can match the DLM lock and
                 * reset i_size from the kms before the truncating path has
                 * updated the kms. generic_file_write can then trust the
                 * stale i_size when doing appending writes and effectively
                 * cancel the result of the truncate. Getting the
                 * ll_inode_size_lock() after the enqueue maintains the DLM
                 * -> ll_inode_size_lock() acquiring order. */
                i_size_write(inode, lvb.lvb_size);
                CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
                       inode->i_ino, i_size_read(inode));
        LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
        LTIME_S(inode->i_atime) = lvb.lvb_atime;
        LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
        ll_inode_size_unlock(inode, 1);
/*
 * Release an extent lock previously taken by ll_extent_lock().  Mirrors
 * the lock path: if locking was skipped (LL_FILE_IGNORE_LOCK on the fd or
 * a nolock mount), there is nothing to cancel.
 */
int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
                     struct lov_stripe_md *lsm, int mode,
                     struct lustre_handle *lockh)
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        /* XXX phil: can we do this? won't it screw the file size up? */
        if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
            (sbi->ll_flags & LL_SBI_NOLCK))
        rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/*
 * Mark the file as contended (other clients are competing for its extent
 * locks) and record the time, so ll_is_file_contended() can switch I/O to
 * the lockless/server-side path until the contention interval expires.
 */
static void ll_set_file_contended(struct inode *inode)
        struct ll_inode_info *lli = ll_i2info(inode);
        cfs_time_t now = cfs_time_current();
        /* lli_lock protects lli_flags and lli_contention_time */
        spin_lock(&lli->lli_lock);
        lli->lli_contention_time = now;
        lli->lli_flags |= LLIF_CONTENDED;
        spin_unlock(&lli->lli_lock);
/* Clear the contended state set by ll_set_file_contended(). */
void ll_clear_file_contended(struct inode *inode)
        struct ll_inode_info *lli = ll_i2info(inode);
        spin_lock(&lli->lli_lock);
        lli->lli_flags &= ~LLIF_CONTENDED;
        spin_unlock(&lli->lli_lock);
/*
 * Decide whether this file should currently be treated as contended.
 * Requires server-side SRVLOCK support; files opened with
 * LL_FILE_IGNORE_LOCK never count as contended.  A stale LLIF_CONTENDED
 * flag is cleared once sbi->ll_contention_time seconds have elapsed
 * since the contention was recorded.
 */
static int ll_is_file_contended(struct file *file)
        struct inode *inode = file->f_dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
                CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
                       " osc connect flags = 0x"LPX64"\n",
                       sbi->ll_lco.lco_flags);
        if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
        if (lli->lli_flags & LLIF_CONTENDED) {
                cfs_time_t cur_time = cfs_time_current();
                cfs_time_t retry_time;
                /* contention expires ll_contention_time seconds after it
                 * was last observed */
                retry_time = cfs_time_add(
                        lli->lli_contention_time,
                        cfs_time_seconds(sbi->ll_contention_time));
                if (cfs_time_after(cur_time, retry_time)) {
                        ll_clear_file_contended(inode);
/*
 * Acquire the extent lock tree covering [start, end] for a read or write.
 * Returns >0 (tree_locked) when the DLM lock tree was taken, 0 when the
 * caller should fall back to lockless I/O (file contended or lock denied
 * on contention), or a negative errno.
 *
 * Appending writes always take the lock ([start, EOF]) since the final
 * write offset is unknown; non-append I/O on a contended file skips it.
 */
static int ll_file_get_tree_lock(struct ll_lock_tree *tree, struct file *file,
                                 const char *buf, size_t count,
                                 loff_t start, loff_t end, int rw)
        int tree_locked = 0;
        struct inode * inode = file->f_dentry->d_inode;
        append = (rw == WRITE) && (file->f_flags & O_APPEND);
        if (append || !ll_is_file_contended(file)) {
                struct ll_lock_tree_node *node;
                /* non-append locks are dropped if contention is detected */
                ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
                if (file->f_flags & O_NONBLOCK)
                        ast_flags |= LDLM_FL_BLOCK_NOWAIT;
                node = ll_node_from_inode(inode, start, end,
                                          (rw == WRITE) ? LCK_PW : LCK_PR);
                tree->lt_fd = LUSTRE_FPRIVATE(file);
                rc = ll_tree_lock(tree, node, buf, count, ast_flags);
                /* -EUSERS: lock denied on contention; remember that and
                 * let the caller use the lockless path */
                else if (rc == -EUSERS)
                        ll_set_file_contended(inode);
        RETURN(tree_locked);
/*
 * Read from a file through the page cache.
 *
 * The read region is covered by a DLM extent lock tree (unless the file
 * is contended, in which case lockless I/O is used).  Large reads may be
 * split into per-stripe chunks bounded by sbi->ll_max_rw_chunk; the outer
 * loop (partially elided in this view) repeats until 'count' is consumed.
 */
static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
        struct inode *inode = file->f_dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lov_stripe_md *lsm = lli->lli_smd;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ll_lock_tree tree;
        struct ll_ra_read bead;
        ssize_t retval, chunk, sum = 0;
        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
               inode->i_ino, inode->i_generation, inode, count, *ppos);
        /* "If nbyte is 0, read() will return 0 and have no other results."
         * -- Single Unix Spec */
        ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
        /* Read on file with no objects should return zero-filled
         * buffers up to file size (we can get non-zero sizes with
         * mknod + truncate, then opening file for read. This is a
         * common pattern in NFS case, it seems). Bug 6243 */
        /* Since there are no objects on OSTs, we have nothing to get
         * lock on and so we are forced to access inode->i_size
        /* Read beyond end of file */
        if (*ppos >= i_size_read(inode))
        if (count > i_size_read(inode) - *ppos)
                count = i_size_read(inode) - *ppos;
        /* Make sure to correctly adjust the file pos pointer for
        notzeroed = clear_user(buf, count);
        if (sbi->ll_max_rw_chunk != 0) {
                /* first, let's know the end of the current stripe */
                obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
                /* correct, the end is beyond the request */
                if (end > *ppos + count - 1)
                        end = *ppos + count - 1;
                /* and chunk shouldn't be too large even if striping is wide */
                if (end - *ppos > sbi->ll_max_rw_chunk)
                        end = *ppos + sbi->ll_max_rw_chunk - 1;
                end = *ppos + count - 1;
        tree_locked = ll_file_get_tree_lock(&tree, file, buf,
                                            count, *ppos, end, READ);
        if (tree_locked < 0)
                GOTO(out, retval = tree_locked);
        ll_inode_size_lock(inode, 1);
         * Consistency guarantees: following possibilities exist for the
         * relation between region being read and real file size at this
         * (A): the region is completely inside of the file;
         * (B-x): x bytes of region are inside of the file, the rest is
         * (C): the region is completely outside of the file.
         * This classification is stable under DLM lock acquired by
         * ll_tree_lock() above, because to change class, other client has to
         * take DLM lock conflicting with our lock. Also, any updates to
         * ->i_size by other threads on this client are serialized by
         * ll_inode_size_lock(). This guarantees that short reads are handled
         * correctly in the face of concurrent writes and truncates.
        inode_init_lvb(inode, &lvb);
        obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
        if (*ppos + count - 1 > kms) {
                /* A glimpse is necessary to determine whether we return a
                 * short read (B) or some zeroes at the end of the buffer (C) */
                ll_inode_size_unlock(inode, 1);
                retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
                ll_tree_unlock(&tree);
        /* region is within kms and, hence, within real file size (A).
         * We need to increase i_size to cover the read region so that
         * generic_file_read() will do its job, but that doesn't mean
         * the kms size is _correct_, it is only the _minimum_ size.
         * If someone does a stat they will get the correct size which
         * will always be >= the kms value here. b=11081 */
        if (i_size_read(inode) < kms)
                i_size_write(inode, kms);
        ll_inode_size_unlock(inode, 1);
        chunk = end - *ppos + 1;
        CDEBUG(D_INODE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
               inode->i_ino, chunk, *ppos, i_size_read(inode));
        /* turn off the kernel's read-ahead */
        file->f_ra.ra_pages = 0;
        /* initialize read-ahead window once per syscall */
        bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
        bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
        ll_ra_read_in(file, &bead);
        file_accessed(file);
        retval = generic_file_read(file, buf, chunk, ppos);
        ll_tree_unlock(&tree);
        /* lockless path when the tree lock was not taken */
        retval = ll_file_lockless_io(file, buf, chunk, ppos, READ);
        ll_rw_stats_tally(sbi, current->pid, file, chunk, 0);
        /* full chunk consumed and more requested: loop for the next chunk */
        if (retval == chunk && count > 0)
        ll_ra_read_ex(file, &bead);
        retval = (sum > 0) ? sum : retval;
 * Write to a file (through the page cache).
 *
 * Serialized per-inode by lli_write_sem.  O_APPEND writes lock [pos, EOF]
 * since the final offset is only known after the i_size update; other
 * writes lock just the written extent, optionally chunked per stripe by
 * sbi->ll_max_rw_chunk.  Falls back to lockless I/O when contended.
static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
        struct inode *inode = file->f_dentry->d_inode;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
        struct ll_lock_tree tree;
        loff_t maxbytes = ll_file_maxbytes(inode);
        loff_t lock_start, lock_end, end;
        ssize_t retval, chunk, sum = 0;
        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
               inode->i_ino, inode->i_generation, inode, count, *ppos);
        SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
        /* POSIX, but surprised the VFS doesn't check this already */
        /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
         * called on the file, don't fail the below assertion (bug 2388). */
        if (file->f_flags & O_LOV_DELAY_CREATE &&
            ll_i2info(inode)->lli_smd == NULL)
        LASSERT(ll_i2info(inode)->lli_smd != NULL);
        /* one writer at a time per inode on this client */
        down(&ll_i2info(inode)->lli_write_sem);
        chunk = 0; /* just to fix gcc's warning */
        end = *ppos + count - 1;
        if (file->f_flags & O_APPEND) {
                lock_end = OBD_OBJECT_EOF;
        } else if (sbi->ll_max_rw_chunk != 0) {
                /* first, let's know the end of the current stripe */
                obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
                /* correct, the end is beyond the request */
                if (end > *ppos + count - 1)
                        end = *ppos + count - 1;
                /* and chunk shouldn't be too large even if striping is wide */
                if (end - *ppos > sbi->ll_max_rw_chunk)
                        end = *ppos + sbi->ll_max_rw_chunk - 1;
                lock_end = *ppos + count - 1;
        tree_locked = ll_file_get_tree_lock(&tree, file, buf, count,
                                            lock_start, lock_end, WRITE);
        if (tree_locked < 0)
                GOTO(out, retval = tree_locked);
        /* This is ok, g_f_w will overwrite this under i_sem if it races
         * with a local truncate, it just makes our maxbyte checking easier.
         * The i_size value gets updated in ll_extent_lock() as a consequence
         * of the [0,EOF] extent lock we requested above. */
        if (file->f_flags & O_APPEND) {
                *ppos = i_size_read(inode);
                end = *ppos + count - 1;
        if (*ppos >= maxbytes) {
                send_sig(SIGXFSZ, current, 0);
                GOTO(out_unlock, retval = -EFBIG);
        if (end > maxbytes - 1)
        /* generic_file_write handles O_APPEND after getting i_mutex */
        chunk = end - *ppos + 1;
        CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
               inode->i_ino, chunk, *ppos);
        retval = generic_file_write(file, buf, chunk, ppos);
        /* lockless path when the tree lock was not taken */
        retval = ll_file_lockless_io(file, (char*)buf, chunk,
        ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
        ll_tree_unlock(&tree);
        /* full chunk written and more remaining: loop for the next chunk */
        if (retval == chunk && count > 0)
        up(&ll_i2info(inode)->lli_write_sem);
        retval = (sum > 0) ? sum : retval;
        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
                           retval > 0 ? retval : 0);
 * Send file content (through pagecache) somewhere with helper
 *
 * sendfile() analogue of ll_file_read(): takes a PR extent lock over the
 * sent region, establishes the same kms/i_size consistency as the read
 * path, then delegates to generic_file_sendfile().
static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
                                read_actor_t actor, void *target)
        struct inode *inode = in_file->f_dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lov_stripe_md *lsm = lli->lli_smd;
        struct ll_lock_tree tree;
        struct ll_lock_tree_node *node;
        struct ll_ra_read bead;
        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
               inode->i_ino, inode->i_generation, inode, count, *ppos);
        /* "If nbyte is 0, read() will return 0 and have no other results."
         * -- Single Unix Spec */
        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
        /* turn off the kernel's read-ahead */
        in_file->f_ra.ra_pages = 0;
        /* File with no objects, nothing to lock */
        RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
        node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
        RETURN(PTR_ERR(node));
        tree.lt_fd = LUSTRE_FPRIVATE(in_file);
        rc = ll_tree_lock(&tree, node, NULL, count,
                          in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
        ll_clear_file_contended(inode);
        ll_inode_size_lock(inode, 1);
         * Consistency guarantees: following possibilities exist for the
         * relation between region being read and real file size at this
         * (A): the region is completely inside of the file;
         * (B-x): x bytes of region are inside of the file, the rest is
         * (C): the region is completely outside of the file.
         * This classification is stable under DLM lock acquired by
         * ll_tree_lock() above, because to change class, other client has to
         * take DLM lock conflicting with our lock. Also, any updates to
         * ->i_size by other threads on this client are serialized by
         * ll_inode_size_lock(). This guarantees that short reads are handled
         * correctly in the face of concurrent writes and truncates.
        inode_init_lvb(inode, &lvb);
        obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
        if (*ppos + count - 1 > kms) {
                /* A glimpse is necessary to determine whether we return a
                 * short read (B) or some zeroes at the end of the buffer (C) */
                ll_inode_size_unlock(inode, 1);
                retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
        /* region is within kms and, hence, within real file size (A) */
        i_size_write(inode, kms);
        ll_inode_size_unlock(inode, 1);
        CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
               inode->i_ino, count, *ppos, i_size_read(inode));
        /* set up the llite read-ahead window for the sent region */
        bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
        bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
        ll_ra_read_in(in_file, &bead);
        file_accessed(in_file);
        retval = generic_file_sendfile(in_file, ppos, count, actor, target);
        ll_ra_read_ex(in_file, &bead);
        ll_tree_unlock(&tree);
/*
 * LL_IOC_RECREATE_OBJ: recreate a (lost) OST object for this file with the
 * id/group/ost-index given by userspace in struct ll_recreate_obj.
 * Admin-only (CAP_SYS_ADMIN).  Works on a scratch copy of the stripe md
 * and passes OBD_FL_RECREATE_OBJS to obd_create().
 */
static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
        struct ll_inode_info *lli = ll_i2info(inode);
        struct obd_export *exp = ll_i2dtexp(inode);
        struct ll_recreate_obj ucreatp;
        struct obd_trans_info oti = { 0 };
        struct obdo *oa = NULL;
        struct lov_stripe_md *lsm, *lsm2;
        if (!capable (CAP_SYS_ADMIN))
        rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
                            sizeof(struct ll_recreate_obj));
        /* lli_size_sem stabilizes lli_smd while we copy it */
        down(&lli->lli_size_sem);
                GOTO(out, rc = -ENOENT);
        lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
                                   (lsm->lsm_stripe_count));
        OBD_ALLOC(lsm2, lsm_size);
                GOTO(out, rc = -ENOMEM);
        oa->o_id = ucreatp.lrc_id;
        oa->o_gr = ucreatp.lrc_group;
        /* o_nlink carries the target OST index on the recreate path */
        oa->o_nlink = ucreatp.lrc_ost_idx;
        oa->o_flags |= OBD_FL_RECREATE_OBJS;
        oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
        obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
                        OBD_MD_FLMTIME | OBD_MD_FLCTIME);
        memcpy(lsm2, lsm, lsm_size);
        rc = obd_create(exp, oa, &lsm2, &oti);
        OBD_FREE(lsm2, lsm_size);
        up(&lli->lli_size_sem);
/*
 * Set the striping EA on a file by re-opening it on the MDS with an
 * IT_OPEN intent carrying the lov_user_md.  Fails if a stripe md already
 * exists (striping can only be set once, before any objects exist).
 */
int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
                             int flags, struct lov_user_md *lum, int lum_size)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lov_stripe_md *lsm;
        struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
        /* lli_size_sem guards the lli_smd existence check */
        down(&lli->lli_size_sem);
        up(&lli->lli_size_sem);
        CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
        rc = ll_intent_file_open(file, lum, lum_size, &oit);
        if (it_disposition(&oit, DISP_LOOKUP_NEG))
                GOTO(out_req_free, rc = -ENOENT);
        rc = oit.d.lustre.it_status;
                GOTO(out_req_free, rc);
        /* close the open handle the intent open created on the MDS */
        ll_release_openhandle(file->f_dentry, &oit);
        up(&lli->lli_size_sem);
        ll_intent_release(&oit);
        ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping descriptor) of 'filename' from the MDS via
 * md_getattr_name(), byte-swap it to host endianness if needed, and for
 * LOV_MAGIC_JOIN files convert the on-wire form into a user-visible
 * lov_user_md_join with per-stripe extents filled in.
 *
 * On success *lmmp/*lmm_size describe the EA inside *request; the caller
 * owns the request reference and must release it.
 */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
                             struct lov_mds_md **lmmp, int *lmm_size,
                             struct ptlrpc_request **request)
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct mdt_body *body;
        struct lov_mds_md *lmm = NULL;
        struct ptlrpc_request *req = NULL;
        struct obd_capa *oc;
        rc = ll_get_max_mdsize(sbi, &lmmsize);
        oc = ll_mdscapa_get(inode);
        rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
                             oc, filename, strlen(filename) + 1,
                             OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
                             ll_i2suppgid(inode), &req);
                CDEBUG(D_INFO, "md_getattr_name failed "
                       "on %s: rc %d\n", filename, rc);
        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
        LASSERT(body != NULL); /* checked by mdc_getattr_name */
        lmmsize = body->eadatasize;
        /* no striping EA present at all */
        if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
                GOTO(out, rc = -ENODATA);
        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
        LASSERT(lmm != NULL);
         * This is coming from the MDS, so is probably in
         * little endian. We convert it to host endian before
         * passing it to userspace.
        if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
                lustre_swab_lov_user_md((struct lov_user_md *)lmm);
                lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
        } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
                lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
        if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
                struct lov_stripe_md *lsm;
                struct lov_user_md_join *lmj;
                int lmj_size, i, aindex = 0;
                /* unpack to an in-memory lsm so per-stripe extents and
                 * object ids can be read out below */
                rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
                        GOTO(out, rc = -ENOMEM);
                rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
                        GOTO(out_free_memmd, rc);
                lmj_size = sizeof(struct lov_user_md_join) +
                           lsm->lsm_stripe_count *
                           sizeof(struct lov_user_ost_data_join);
                OBD_ALLOC(lmj, lmj_size);
                        GOTO(out_free_memmd, rc = -ENOMEM);
                memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
                for (i = 0; i < lsm->lsm_stripe_count; i++) {
                        struct lov_extent *lex =
                                &lsm->lsm_array->lai_ext_array[aindex];
                        /* advance to the extent that contains stripe i */
                        if (lex->le_loi_idx + lex->le_stripe_count <= i)
                        CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
                               LPU64" len %d\n", aindex, i,
                               lex->le_start, (int)lex->le_len);
                        lmj->lmm_objects[i].l_extent_start =
                        /* le_len == -1 marks an extent open to EOF */
                        if ((int)lex->le_len == -1)
                                lmj->lmm_objects[i].l_extent_end = -1;
                                lmj->lmm_objects[i].l_extent_end =
                                        lex->le_start + lex->le_len;
                        lmj->lmm_objects[i].l_object_id =
                                lsm->lsm_oinfo[i]->loi_id;
                        lmj->lmm_objects[i].l_object_gr =
                                lsm->lsm_oinfo[i]->loi_gr;
                        lmj->lmm_objects[i].l_ost_gen =
                                lsm->lsm_oinfo[i]->loi_ost_gen;
                        lmj->lmm_objects[i].l_ost_idx =
                                lsm->lsm_oinfo[i]->loi_ost_idx;
                lmm = (struct lov_mds_md *)lmj;
                obd_free_memmd(sbi->ll_dt_exp, &lsm);
        *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA: set an explicit striping EA (including object ids)
 * supplied by userspace.  Admin-only, since arbitrary object ids can be
 * specified; delegates to ll_lov_setstripe_ea_info() with
 * MDS_OPEN_HAS_OBJS so the MDS does not allocate new objects.
 */
static int ll_lov_setea(struct inode *inode, struct file *file,
        int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
        struct lov_user_md *lump;
        int lum_size = sizeof(struct lov_user_md) +
                       sizeof(struct lov_user_ost_data);
        if (!capable (CAP_SYS_ADMIN))
        OBD_ALLOC(lump, lum_size);
        rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
        OBD_FREE(lump, lum_size);
        rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
        OBD_FREE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE: set the striping pattern (count/size/offset) for
 * a new file, then copy the resulting stripe info back to userspace.
 */
static int ll_lov_setstripe(struct inode *inode, struct file *file,
        struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
        int flags = FMODE_WRITE;
        /* Bug 1152: copy properly when this is no longer true */
        LASSERT(sizeof(lum) == sizeof(*lump));
        LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
        rc = copy_from_user(&lum, lump, sizeof(lum));
        rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
        /* ask for stripe metadata only, not per-object data */
        put_user(0, &lump->lmm_stripe_count);
        rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
                           0, ll_i2info(inode)->lli_smd, lump);
/*
 * LL_IOC_LOV_GETSTRIPE: copy this file's striping information to the
 * userspace buffer via the LOV's iocontrol handler.
 */
static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
        struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
        return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
/*
 * LL_IOC_GROUP_LOCK: take a [0, EOF] GROUP-mode extent lock with group id
 * 'arg'.  While held, the fd is marked LL_FILE_GROUP_LOCKED and normal
 * extent locking is bypassed (LL_FILE_IGNORE_LOCK); the handle is saved
 * in fd_cwlockh for the matching ll_put_grouplock().
 */
static int ll_get_grouplock(struct inode *inode, struct file *file,
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        ldlm_policy_data_t policy = { .l_extent = { .start = 0,
                                                    .end = OBD_OBJECT_EOF}};
        struct lustre_handle lockh = { 0 };
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lov_stripe_md *lsm = lli->lli_smd;
        /* already group-locked on this fd */
        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
        policy.l_extent.gid = arg;
        if (file->f_flags & O_NONBLOCK)
                flags = LDLM_FL_BLOCK_NOWAIT;
        rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
        fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
        memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock taken by ll_get_grouplock().
 * The supplied gid must match the one the fd holds.
 */
static int ll_put_grouplock(struct inode *inode, struct file *file,
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lov_stripe_md *lsm = lli->lli_smd;
        if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
                /* Ugh, it's already unlocked. */
        if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
        /* restore normal extent locking before dropping the DLM lock */
        fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
        rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
        memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/*
 * Validate a file-join request: server must support join, both inodes
 * must be regular files, head != tail, and the head's size must be a
 * multiple of JOIN_FILE_ALIGN (64K).
 */
static int join_sanity_check(struct inode *head, struct inode *tail)
        if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
                CERROR("server do not support join \n");
        if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
                CERROR("tail ino %lu and ino head %lu must be regular\n",
                       head->i_ino, tail->i_ino);
        if (head->i_ino == tail->i_ino) {
                CERROR("file %lu can not be joined to itself \n", head->i_ino);
        if (i_size_read(head) % JOIN_FILE_ALIGN) {
                CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
/*
 * Ask the MDS to join the tail file onto the head file by enqueueing an
 * IT_OPEN intent with O_JOIN_FILE.  The head's current size is passed in
 * op_data so the MDS knows where the tail's extents begin.
 */
static int join_file(struct inode *head_inode, struct file *head_filp,
                     struct file *tail_filp)
        struct dentry *tail_dentry = tail_filp->f_dentry;
        struct lookup_intent oit = {.it_op = IT_OPEN,
                                    .it_flags = head_filp->f_flags|O_JOIN_FILE};
        struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
                ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL };
        struct lustre_handle lockh;
        struct md_op_data *op_data;
        tail_dentry = tail_filp->f_dentry;
        /* 'data' carries the head size to the MDS via op_data */
        data = i_size_read(head_inode);
        op_data = ll_prep_md_op_data(NULL, head_inode,
                                     tail_dentry->d_parent->d_inode,
                                     tail_dentry->d_name.name,
                                     tail_dentry->d_name.len, 0,
                                     LUSTRE_OPC_ANY, &data);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));
        rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
                        op_data, &lockh, NULL, 0, 0);
        ll_finish_md_op_data(op_data);
        rc = oit.d.lustre.it_status;
        if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
                rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
                ptlrpc_req_finished((struct ptlrpc_request *)
                                    oit.d.lustre.it_data);
        if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
                ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
                oit.d.lustre.it_lock_mode = 0;
        /* close the open handle created by the intent open above */
        ll_release_openhandle(head_filp->f_dentry, &oit);
        ll_intent_release(&oit);
/*
 * LL_IOC_JOIN: append the file named 'filename_tail' onto 'head'.
 * Both files are locked [0, EOF] LCK_EX in ascending-ino order to avoid
 * lock-order deadlocks with a concurrent join in the other direction,
 * then the sanity checks and the MDS join_file() are performed.  Cleanup
 * is driven by 'cleanup_phase' falling through the switch in reverse
 * order of acquisition.
 */
static int ll_file_join(struct inode *head, struct file *filp,
                        char *filename_tail)
        struct inode *tail = NULL, *first = NULL, *second = NULL;
        struct dentry *tail_dentry;
        struct file *tail_filp, *first_filp, *second_filp;
        struct ll_lock_tree first_tree, second_tree;
        struct ll_lock_tree_node *first_node, *second_node;
        struct ll_inode_info *hlli = ll_i2info(head), *tlli;
        int rc = 0, cleanup_phase = 0;
        CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
               head->i_ino, head->i_generation, head, filename_tail);
        tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
        if (IS_ERR(tail_filp)) {
                CERROR("Can not open tail file %s", filename_tail);
                rc = PTR_ERR(tail_filp);
        tail = igrab(tail_filp->f_dentry->d_inode);
        tlli = ll_i2info(tail);
        tail_dentry = tail_filp->f_dentry;
        LASSERT(tail_dentry);
        /*reorder the inode for lock sequence*/
        first = head->i_ino > tail->i_ino ? head : tail;
        second = head->i_ino > tail->i_ino ? tail : head;
        first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
        second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
        CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
               head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
        first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
        if (IS_ERR(first_node)){
                rc = PTR_ERR(first_node);
        first_tree.lt_fd = first_filp->private_data;
        rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
        second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
        if (IS_ERR(second_node)){
                rc = PTR_ERR(second_node);
        second_tree.lt_fd = second_filp->private_data;
        rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
        rc = join_sanity_check(head, tail);
        rc = join_file(head, filp, tail_filp);
        /* unwind: each phase undoes one acquisition, falling through */
        switch (cleanup_phase) {
                ll_tree_unlock(&second_tree);
                obd_cancel_unused(ll_i2dtexp(second),
                                  ll_i2info(second)->lli_smd, 0, NULL);
                ll_tree_unlock(&first_tree);
                obd_cancel_unused(ll_i2dtexp(first),
                                  ll_i2info(first)->lli_smd, 0, NULL);
                filp_close(tail_filp, 0);
        /* on success the head's stripe md is now stale; drop it so it is
         * refetched from the MDS on next use */
        if (head && rc == 0) {
                obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
                hlli->lli_smd = NULL;
        CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/*
 * Close the MDS open handle created by a successful intent open when the
 * caller does not intend to keep the file open (e.g. setstripe, join).
 * No-op for the root dentry or when the intent carried no open.
 */
int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
        struct inode *inode = dentry->d_inode;
        struct obd_client_handle *och;
        /* Root ? Do nothing. */
        if (dentry->d_inode->i_sb->s_root == dentry)
        /* No open handle to close? Move away */
        if (!it_disposition(it, DISP_OPEN_OPEN))
        LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
        OBD_ALLOC(och, sizeof(*och));
                GOTO(out, rc = -ENOMEM);
        ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
                    ll_i2info(inode), it, och);
        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
        /* this one is in place of ll_file_open */
        ptlrpc_req_finished(it->d.lustre.it_data);
        it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Dispatch ioctl(2) calls on regular Lustre files.  Lustre-private
 * commands (flags, striping, group locks, join, statfs) are handled here;
 * tty ioctls are rejected, and unknown commands fall through to the
 * registered ll_iocontrol_call() handlers and finally to the OSC/LOV
 * obd_iocontrol() path.
 */
int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
               inode->i_generation, inode, cmd);
        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
        /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
        if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
        case LL_IOC_GETFLAGS:
                /* Get the current value of the file flags */
                return put_user(fd->fd_flags, (int *)arg);
        case LL_IOC_SETFLAGS:
        case LL_IOC_CLRFLAGS:
                /* Set or clear specific file flags */
                /* XXX This probably needs checks to ensure the flags are
                 * not abused, and to handle any flag side effects.
                if (get_user(flags, (int *) arg))
                if (cmd == LL_IOC_SETFLAGS) {
                        /* IGNORE_LOCK is only safe when the page cache is
                         * bypassed, hence the O_DIRECT requirement */
                        if ((flags & LL_FILE_IGNORE_LOCK) &&
                            !(file->f_flags & O_DIRECT)) {
                                CERROR("%s: unable to disable locking on "
                                       "non-O_DIRECT file\n", current->comm);
                        fd->fd_flags |= flags;
                        fd->fd_flags &= ~flags;
        case LL_IOC_LOV_SETSTRIPE:
                RETURN(ll_lov_setstripe(inode, file, arg));
        case LL_IOC_LOV_SETEA:
                RETURN(ll_lov_setea(inode, file, arg));
        case LL_IOC_LOV_GETSTRIPE:
                RETURN(ll_lov_getstripe(inode, arg));
        case LL_IOC_RECREATE_OBJ:
                RETURN(ll_lov_recreate_obj(inode, file, arg));
        case EXT3_IOC_GETFLAGS:
        case EXT3_IOC_SETFLAGS:
                RETURN(ll_iocontrol(inode, file, cmd, arg));
        case EXT3_IOC_GETVERSION_OLD:
        case EXT3_IOC_GETVERSION:
                RETURN(put_user(inode->i_generation, (int *)arg));
        /* LL_IOC_JOIN: copy the tail pathname in and join the files */
        ftail = getname((const char *)arg);
        RETURN(PTR_ERR(ftail));
        rc = ll_file_join(inode, file, ftail);
        case LL_IOC_GROUP_LOCK:
                RETURN(ll_get_grouplock(inode, file, arg));
        case LL_IOC_GROUP_UNLOCK:
                RETURN(ll_put_grouplock(inode, file, arg));
        case IOC_OBD_STATFS:
                RETURN(ll_obd_statfs(inode, (void *)arg));
        /* We need to special case any other ioctls we want to handle,
         * to send them to the MDS/OST as appropriate and to properly
         * network encode the arg field.
        case EXT3_IOC_SETVERSION_OLD:
        case EXT3_IOC_SETVERSION:
        case LL_IOC_FLUSHCTX:
                RETURN(ll_flush_ctx(inode));
        /* unknown command: try registered handlers, then the obd layer */
                    ll_iocontrol_call(inode, file, cmd, arg, &err))
                RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/* llseek handler for Lustre files.
 * For SEEK_END the file size must be cluster-coherent, so a glimpse RPC
 * refreshes i_size before it is read under the inode size lock; O_NONBLOCK
 * files request a non-blocking DLM match (LDLM_FL_BLOCK_NOWAIT).
 * NOTE(review): this excerpt is elided -- intermediate lines (glimpse error
 * handling, final RETURN, closing brace) are missing from view. */
2442 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2444 struct inode *inode = file->f_dentry->d_inode;
2445 struct ll_inode_info *lli = ll_i2info(inode);
2446 struct lov_stripe_md *lsm = lli->lli_smd;
     /* retval here is only used for the debug trace of the target offset */
2449 retval = offset + ((origin == 2) ? i_size_read(inode) :
2450 (origin == 1) ? file->f_pos : 0);
2451 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2452 inode->i_ino, inode->i_generation, inode, retval, retval,
2453 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2454 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2456 if (origin == 2) { /* SEEK_END */
2457 int nonblock = 0, rc;
2459 if (file->f_flags & O_NONBLOCK)
2460 nonblock = LDLM_FL_BLOCK_NOWAIT;
     /* fetch an up-to-date size from the OSTs before using i_size */
2463 rc = ll_glimpse_size(inode, nonblock);
2468 ll_inode_size_lock(inode, 0);
2469 offset += i_size_read(inode);
2470 ll_inode_size_unlock(inode, 0);
2471 } else if (origin == 1) { /* SEEK_CUR */
2472 offset += file->f_pos;
     /* only commit the new position if it is within the supported range */
2476 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2477 if (offset != file->f_pos) {
2478 file->f_pos = offset;
2479 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2481 file->f_version = ++event;
/* fsync handler: flush dirty pages, harvest any async write errors recorded
 * on the inode/stripes, then sync metadata through the MDC (md_sync) and,
 * for striped files, object data through the OSC (obd_sync).
 * NOTE(review): elided excerpt -- the OBD_MD_FLID allocation of 'oa', error
 * propagation, and the closing RETURN are missing from view. */
2490 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2492 struct inode *inode = dentry->d_inode;
2493 struct ll_inode_info *lli = ll_i2info(inode);
2494 struct lov_stripe_md *lsm = lli->lli_smd;
2495 struct ptlrpc_request *req;
2496 struct obd_capa *oc;
2499 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2500 inode->i_generation, inode);
2501 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2503 /* fsync's caller has already called _fdata{sync,write}, we want
2504 * that IO to finish before calling the osc and mdc sync methods */
2505 rc = filemap_fdatawait(inode->i_mapping);
2507 /* catch async errors that were recorded back when async writeback
2508 * failed for pages in this mapping. */
2509 err = lli->lli_async_rc;
2510 lli->lli_async_rc = 0;
     /* per-stripe async errors are cleared as they are collected */
2514 err = lov_test_and_clear_async_rc(lsm);
     /* metadata sync via MDS; capability (if any) authorizes the op */
2519 oc = ll_mdscapa_get(inode);
2520 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2526 ptlrpc_req_finished(req);
2533 RETURN(rc ? rc : -ENOMEM);
     /* fill the obdo so the OSTs know which object and attrs to sync */
2535 oa->o_id = lsm->lsm_object_id;
2536 oa->o_gr = lsm->lsm_object_gr;
2537 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2538 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2539 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
     /* sync the whole object range [0, EOF] on the OSTs */
2542 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2543 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2544 0, OBD_OBJECT_EOF, oc);
/* Cluster-coherent flock()/fcntl() locking: translate the kernel file_lock
 * into an LDLM_FLOCK enqueue against the MDS.  The lock mode is derived
 * from fl_type (F_RDLCK->PR, F_WRLCK->PW, F_UNLCK->NL as an unlock), and
 * the command selects blocking/non-blocking/test semantics via LDLM flags.
 * On success the lock is also registered with the local VFS lock lists.
 * NOTE(review): elided excerpt -- several switch labels, RETURNs and the
 * trailing #else/#endif lines are missing from view. */
2554 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2556 struct inode *inode = file->f_dentry->d_inode;
2557 struct ll_sb_info *sbi = ll_i2sbi(inode);
     /* flock resources are named by the inode FID */
2558 struct ldlm_res_id res_id =
2559 { .name = { fid_seq(ll_inode2fid(inode)),
2560 fid_oid(ll_inode2fid(inode)),
2561 fid_ver(ll_inode2fid(inode)),
2563 struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
2564 ldlm_flock_completion_ast, NULL, file_lock };
2565 struct lustre_handle lockh = {0};
2566 ldlm_policy_data_t flock;
2571 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2572 inode->i_ino, file_lock);
2574 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2576 if (file_lock->fl_flags & FL_FLOCK) {
2577 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2578 /* set missing params for flock() calls */
2579 file_lock->fl_end = OFFSET_MAX;
2580 file_lock->fl_pid = current->tgid;
2582 flock.l_flock.pid = file_lock->fl_pid;
2583 flock.l_flock.start = file_lock->fl_start;
2584 flock.l_flock.end = file_lock->fl_end;
2586 switch (file_lock->fl_type) {
2588 einfo.ei_mode = LCK_PR;
2591 /* An unlock request may or may not have any relation to
2592 * existing locks so we may not be able to pass a lock handle
2593 * via a normal ldlm_lock_cancel() request. The request may even
2594 * unlock a byte range in the middle of an existing lock. In
2595 * order to process an unlock request we need all of the same
2596 * information that is given with a normal read or write record
2597 * lock request. To avoid creating another ldlm unlock (cancel)
2598 * message we'll treat a LCK_NL flock request as an unlock. */
2599 einfo.ei_mode = LCK_NL;
2602 einfo.ei_mode = LCK_PW;
2605 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
     /* non-blocking set: fail instead of waiting for a conflicting lock */
2620 flags = LDLM_FL_BLOCK_NOWAIT;
     /* F_GETLK path, presumably: test only, never grant -- TODO confirm */
2626 flags = LDLM_FL_TEST_LOCK;
2627 /* Save the old mode so that if the mode in the lock changes we
2628 * can decrement the appropriate reader or writer refcount. */
2629 file_lock->fl_type = einfo.ei_mode;
2632 CERROR("unknown fcntl lock command: %d\n", cmd);
2636 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2637 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2638 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
     /* the actual distributed enqueue against the MDS namespace */
2640 rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &einfo, &res_id,
2641 &flock, &flags, NULL, 0, NULL, &lockh, 0);
     /* mirror granted locks into the local VFS so lslk/posix state match */
2642 if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2643 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2644 #ifdef HAVE_F_OP_FLOCK
2645 if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
2646 !(flags & LDLM_FL_TEST_LOCK))
2647 posix_lock_file_wait(file, file_lock);
/* Stub lock handler installed for -o noflock mounts (see the
 * ll_file_operations_noflock table below); body elided in this excerpt --
 * presumably returns -ENOSYS for every request.  TODO confirm. */
2653 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/* Test (without taking) whether this client already holds a granted MDS
 * inodebits lock covering 'bits' on 'inode', in any of CR/CW/PR/PW modes.
 * LDLM_FL_TEST_LOCK keeps the match side-effect free.
 * NOTE(review): elided excerpt -- the return statements are not visible. */
2660 int ll_have_md_lock(struct inode *inode, __u64 bits)
2662 struct lustre_handle lockh;
2663 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2671 fid = &ll_i2info(inode)->lli_fid;
2672 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2674 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2675 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2676 LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
/* Like ll_have_md_lock() but actually takes a reference: match a granted
 * MDS inodebits lock covering 'bits' and return its mode with the handle
 * filled in 'lockh' (no LDLM_FL_TEST_LOCK here, so the match pins it). */
2682 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2683 struct lustre_handle *lockh)
2685 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2691 fid = &ll_i2info(inode)->lli_fid;
2692 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2694 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2695 rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2696 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/* Post-process a revalidation RPC result: -ENOENT on an already-unlinked
 * inode is converted to success (nlink is presumably zeroed in the elided
 * lines -- TODO confirm); other errors are logged and passed through. */
2700 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2701 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2702 * and return success */
2704 /* This path cannot be hit for regular files unless in
2705 * case of obscure races, so no need to to validate
2707 if (!S_ISREG(inode->i_mode) &&
2708 !S_ISDIR(inode->i_mode))
2713 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/* Revalidate a dentry's inode attributes against the MDS.
 * Two paths: if the server supports OBD_CONNECT_ATTRFID, issue a
 * getattr-by-FID intent lock (no name needed); otherwise, when no cached
 * UPDATE|LOOKUP inodebits lock exists, fall back to a plain md_getattr.
 * Regular files additionally get their size refreshed via a glimpse.
 * NOTE(review): elided excerpt -- error labels, RETURNs and closing braces
 * are missing from view. */
2721 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2723 struct inode *inode = dentry->d_inode;
2724 struct ptlrpc_request *req = NULL;
2725 struct ll_sb_info *sbi;
2726 struct obd_export *exp;
     /* a NULL inode here would indicate a serious caller bug */
2731 CERROR("REPORT THIS LINE TO PETER\n");
2734 sbi = ll_i2sbi(inode);
2736 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2737 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2739 exp = ll_i2mdexp(inode);
2741 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2742 struct lookup_intent oit = { .it_op = IT_GETATTR };
2743 struct md_op_data *op_data;
2745 /* Call getattr by fid, so do not provide name at all. */
2746 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2747 dentry->d_inode, NULL, 0, 0,
2748 LUSTRE_OPC_ANY, NULL);
2749 if (IS_ERR(op_data))
2750 RETURN(PTR_ERR(op_data));
     /* O_CHECK_STALE asks the MDS to verify the inode is still current */
2752 oit.it_flags |= O_CHECK_STALE;
2753 rc = md_intent_lock(exp, op_data, NULL, 0,
2754 /* we are not interested in name
2757 ll_md_blocking_ast, 0);
2758 ll_finish_md_op_data(op_data);
2759 oit.it_flags &= ~O_CHECK_STALE;
2761 rc = ll_inode_revalidate_fini(inode, rc);
2765 rc = ll_revalidate_it_finish(req, &oit, dentry);
2767 ll_intent_release(&oit);
2771 /* Unlinked? Unhash dentry, so it is not picked up later by
2772 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2773 here to preserve get_cwd functionality on 2.6.
2775 if (!dentry->d_inode->i_nlink) {
2776 spin_lock(&dcache_lock);
2777 ll_drop_dentry(dentry);
2778 spin_unlock(&dcache_lock);
2781 ll_lookup_finish_locks(&oit, dentry);
2782 } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE |
2783 MDS_INODELOCK_LOOKUP)) {
2784 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2785 obd_valid valid = OBD_MD_FLGETATTR;
2786 struct obd_capa *oc;
     /* regular files also need the striping EA, sized to the MDS max */
2789 if (S_ISREG(inode->i_mode)) {
2790 rc = ll_get_max_mdsize(sbi, &ealen);
2793 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2795 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2796 * capa for this inode. Because we only keep capas of dirs
2798 oc = ll_mdscapa_get(inode);
2799 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2803 rc = ll_inode_revalidate_fini(inode, rc);
2807 rc = ll_prep_inode(&inode, req, NULL);
2812 /* if object not yet allocated, don't validate size */
2813 if (ll_i2info(inode)->lli_smd == NULL)
2816 /* ll_glimpse_size will prefer locally cached writes if they extend
2818 rc = ll_glimpse_size(inode, 0);
2821 ptlrpc_req_finished(req);
/* getattr with an explicit lookup intent: revalidate the inode first,
 * then copy the (now fresh) in-core attributes into 'stat'.  Size and
 * blocks are read under the inode size lock for cluster coherency. */
2825 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2826 struct lookup_intent *it, struct kstat *stat)
2828 struct inode *inode = de->d_inode;
2831 res = ll_inode_revalidate_it(de, it);
2832 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2837 stat->dev = inode->i_sb->s_dev;
2838 stat->ino = inode->i_ino;
2839 stat->mode = inode->i_mode;
2840 stat->nlink = inode->i_nlink;
2841 stat->uid = inode->i_uid;
2842 stat->gid = inode->i_gid;
2843 stat->rdev = kdev_t_to_nr(inode->i_rdev);
2844 stat->atime = inode->i_atime;
2845 stat->mtime = inode->i_mtime;
2846 stat->ctime = inode->i_ctime;
2847 #ifdef HAVE_INODE_BLKSIZE
2848 stat->blksize = inode->i_blksize;
2850 stat->blksize = 1 << inode->i_blkbits;
     /* size/blocks must be read atomically w.r.t. concurrent size updates */
2853 ll_inode_size_lock(inode, 0);
2854 stat->size = i_size_read(inode);
2855 stat->blocks = inode->i_blocks;
2856 ll_inode_size_unlock(inode, 0);
/* Plain VFS getattr entry point: wrap ll_getattr_it() with a default
 * IT_GETATTR intent. */
2860 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2862 struct lookup_intent it = { .it_op = IT_GETATTR };
2864 return ll_getattr_it(mnt, de, &it, stat);
/* ACL permission callback for generic_permission(): duplicate the cached
 * POSIX ACL under lli_lock (so the cached copy can be swapped safely),
 * then evaluate it.  Compiled out when CONFIG_FS_POSIX_ACL is unset.
 * NOTE(review): the non-ACL fallback return is elided from this excerpt. */
2868 int lustre_check_acl(struct inode *inode, int mask)
2870 #ifdef CONFIG_FS_POSIX_ACL
2871 struct ll_inode_info *lli = ll_i2info(inode);
2872 struct posix_acl *acl;
2876 spin_lock(&lli->lli_lock);
2877 acl = posix_acl_dup(lli->lli_posix_acl);
2878 spin_unlock(&lli->lli_lock);
2883 rc = posix_acl_permission(inode, acl, mask);
2884 posix_acl_release(acl);
/* Permission check, kernels >= 2.6.10: remote-client mounts are checked on
 * the server side via lustre_check_remote_perm(); everything else defers
 * to the kernel's generic_permission() with lustre_check_acl as the ACL
 * hook. */
2892 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2893 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2895 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2896 inode->i_ino, inode->i_generation, inode, mask);
2897 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2898 return lustre_check_remote_perm(inode, mask);
2900 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2901 return generic_permission(inode, mask, lustre_check_acl);
/* Permission check for older kernels without generic_permission()'s ACL
 * hook: an open-coded owner/group/other mode evaluation with ACL and
 * capability (CAP_DAC_OVERRIDE / CAP_DAC_READ_SEARCH) fallbacks, mirroring
 * the kernel's classic permission logic.
 * NOTE(review): elided excerpt -- the return statements and some labels
 * (e.g. check_capabilities) are missing from view. */
2904 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2906 int mode = inode->i_mode;
2909 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2910 inode->i_ino, inode->i_generation, inode, mask);
2912 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2913 return lustre_check_remote_perm(inode, mask);
2915 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
     /* write on a read-only fs is refused for regular/dir/symlink */
2917 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2918 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2920 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2922 if (current->fsuid == inode->i_uid) {
     /* group-class bits decide whether an ACL lookup is even needed */
2925 if (((mode >> 3) & mask & S_IRWXO) != mask)
2927 rc = lustre_check_acl(inode, mask);
2931 goto check_capabilities;
2935 if (in_group_p(inode->i_gid))
2938 if ((mode & mask & S_IRWXO) == mask)
     /* DAC override: exec requires at least one x bit unless a directory */
2942 if (!(mask & MAY_EXEC) ||
2943 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2944 if (capable(CAP_DAC_OVERRIDE))
2947 if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2948 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2955 /* -o localflock - only provides locally consistent flock locks */
/* Default regular-file operations table: no .flock/.lock entries, so the
 * kernel's local VFS locking applies (flock is node-local only). */
2956 struct file_operations ll_file_operations = {
2957 .read = ll_file_read,
2958 .write = ll_file_write,
2959 .ioctl = ll_file_ioctl,
2960 .open = ll_file_open,
2961 .release = ll_file_release,
2962 .mmap = ll_file_mmap,
2963 .llseek = ll_file_seek,
2964 .sendfile = ll_file_sendfile,
/* File operations for -o flock mounts: identical to the default table but
 * routes flock/posix locks through ll_file_flock for cluster-wide
 * coherency (field name depends on kernel's HAVE_F_OP_FLOCK support). */
2968 struct file_operations ll_file_operations_flock = {
2969 .read = ll_file_read,
2970 .write = ll_file_write,
2971 .ioctl = ll_file_ioctl,
2972 .open = ll_file_open,
2973 .release = ll_file_release,
2974 .mmap = ll_file_mmap,
2975 .llseek = ll_file_seek,
2976 .sendfile = ll_file_sendfile,
2978 #ifdef HAVE_F_OP_FLOCK
2979 .flock = ll_file_flock,
2981 .lock = ll_file_flock
2984 /* These are for -o noflock - to return ENOSYS on flock calls */
2985 struct file_operations ll_file_operations_noflock = {
2986 .read = ll_file_read,
2987 .write = ll_file_write,
2988 .ioctl = ll_file_ioctl,
2989 .open = ll_file_open,
2990 .release = ll_file_release,
2991 .mmap = ll_file_mmap,
2992 .llseek = ll_file_seek,
2993 .sendfile = ll_file_sendfile,
2995 #ifdef HAVE_F_OP_FLOCK
2996 .flock = ll_file_noflock,
2998 .lock = ll_file_noflock
/* Inode operations for regular files; setattr entry differs depending on
 * whether the kernel carries the Lustre VFS intent patches. */
3001 struct inode_operations ll_file_inode_operations = {
3002 #ifdef HAVE_VFS_INTENT_PATCHES
3003 .setattr_raw = ll_setattr_raw,
3005 .setattr = ll_setattr,
3006 .truncate = ll_truncate,
3007 .getattr = ll_getattr,
3008 .permission = ll_inode_permission,
3009 .setxattr = ll_setxattr,
3010 .getxattr = ll_getxattr,
3011 .listxattr = ll_listxattr,
3012 .removexattr = ll_removexattr,
3015 /* dynamic ioctl number support routins */
/* Registry of dynamically-registered ioctl handlers: a list of llioc_data
 * records protected by an rwsem (readers iterate in ll_iocontrol_call,
 * writers register/unregister). */
3016 static struct llioc_ctl_data {
3017 struct rw_semaphore ioc_sem;
3018 struct list_head ioc_head;
3020 __RWSEM_INITIALIZER(llioc.ioc_sem),
3021 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the flexible array of ioctl
 * command numbers it accepts (iocd_count entries). */
3026 struct list_head iocd_list;
3027 unsigned int iocd_size;
3028 llioc_callback_t iocd_cb;
3029 unsigned int iocd_count;
3030 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler for 'count' command numbers.
 * Returns an opaque cookie (the allocation itself, presumably -- the final
 * return is elided from this excerpt) used later by
 * ll_iocontrol_unregister(). */
3033 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3036 struct llioc_data *in_data = NULL;
3039 if (cb == NULL || cmd == NULL ||
3040 count > LLIOC_MAX_CMD || count < 0)
     /* record plus trailing flexible array of command numbers */
3043 size = sizeof(*in_data) + count * sizeof(unsigned int);
3044 OBD_ALLOC(in_data, size);
3045 if (in_data == NULL)
3048 memset(in_data, 0, sizeof(*in_data));
3049 in_data->iocd_size = size;
3050 in_data->iocd_cb = cb;
3051 in_data->iocd_count = count;
3052 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3054 down_write(&llioc.ioc_sem);
3055 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3056 up_write(&llioc.ioc_sem);
/* Unregister a handler previously returned by ll_iocontrol_register():
 * find the matching record under the write lock, unlink and free it.
 * Warns if the cookie is not found.  NOTE(review): the magic-comparison
 * line inside the loop is elided from this excerpt. */
3061 void ll_iocontrol_unregister(void *magic)
3063 struct llioc_data *tmp;
3068 down_write(&llioc.ioc_sem);
3069 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3071 unsigned int size = tmp->iocd_size;
3073 list_del(&tmp->iocd_list);
     /* drop the lock before freeing; the entry is already unlinked */
3074 up_write(&llioc.ioc_sem);
3076 OBD_FREE(tmp, size);
3080 up_write(&llioc.ioc_sem);
3082 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
/* Exported so out-of-tree/aux modules can plug in dynamic ioctls. */
3085 EXPORT_SYMBOL(ll_iocontrol_register);
3086 EXPORT_SYMBOL(ll_iocontrol_unregister);
3088 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3089 unsigned int cmd, unsigned long arg, int *rcp)
3091 enum llioc_iter ret = LLIOC_CONT;
3092 struct llioc_data *data;
3093 int rc = -EINVAL, i;
3095 down_read(&llioc.ioc_sem);
3096 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3097 for (i = 0; i < data->iocd_count; i++) {
3098 if (cmd != data->iocd_cmd[i])
3101 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3105 if (ret == LLIOC_STOP)
3108 up_read(&llioc.ioc_sem);