1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <lustre_mdc.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #include "llite_internal.h"
33 /* also used by llite/special.c:ll_special_open() */
/*
 * Allocate per-open-file private data from the ll_file_data_slab cache.
 * NOTE(review): the NULL-check/return lines are not visible in this
 * extract; callers must still handle a NULL result.
 */
34 struct ll_file_data *ll_file_data_get(void)
36         struct ll_file_data *fd;
38         OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Return an ll_file_data obtained from ll_file_data_get() to the slab. */
42 static void ll_file_data_put(struct ll_file_data *fd)
45         OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the MDS-visible state of @inode into @op_data for an MDC request:
 * fid, mode, a/m/ctime, size, block count, inode flags, the current I/O
 * epoch, and the open file handle @fh.  Also takes an MDS capability
 * reference (ll_mdscapa_get) -- presumably released when op_data is
 * finished; the matching put is not visible in this extract.
 */
48 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
49                           struct lustre_handle *fh)
51         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
52         op_data->op_attr.ia_mode = inode->i_mode;
53         op_data->op_attr.ia_atime = inode->i_atime;
54         op_data->op_attr.ia_mtime = inode->i_mtime;
55         op_data->op_attr.ia_ctime = inode->i_ctime;
56         op_data->op_attr.ia_size = i_size_read(inode);
57         op_data->op_attr_blocks = inode->i_blocks;
        /* ia_attr_flags lives in the lustre-extended iattr; cast to reach it */
58         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
59         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
60         memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
61         op_data->op_capa1 = ll_mdscapa_get(inode);
/*
 * Prepare @op_data for an MDS close of open handle @och.
 * Mode and timestamps are always sent; size/blocks are added only when
 * the handle was writable AND either the MDS export does not support
 * Size-on-MDS (SOM) or the inode is not a regular file (i.e. the client
 * must supply the size itself).  ll_epoch_close() then settles the I/O
 * epoch state before the attributes are packed.
 */
64 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
65                              struct obd_client_handle *och)
69         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
70                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
        /* read-only handles never carry size updates to the MDS */
72         if (!(och->och_flags & FMODE_WRITE))
75         if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
76             !S_ISREG(inode->i_mode))
77                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
79         ll_epoch_close(inode, op_data, &och, 0);
82         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/*
 * Close an MDS open handle @och for @inode: send md_close(), perform the
 * Size-on-MDS update if the server asked for it, destroy OST objects
 * recorded in the close reply, and tear down open-replay state.
 * NOTE(review): several error/branch lines are missing from this extract;
 * the visible flow is annotated below.
 */
86 static int ll_close_inode_openhandle(struct obd_export *md_exp,
88                                      struct obd_client_handle *och)
90         struct obd_export *exp = ll_i2mdexp(inode);
91         struct md_op_data *op_data;
92         struct ptlrpc_request *req = NULL;
93         struct obd_device *obd = class_exp2obd(exp);
        /* sanity: the MDC export must have a valid connection handle */
100          * XXX: in case of LMV, is this correct to access
103                 CERROR("Invalid MDC connection handle "LPX64"\n",
104                        ll_i2mdexp(inode)->exp_handle.h_cookie);
109          * here we check if this is forced umount. If so this is called on
110          * canceling "open lock" and we do not call md_close() in this case, as
111          * it will not be successful, as import is already deactivated.
116         OBD_ALLOC_PTR(op_data);
118                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
120         ll_prepare_close(inode, op_data, och);
121         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
122         rc = md_close(md_exp, op_data, och->och_mod, &req);
        /* branch below: MDS requested a Size-on-MDS attribute update */
127                 /* This close must have the epoch closed. */
128                 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
129                 LASSERT(epoch_close);
130                 /* MDS has instructed us to obtain Size-on-MDS attribute from
131                  * OSTs and send setattr to back to MDS. */
132                 rc = ll_sizeonmds_update(inode, och->och_mod,
133                                          &och->och_fh, op_data->op_ioepoch);
135                         CERROR("inode %lu mdc Size-on-MDS update failed: "
136                                "rc = %d\n", inode->i_ino, rc);
140                 CERROR("inode %lu mdc close failed: rc = %d\n",
143         ll_finish_md_op_data(op_data);
        /* destroy OST objects listed in the close reply (unlinked file) */
146                 rc = ll_objects_destroy(req, inode);
148                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
        /* epoch still open on a writable regular file: defer DONE_WRITING */
155         if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
156             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
157                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
160                 ptlrpc_close_replay_seq(req);
161         md_clear_open_replay_data(md_exp, och);
162         /* Free @och if it is not waiting for DONE_WRITING. */
163         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
166         if (req) /* This is close request */
167                 ptlrpc_req_finished(req);
/*
 * Really close the MDS open handle of kind @flags (write/exec/read) for
 * @inode, but only when its use count has dropped to zero.  The handle
 * pointer and its use counter are selected from ll_inode_info by open
 * mode; lli_och_sem serializes against concurrent opens/closes.
 */
171 int ll_md_real_close(struct inode *inode, int flags)
173         struct ll_inode_info *lli = ll_i2info(inode);
174         struct obd_client_handle **och_p;
175         struct obd_client_handle *och;
        /* pick the och slot + use counter matching the open mode */
180         if (flags & FMODE_WRITE) {
181                 och_p = &lli->lli_mds_write_och;
182                 och_usecount = &lli->lli_open_fd_write_count;
183         } else if (flags & FMODE_EXEC) {
184                 och_p = &lli->lli_mds_exec_och;
185                 och_usecount = &lli->lli_open_fd_exec_count;
187                 LASSERT(flags & FMODE_READ);
188                 och_p = &lli->lli_mds_read_och;
189                 och_usecount = &lli->lli_open_fd_read_count;
192         down(&lli->lli_och_sem);
193         if (*och_usecount) { /* There are still users of this handle, so
195                 up(&lli->lli_och_sem);
200         up(&lli->lli_och_sem);
        /* och may already be NULL if another closer raced us (see comment) */
202         if (och) { /* There might be a race and somebody have freed this och
204                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close path: drop any group/extent lock held by this
 * fd, decrement the per-mode open count, and -- if no cached OPEN DLM lock
 * matches -- do the real MDS close via ll_md_real_close().  Finally detach
 * and free the ll_file_data and release the OSS capability.
 */
211 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
214         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
215         struct ll_inode_info *lli = ll_i2info(inode);
219         /* clear group lock, if present */
220         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
221                 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
222                 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
223                 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
227         /* Let's see if we have good enough OPEN lock on the file and if
228            we can skip talking to MDS */
229         if (file->f_dentry->d_inode) { /* Can this ever be false? */
231                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
232                 struct lustre_handle lockh;
                /* note: this local shadows the @inode parameter intentionally */
233                 struct inode *inode = file->f_dentry->d_inode;
234                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
236                 down(&lli->lli_och_sem);
                /* drop the use count for whichever mode this fd was opened in */
237                 if (fd->fd_omode & FMODE_WRITE) {
239                         LASSERT(lli->lli_open_fd_write_count);
240                         lli->lli_open_fd_write_count--;
241                 } else if (fd->fd_omode & FMODE_EXEC) {
243                         LASSERT(lli->lli_open_fd_exec_count);
244                         lli->lli_open_fd_exec_count--;
247                         LASSERT(lli->lli_open_fd_read_count);
248                         lli->lli_open_fd_read_count--;
250                 up(&lli->lli_och_sem);
                /* no matching OPEN ibits lock cached => must close on the MDS */
252                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
253                                    LDLM_IBITS, &policy, lockmode,
255                         rc = ll_md_real_close(file->f_dentry->d_inode,
259                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
260                        file, file->f_dentry, file->f_dentry->d_name.name);
263         LUSTRE_FPRIVATE(file) = NULL;
264         ll_file_data_put(fd);
265         ll_capa_close(inode);
270 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
272 /* While this returns an error code, fput() the caller does not, so we need
273 * to make every effort to clean up all of our state here. Also, applications
274 * rarely check close errors and even if an error is returned they will not
275 * re-try the close call.
/*
 * VFS ->release() for Lustre files.  Cleans up remote-client ACL state on
 * the root inode, stops the statahead thread if this fd owns it, folds any
 * pending async OST error into the return code, and finally calls
 * ll_md_close().  The root dentry is special-cased: it has no MDS open
 * handle to close, so its fd is just detached and freed.
 */
277 int ll_file_release(struct inode *inode, struct file *file)
279         struct ll_file_data *fd;
280         struct ll_sb_info *sbi = ll_i2sbi(inode);
281         struct ll_inode_info *lli = ll_i2info(inode);
282         struct lov_stripe_md *lsm = lli->lli_smd;
286         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
287                inode->i_generation, inode);
289 #ifdef CONFIG_FS_POSIX_ACL
        /* remote-client ACL bookkeeping only applies to the root inode */
290         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
291             inode == inode->i_sb->s_root->d_inode) {
292                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
295                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
296                         fd->fd_flags &= ~LL_FILE_RMTACL;
297                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
298                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
303         if (inode->i_sb->s_root != file->f_dentry)
304                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
305         fd = LUSTRE_FPRIVATE(file);
308         /* The last ref on @file, maybe not the the owner pid of statahead.
309          * Different processes can open the same dir, "ll_opendir_key" means:
310          * it is me that should stop the statahead thread. */
311         if (lli->lli_opendir_key == fd)
312                 ll_stop_statahead(inode, fd);
        /* root has no MDS open handle: just free the fd and bail out */
314         if (inode->i_sb->s_root == file->f_dentry) {
315                 LUSTRE_FPRIVATE(file) = NULL;
316                 ll_file_data_put(fd);
        /* surface any deferred async write error from the OSCs */
321                 lov_test_and_clear_async_rc(lsm);
322                 lli->lli_async_rc = 0;
324         rc = ll_md_close(sbi->ll_md_exp, inode, file);
/*
 * Issue an MDS intent-open for @file.  Requests an OPEN DLM lock unless
 * the caller is merely setting stripe info (@lmm/@lmmsize non-zero).
 * On success the returned lock (if any) is bound to the inode and the
 * inode is refreshed from the reply; on failure the open handle obtained
 * by the intent (if any) is released to avoid leaking it on the MDS.
 */
328 static int ll_intent_file_open(struct file *file, void *lmm,
329                                int lmmsize, struct lookup_intent *itp)
331         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
332         struct dentry *parent = file->f_dentry->d_parent;
333         const char *name = file->f_dentry->d_name.name;
334         const int len = file->f_dentry->d_name.len;
335         struct md_op_data *op_data;
336         struct ptlrpc_request *req;
343         /* Usually we come here only for NFSD, and we want open lock.
344            But we can also get here with pre 2.6.15 patchless kernels, and in
345            that case that lock is also ok */
346         /* We can also get here if there was cached open handle in revalidate_it
347          * but it disappeared while we were getting from there to ll_file_open.
348          * But this means this file was closed and immediatelly opened which
349          * makes a good candidate for using OPEN lock */
350         /* If lmmsize & lmm are not 0, we are just setting stripe info
351          * parameters. No need for the open lock */
352         if (!lmm && !lmmsize)
353                 itp->it_flags |= MDS_OPEN_LOCK;
355         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
356                                       file->f_dentry->d_inode, name, len,
357                                       O_RDWR, LUSTRE_OPC_ANY, NULL);
359                 RETURN(PTR_ERR(op_data));
361         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
362                             0 /*unused */, &req, ll_md_blocking_ast, 0);
363         ll_finish_md_op_data(op_data);
365                 /* reason for keep own exit path - don`t flood log
366                  * with messages with -ESTALE errors.
        /* error path: release the MDS open handle the intent may have got */
368                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
369                     it_open_error(DISP_OPEN_OPEN, itp))
371                 ll_release_openhandle(file->f_dentry, itp);
375         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
376                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
377                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
        /* bind the granted lock (if any) to the inode so cancel finds it */
381         if (itp->d.lustre.it_lock_mode)
382                 md_set_lock_data(sbi->ll_md_exp,
383                                  &itp->d.lustre.it_lock_handle,
384                                  file->f_dentry->d_inode);
386         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
388         ptlrpc_req_finished(itp->d.lustre.it_data);
391         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
392         ll_intent_drop_lock(itp);
/*
 * Fill @och from the intent-open reply carried in @it: copy the MDS file
 * handle and fid, record the open flags and I/O epoch, then register the
 * open for replay so it survives MDS recovery.
 * Returns the md_set_open_replay_data() result.
 */
397 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
398                        struct lookup_intent *it, struct obd_client_handle *och)
400         struct ptlrpc_request *req = it->d.lustre.it_data;
401         struct mdt_body *body;
405         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
406         LASSERT(body != NULL);                      /* reply already checked out */
408         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
409         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
410         och->och_fid = lli->lli_fid;
411         och->och_flags = it->it_flags;
412         lli->lli_ioepoch = body->ioepoch;
414         return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish the client-local part of an open: optionally fill the open
 * handle @och from the intent reply (via ll_och_fill), then attach @fd as
 * the file's private data, initialize its readahead state, and remember
 * the open mode.  Asserts the file has no private data yet.
 */
417 int ll_local_open(struct file *file, struct lookup_intent *it,
418                   struct ll_file_data *fd, struct obd_client_handle *och)
420         struct inode *inode = file->f_dentry->d_inode;
421         struct ll_inode_info *lli = ll_i2info(inode);
424         LASSERT(!LUSTRE_FPRIVATE(file));
429                 struct ptlrpc_request *req = it->d.lustre.it_data;
430                 struct mdt_body *body;
433                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
        /* debug aid: report the epoch a writable open started in */
437                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
438                 if ((it->it_flags & FMODE_WRITE) &&
439                     (body->valid & OBD_MD_FLSIZE))
440                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
441                                lli->lli_ioepoch, PFID(&lli->lli_fid));
444         LUSTRE_FPRIVATE(file) = fd;
445         ll_readahead_init(inode, &fd->fd_ras);
446         fd->fd_omode = it->it_flags;
450 /* Open a file, and (for the very first open) create objects on the OSTs at
451 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
452 * creation or open until ll_lov_setstripe() ioctl is called. We grab
453 * lli_open_sem to ensure no other process will create objects, send the
454 * stripe MD to the MDS, or try to destroy the objects if that fails.
456 * If we already have the stripe MD locally then we don't request it in
457 * md_open(), by passing a lmm_size = 0.
459 * It is up to the application to ensure no other processes open this file
460 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
461 * used. We might be able to avoid races of that sort by getting lli_open_sem
462 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
463 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() for Lustre.  Reuses an existing MDS open handle for the
 * same open mode when one is cached on the inode (bumping its use count),
 * otherwise performs an intent open against the MDS and records the new
 * handle.  Also claims statahead ownership for directory opens and
 * special-cases the root dentry, which needs no MDS open.
 * NOTE(review): extract is lossy -- several error-path and bookkeeping
 * lines are not visible here.
 */
465 int ll_file_open(struct inode *inode, struct file *file)
467         struct ll_inode_info *lli = ll_i2info(inode);
468         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
469                                           .it_flags = file->f_flags };
470         struct lov_stripe_md *lsm;
471         struct ptlrpc_request *req = NULL;
472         struct obd_client_handle **och_p;
474         struct ll_file_data *fd;
475         int rc = 0, opendir_set = 0;
478         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
479                inode->i_generation, inode, file->f_flags);
481 #ifdef HAVE_VFS_INTENT_PATCHES
484         it = file->private_data; /* XXX: compat macro */
485         file->private_data = NULL; /* prevent ll_local_open assertion */
488         fd = ll_file_data_get();
        /* directory open: try to become the statahead owner for this dir */
492         if (S_ISDIR(inode->i_mode)) {
493                 spin_lock(&lli->lli_lock);
494                 /* "lli->lli_opendir_pid != 0" means someone has set it.
495                  * "lli->lli_sai != NULL" means the previous statahead has not
497                 if (lli->lli_opendir_pid == 0 && lli->lli_sai == NULL) {
499                         lli->lli_opendir_pid = cfs_curproc_pid();
500                         lli->lli_opendir_key = fd;
501                 } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid())) {
502                         /* Two cases for this:
503                          * (1) The same process open such directory many times.
504                          * (2) The old process opened the directory, and exited
505                          *     before its children processes. Then new process
506                          *     with the same pid opens such directory before the
507                          *     old process's children processes exit.
508                          * Change the owner to the latest one. */
510                         lli->lli_opendir_key = fd;
512                 spin_unlock(&lli->lli_lock);
        /* root dentry: no MDS open handle needed, just attach the fd */
515         if (inode->i_sb->s_root == file->f_dentry) {
516                 LUSTRE_FPRIVATE(file) = fd;
        /* no usable intent from the VFS: build our own access mode */
520         if (!it || !it->d.lustre.it_disposition) {
521                 /* Convert f_flags into access mode. We cannot use file->f_mode,
522                  * because everything but O_ACCMODE mask was stripped from
524                 if ((oit.it_flags + 1) & O_ACCMODE)
526                 if (file->f_flags & O_TRUNC)
527                         oit.it_flags |= FMODE_WRITE;
529                 /* kernel only call f_op->open in dentry_open.  filp_open calls
530                  * dentry_open after call to open_namei that checks permissions.
531                  * Only nfsd_open call dentry_open directly without checking
532                  * permissions and because of that this code below is safe. */
533                 if (oit.it_flags & FMODE_WRITE)
534                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
536                 /* We do not want O_EXCL here, presumably we opened the file
537                  * already? XXX - NFS implications? */
538                 oit.it_flags &= ~O_EXCL;
544         /* Let's see if we have file open on MDS already. */
545         if (it->it_flags & FMODE_WRITE) {
546                 och_p = &lli->lli_mds_write_och;
547                 och_usecount = &lli->lli_open_fd_write_count;
548         } else if (it->it_flags & FMODE_EXEC) {
549                 och_p = &lli->lli_mds_exec_och;
550                 och_usecount = &lli->lli_open_fd_exec_count;
552                 och_p = &lli->lli_mds_read_och;
553                 och_usecount = &lli->lli_open_fd_read_count;
556         down(&lli->lli_och_sem);
557         if (*och_p) { /* Open handle is present */
558                 if (it_disposition(it, DISP_OPEN_OPEN)) {
559                         /* Well, there's extra open request that we do not need,
560                            let's close it somehow. This will decref request. */
561                         rc = it_open_error(DISP_OPEN_OPEN, it);
563                                 ll_file_data_put(fd);
564                                 GOTO(out_och_free, rc);
566                         ll_release_openhandle(file->f_dentry, it);
567                         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
                /* reuse the cached handle: och == NULL means "already filled" */
572                 rc = ll_local_open(file, it, fd, NULL);
574                         up(&lli->lli_och_sem);
575                         ll_file_data_put(fd);
579                 LASSERT(*och_usecount == 0);
580                 if (!it->d.lustre.it_disposition) {
581                         /* We cannot just request lock handle now, new ELC code
582                            means that one of other OPEN locks for this file
583                            could be cancelled, and since blocking ast handler
584                            would attempt to grab och_sem as well, that would
585                            result in a deadlock */
586                         up(&lli->lli_och_sem);
587                         it->it_flags |= O_CHECK_STALE;
588                         rc = ll_intent_file_open(file, NULL, 0, it);
589                         it->it_flags &= ~O_CHECK_STALE;
591                                 ll_file_data_put(fd);
592                                 GOTO(out_openerr, rc);
595                         /* Got some error? Release the request */
596                         if (it->d.lustre.it_status < 0) {
597                                 req = it->d.lustre.it_data;
598                                 ptlrpc_req_finished(req);
600                         md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
601                                          &it->d.lustre.it_lock_handle,
602                                          file->f_dentry->d_inode);
605                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
607                         ll_file_data_put(fd);
608                         GOTO(out_och_free, rc = -ENOMEM);
611                 req = it->d.lustre.it_data;
613                 /* md_intent_lock() didn't get a request ref if there was an
614                  * open error, so don't do cleanup on the request here
616                 /* XXX (green): Should not we bail out on any error here, not
617                  * just open error? */
618                 rc = it_open_error(DISP_OPEN_OPEN, it);
620                         ll_file_data_put(fd);
621                         GOTO(out_och_free, rc);
624                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
625                 rc = ll_local_open(file, it, fd, *och_p);
627                         up(&lli->lli_och_sem);
628                         ll_file_data_put(fd);
629                         GOTO(out_och_free, rc);
632         up(&lli->lli_och_sem);
634         /* Must do this outside lli_och_sem lock to prevent deadlock where
635            different kind of OPEN lock for this same inode gets cancelled
636            by ldlm_cancel_lru */
637         if (!S_ISREG(inode->i_mode))
        /* O_LOV_DELAY_CREATE / read-only opens skip OST object creation */
644         if (file->f_flags & O_LOV_DELAY_CREATE ||
645             !(file->f_mode & FMODE_WRITE)) {
646                 CDEBUG(D_INODE, "object creation was delayed\n");
650         file->f_flags &= ~O_LOV_DELAY_CREATE;
653         ptlrpc_req_finished(req);
655                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
659                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
660                         *och_p = NULL; /* OBD_FREE writes some magic there */
663                 up(&lli->lli_och_sem);
        /* undo statahead ownership claimed above on the error path */
665                 if (opendir_set == 1) {
666                         lli->lli_opendir_key = NULL;
667                         lli->lli_opendir_pid = 0;
668                 } else if (unlikely(opendir_set == 2)) {
669                         ll_stop_statahead(inode, fd);
676 /* Fills the obdo with the attributes for the inode defined by lsm */
/*
 * Refresh @inode's attributes from the OSTs: build an obdo keyed by the
 * stripe object id/group, issue an asynchronous getattr over a ptlrpc
 * request set, wait for completion, then merge the returned size/blocks/
 * time attributes into the inode via obdo_refresh_inode().
 */
677 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
679         struct ptlrpc_request_set *set;
680         struct ll_inode_info *lli = ll_i2info(inode);
681         struct lov_stripe_md *lsm = lli->lli_smd;
683         struct obd_info oinfo = { { { 0 } } };
687         LASSERT(lsm != NULL);
691         oinfo.oi_oa->o_id = lsm->lsm_object_id;
692         oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
693         oinfo.oi_oa->o_mode = S_IFREG;
694         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
695                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
696                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
697                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
699         oinfo.oi_capa = ll_mdscapa_get(inode);
701         set = ptlrpc_prep_set();
703                 CERROR("can't allocate ptlrpc set\n");
706                 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
708                         rc = ptlrpc_set_wait(set);
709                 ptlrpc_set_destroy(set);
711         capa_put(oinfo.oi_capa);
        /* keep only the attributes we asked the OSTs for before merging */
715         oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
716                                  OBD_MD_FLATIME | OBD_MD_FLMTIME |
717                                  OBD_MD_FLCTIME | OBD_MD_FLSIZE);
719         obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
720         CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
721                lli->lli_smd->lsm_object_id, i_size_read(inode),
722                (unsigned long long)inode->i_blocks,
723                (unsigned long)ll_inode_blksize(inode));
/*
 * Strip setuid (and setgid, when group-exec is set) bits from @inode
 * unless the caller holds CAP_FSETID -- the usual "remove suid on write"
 * security behavior.  Only i_mode is touched here (see comment below).
 */
727 static inline void ll_remove_suid(struct inode *inode)
731         /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
732         mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
734         /* was any of the uid bits set? */
735         mode &= inode->i_mode;
736         if (mode && !capable(CAP_FSETID)) {
737                 inode->i_mode &= ~mode;
738                 // XXX careful here - we cannot change the size
/*
 * Map an extent DLM @lock back to the stripe index it covers within
 * @inode's lov_stripe_md.  Single-stripe files short-circuit to stripe 0;
 * otherwise the LOV is queried via obd_get_info(KEY_LOCK_TO_STRIPE).
 * The result is sanity-checked against the lock's resource name (object
 * id/group); a mismatch returns -ELDLM_NO_LOCK_DATA.
 */
742 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
744         struct ll_inode_info *lli = ll_i2info(inode);
745         struct lov_stripe_md *lsm = lli->lli_smd;
746         struct obd_export *exp = ll_i2dtexp(inode);
749                 struct ldlm_lock *lock;
750                 struct lov_stripe_md *lsm;
751         } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock, .lsm = lsm };
752         __u32 stripe, vallen = sizeof(stripe);
756         if (lsm->lsm_stripe_count == 1)
757                 GOTO(check, stripe = 0);
759         /* get our offset in the lov */
760         rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
762                 CERROR("obd_get_info: rc = %d\n", rc);
765         LASSERT(stripe < lsm->lsm_stripe_count);
        /* verify the resource really names this stripe's object (id, group) */
768         if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
769             lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[2]){
770                 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
771                            lsm->lsm_oinfo[stripe]->loi_id,
772                            lsm->lsm_oinfo[stripe]->loi_gr);
773                 RETURN(-ELDLM_NO_LOCK_DATA);
779 /* Get extra page reference to ensure it is not going away */
/* DLM pin callback: @data is a struct page; take an extra page-cache
 * reference so the page cannot vanish while the lock logic uses it. */
780 void ll_pin_extent_cb(void *data)
782         struct page *page = data;
784         page_cache_get(page);
789 /* Flush the page from page cache for an extent as its canceled.
790 * Page to remove is delivered as @data.
792 * No one can dirty the extent until we've finished our work and they cannot
793 * enqueue another lock. The DLM protects us from ll_file_read/write here,
794 * but other kernel actors could have pages locked.
796 * If @discard is set, there is no need to write the page if it is dirty.
798 * Called with the DLM lock held. */
/*
 * Evict one page (@data) from the page cache because its covering extent
 * lock is being cancelled.  Tears down any mmap of the page's range,
 * writes the page out first unless @discard is set, records write errors
 * in the mapping's AS_* flags, truncates the page out of the cache, and
 * drops the reference taken by ll_pin_extent_cb().
 */
799 int ll_page_removal_cb(void *data, int discard)
802         struct page *page = data;
803         struct address_space *mapping;
807         /* We have page reference already from ll_pin_page */
810         /* Already truncated by somebody */
813         mapping = page->mapping;
        /* unmap this page's file range from any user mappings */
815         ll_teardown_mmaps(mapping,
816                           (__u64)page->index << PAGE_CACHE_SHIFT,
817                           ((__u64)page->index<<PAGE_CACHE_SHIFT)|
819         LL_CDEBUG_PAGE(D_PAGE, page, "removing page\n");
        /* dirty page and not discarding: push it to the OST first */
821         if (!discard && clear_page_dirty_for_io(page)) {
822                 LASSERT(page->mapping);
823                 rc = ll_call_writepage(page->mapping->host, page);
824                 /* either waiting for io to complete or reacquiring
825                  * the lock that the failed writepage released */
827                         wait_on_page_writeback(page);
829                         CERROR("writepage inode %lu(%p) of page %p "
830                                "failed: %d\n", mapping->host->i_ino,
831                                mapping->host, page, rc);
                /* remember the failure on the mapping for later fsync/close */
833                                 set_bit(AS_ENOSPC, &mapping->flags);
835                                 set_bit(AS_EIO, &mapping->flags);
837                         set_bit(AS_EIO, &mapping->flags);
839         if (page->mapping != NULL) {
840                 struct ll_async_page *llap = llap_cast_private(page);
841                 /* checking again to account for writeback's lock_page() */
842                 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
844                         ll_ra_accounting(llap, page->mapping);
845                 ll_truncate_complete_page(page);
849         LASSERT(!PageWriteback(page));
        /* drop the reference taken in ll_pin_extent_cb() */
851         page_cache_release(page);
/*
 * Blocking/cancel callback for file extent locks.  On cancellation,
 * shrink the known-minimum-size (kms) of the affected stripe to what the
 * remaining locks still cover (ldlm_extent_shift_kms), under both the
 * stripe lock and the lock's resource lock, then let any pending
 * DONE_WRITING proceed.  Bogus small-integer @data values are rejected.
 */
856 int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
857                              void *data, int flag)
860         struct ll_inode_info *lli;
861         struct lov_stripe_md *lsm;
        /* guard against garbage cookie values instead of a real inode */
867         if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
868                 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
872         inode = ll_inode_from_lock(lock);
875         lli = ll_i2info(inode);
878         if (lli->lli_smd == NULL)
882         stripe = ll_lock_to_stripe_offset(inode, lock);
886         lov_stripe_lock(lsm);
887         lock_res_and_lock(lock);
888         kms = ldlm_extent_shift_kms(lock,
889                                     lsm->lsm_oinfo[stripe]->loi_kms);
891         if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
892                 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
893                            lsm->lsm_oinfo[stripe]->loi_kms, kms);
894         lsm->lsm_oinfo[stripe]->loi_kms = kms;
895         unlock_res_and_lock(lock);
896         lov_stripe_unlock(lsm);
897         ll_queue_done_writing(inode, 0);
/*
 * Completion AST for client-side async extent lock enqueue.  Blocked
 * grants are not expected (LBUG).  On grant/glimpse, if the lock carries
 * an LVB, fold its size into the stripe's rss/kms under the resource and
 * inode-size locks, then wake waiters and drop the enqueue's PR
 * reference.
 * NOTE(review): this path uses lsm_oinfo[stripe].field (struct member)
 * while other functions here use lsm_oinfo[stripe]->field (pointer) --
 * extract is lossy, so cannot tell which is current; verify upstream.
 */
906 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
908         /* XXX ALLOCATE - 160 bytes */
909         struct inode *inode = ll_inode_from_lock(lock);
910         struct ll_inode_info *lli = ll_i2info(inode);
911         struct lustre_handle lockh = { 0 };
916         if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
917                      LDLM_FL_BLOCK_CONV)) {
918                 LBUG(); /* not expecting any blocked async locks yet */
919                 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
921                 ldlm_lock_dump(D_OTHER, lock, 0);
922                 ldlm_reprocess_all(lock->l_resource);
926         LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
928         stripe = ll_lock_to_stripe_offset(inode, lock);
932         if (lock->l_lvb_len) {
933                 struct lov_stripe_md *lsm = lli->lli_smd;
935                 lvb = lock->l_lvb_data;
936                 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
938                 lock_res_and_lock(lock);
939                 ll_inode_size_lock(inode, 1);
940                 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
941                 kms = ldlm_extent_shift_kms(NULL, kms);
942                 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
943                         LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
944                                    lsm->lsm_oinfo[stripe].loi_kms, kms);
945                 lsm->lsm_oinfo[stripe].loi_kms = kms;
946                 ll_inode_size_unlock(inode, 1);
947                 unlock_res_and_lock(lock);
952         wake_up(&lock->l_waitq);
954         ldlm_lock2handle(lock, &lockh);
955         ldlm_lock_decref(&lockh, LCK_PR);
/*
 * Glimpse AST: another client wants our view of the file size.  Find the
 * stripe this lock covers, then pack an LVB reply containing that
 * stripe's kms and the inode's m/a/ctime.  -ELDLM_NO_LOCK_DATA cases
 * (no inode, no stripe match) are normal races and get a quiet reply.
 */
960 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
962         struct ptlrpc_request *req = reqp;
963         struct inode *inode = ll_inode_from_lock(lock);
964         struct ll_inode_info *lli;
965         struct lov_stripe_md *lsm;
971                 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
972         lli = ll_i2info(inode);
974                 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
977                 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
979         /* First, find out which stripe index this lock corresponds to. */
980         stripe = ll_lock_to_stripe_offset(inode, lock);
982                 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
984         req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
985         req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
987         rc = req_capsule_server_pack(&req->rq_pill);
989                 CERROR("lustre_pack_reply: %d\n", rc);
        /* fill the reply LVB with this stripe's kms and inode times */
993         lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB);
994         lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
995         lvb->lvb_mtime = LTIME_S(inode->i_mtime);
996         lvb->lvb_atime = LTIME_S(inode->i_atime);
997         lvb->lvb_ctime = LTIME_S(inode->i_ctime);
999         LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1000                    " atime "LPU64", mtime "LPU64", ctime "LPU64,
1001                    i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
1002                    lvb->lvb_atime, lvb->lvb_ctime);
1007         /* These errors are normal races, so we don't want to fill the console
1008          * with messages by calling ptlrpc_error() */
1009         if (rc == -ELDLM_NO_LOCK_DATA)
1010                 lustre_pack_reply(req, 1, NULL, NULL);
1012         req->rq_status = rc;
/*
 * Merge the per-stripe lock-value blocks into the VFS inode under the
 * inode size lock: obd_merge_lvb() combines stripe sizes/blocks/times,
 * and the results are written into i_size, i_blocks and the timestamps.
 */
1016 static int ll_merge_lvb(struct inode *inode)
1018         struct ll_inode_info *lli = ll_i2info(inode);
1019         struct ll_sb_info *sbi = ll_i2sbi(inode);
1025         ll_inode_size_lock(inode, 1);
1026         inode_init_lvb(inode, &lvb);
1027         rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1028         i_size_write(inode, lvb.lvb_size);
1029         inode->i_blocks = lvb.lvb_blocks;
1031         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1032         LTIME_S(inode->i_atime) = lvb.lvb_atime;
1033         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1034         ll_inode_size_unlock(inode, 1);
/*
 * Try to compute the file size from locally-cached [0, EOF] PR extent
 * locks only, without contacting the servers: obd_match() looks for such
 * locks; if found, ll_merge_lvb() refreshes the inode and the match
 * references are cancelled again.  Zero-stripe files are a no-op.
 */
1039 int ll_local_size(struct inode *inode)
1041         ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1042         struct ll_inode_info *lli = ll_i2info(inode);
1043         struct ll_sb_info *sbi = ll_i2sbi(inode);
1044         struct lustre_handle lockh = { 0 };
1049         if (lli->lli_smd->lsm_stripe_count == 0)
1052         rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1053                        &policy, LCK_PR, &flags, inode, &lockh);
1059         rc = ll_merge_lvb(inode);
        /* drop the references taken by the successful obd_match() */
1060         obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh);
/*
 * Glimpse the size/times of a file described only by @lsm (no inode),
 * for ioctl use: enqueue an intent-only (LDLM_FL_HAS_INTENT) PR extent
 * lock over the whole object, then merge the per-stripe LVBs into the
 * caller-supplied stat structure under the stripe lock.
 */
1064 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1067         struct lustre_handle lockh = { 0 };
1068         struct ldlm_enqueue_info einfo = { 0 };
1069         struct obd_info oinfo = { { { 0 } } };
1075         einfo.ei_type = LDLM_EXTENT;
1076         einfo.ei_mode = LCK_PR;
1077         einfo.ei_cb_bl = osc_extent_blocking_cb;
1078         einfo.ei_cb_cp = ldlm_completion_ast;
1079         einfo.ei_cb_gl = ll_glimpse_callback;
1080         einfo.ei_cbdata = NULL;
1082         oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1083         oinfo.oi_lockh = &lockh;
        /* HAS_INTENT makes this a glimpse, not a real lock acquisition */
1085         oinfo.oi_flags = LDLM_FL_HAS_INTENT;
1087         rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1091                 CERROR("obd_enqueue returned rc %d, "
1092                        "returning -EIO\n", rc);
1093                 RETURN(rc > 0 ? -EIO : rc);
1096         lov_stripe_lock(lsm);
1097         memset(&lvb, 0, sizeof(lvb));
1098         obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1099         st->st_size = lvb.lvb_size;
1100         st->st_blocks = lvb.lvb_blocks;
1101         st->st_mtime = lvb.lvb_mtime;
1102         st->st_atime = lvb.lvb_atime;
1103         st->st_ctime = lvb.lvb_ctime;
1104         lov_stripe_unlock(lsm);
1109 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1110 * file (because it prefers KMS over RSS when larger) */
/*
 * Refresh @inode's size via a glimpse: enqueue an intent-only PR extent
 * lock over [0, EOF] so each client holding a conflicting lock answers
 * with its view of the size (see ll_glimpse_callback), then merge the
 * results into the inode with ll_merge_lvb().  Skipped entirely when the
 * MDS size lock is held (LLIF_MDS_SIZE_LOCK) or the file has no objects.
 */
1111 int ll_glimpse_size(struct inode *inode, int ast_flags)
1113         struct ll_inode_info *lli = ll_i2info(inode);
1114         struct ll_sb_info *sbi = ll_i2sbi(inode);
1115         struct lustre_handle lockh = { 0 };
1116         struct ldlm_enqueue_info einfo = { 0 };
1117         struct obd_info oinfo = { { { 0 } } };
1121         if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1124         CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1126         if (!lli->lli_smd) {
1127                 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1131         /* NOTE: this looks like DLM lock request, but it may not be one. Due
1132          *       to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1133          *       won't revoke any conflicting DLM locks held. Instead,
1134          *       ll_glimpse_callback() will be called on each client
1135          *       holding a DLM lock against this file, and resulting size
1136          *       will be returned for each stripe. DLM lock on [0, EOF] is
1137          *       acquired only if there were no conflicting locks. */
1138         einfo.ei_type = LDLM_EXTENT;
1139         einfo.ei_mode = LCK_PR;
1140         einfo.ei_cb_bl = osc_extent_blocking_cb;
1141         einfo.ei_cb_cp = ldlm_completion_ast;
1142         einfo.ei_cb_gl = ll_glimpse_callback;
1143         einfo.ei_cbdata = inode;
1145         oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1146         oinfo.oi_lockh = &lockh;
1147         oinfo.oi_md = lli->lli_smd;
1148         oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
1150         rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1154                 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1155                 RETURN(rc > 0 ? -EIO : rc);
1158         rc = ll_merge_lvb(inode);
1160         CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
1161                i_size_read(inode), (unsigned long long)inode->i_blocks);
/*
 * Take a file extent lock of @mode over *@policy via the data export,
 * then refresh the inode from the merged LVBs under the inode size lock.
 * i_size is only overwritten when the whole file [0, EOF] was locked --
 * see the long comment below about racing with truncate.  Locking is
 * skipped entirely for NOLCK mounts / LL_FILE_IGNORE_LOCK fds, and
 * mmapped files are kept off the lock LRU.
 */
1166 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1167                    struct lov_stripe_md *lsm, int mode,
1168                    ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1171         struct ll_sb_info *sbi = ll_i2sbi(inode);
1173         struct ldlm_enqueue_info einfo = { 0 };
1174         struct obd_info oinfo = { { { 0 } } };
1178         LASSERT(!lustre_handle_is_used(lockh));
1179         LASSERT(lsm != NULL);
1181         /* don't drop the mmapped file to LRU */
1182         if (mapping_mapped(inode->i_mapping))
1183                 ast_flags |= LDLM_FL_NO_LRU;
1185         /* XXX phil: can we do this? won't it screw the file size up? */
1186         if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1187             (sbi->ll_flags & LL_SBI_NOLCK))
1190         CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1191                inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1193         einfo.ei_type = LDLM_EXTENT;
1194         einfo.ei_mode = mode;
1195         einfo.ei_cb_bl = osc_extent_blocking_cb;
1196         einfo.ei_cb_cp = ldlm_completion_ast;
1197         einfo.ei_cb_gl = ll_glimpse_callback;
1198         einfo.ei_cbdata = inode;
1200         oinfo.oi_policy = *policy;
1201         oinfo.oi_lockh = lockh;
1203         oinfo.oi_flags = ast_flags;
1205         rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo, NULL);
        /* the server may have shrunk the granted extent; report it back */
1206         *policy = oinfo.oi_policy;
1210         ll_inode_size_lock(inode, 1);
1211         inode_init_lvb(inode, &lvb);
1212         obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1214         if (policy->l_extent.start == 0 &&
1215             policy->l_extent.end == OBD_OBJECT_EOF) {
1216                 /* vmtruncate()->ll_truncate() first sets the i_size and then
1217                  * the kms under both a DLM lock and the
1218                  * ll_inode_size_lock().  If we don't get the
1219                  * ll_inode_size_lock() here we can match the DLM lock and
1220                  * reset i_size from the kms before the truncating path has
1221                  * updated the kms.  generic_file_write can then trust the
1222                  * stale i_size when doing appending writes and effectively
1223                  * cancel the result of the truncate.  Getting the
1224                  * ll_inode_size_lock() after the enqueue maintains the DLM
1225                  * -> ll_inode_size_lock() acquiring order. */
1226                 i_size_write(inode, lvb.lvb_size);
1227                 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1228                        inode->i_ino, i_size_read(inode));
1232         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1233         LTIME_S(inode->i_atime) = lvb.lvb_atime;
1234         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1236         ll_inode_size_unlock(inode, 1);
/*
 * Release an extent lock previously taken by ll_extent_lock() by cancelling
 * it through obd_cancel().  Mirrors the IGNORE_LOCK / NOLCK bypass in the
 * lock path so lock and unlock stay symmetric.
 */
1241 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1242 struct lov_stripe_md *lsm, int mode,
1243 struct lustre_handle *lockh)
1245 struct ll_sb_info *sbi = ll_i2sbi(inode);
1249 /* XXX phil: can we do this? won't it screw the file size up? */
1250 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1251 (sbi->ll_flags & LL_SBI_NOLCK))
1254 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/*
 * Mark the inode as contended: record the current time and raise
 * LLIF_CONTENDED under lli_lock.  ll_is_file_contended() later uses the
 * timestamp to decide when the contended state expires.
 */
1259 static void ll_set_file_contended(struct inode *inode)
1261 struct ll_inode_info *lli = ll_i2info(inode);
1262 cfs_time_t now = cfs_time_current();
1264 spin_lock(&lli->lli_lock);
1265 lli->lli_contention_time = now;
1266 lli->lli_flags |= LLIF_CONTENDED;
1267 spin_unlock(&lli->lli_lock);
/* Clear the LLIF_CONTENDED flag on the inode, under lli_lock. */
1270 void ll_clear_file_contended(struct inode *inode)
1272 struct ll_inode_info *lli = ll_i2info(inode);
1274 spin_lock(&lli->lli_lock);
1275 lli->lli_flags &= ~LLIF_CONTENDED;
1276 spin_unlock(&lli->lli_lock);
/*
 * Decide whether I/O on this file should fall back to server-side
 * (lockless) locking because the file is contended.  Contention can only
 * be honoured when the server advertises OBD_CONNECT_SRVLOCK and the file
 * is not opened with LL_FILE_IGNORE_LOCK.  The contended state expires
 * after sbi->ll_contention_time seconds, at which point it is cleared.
 * NOTE(review): the return statements are elided in this excerpt.
 */
1279 static int ll_is_file_contended(struct file *file)
1281 struct inode *inode = file->f_dentry->d_inode;
1282 struct ll_inode_info *lli = ll_i2info(inode);
1283 struct ll_sb_info *sbi = ll_i2sbi(inode);
1284 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1287 if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
1288 CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
1289 " osc connect flags = 0x"LPX64"\n",
1290 sbi->ll_lco.lco_flags);
1293 if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
1295 if (lli->lli_flags & LLIF_CONTENDED) {
1296 cfs_time_t cur_time = cfs_time_current();
1297 cfs_time_t retry_time;
/* contention expires ll_contention_time seconds after it was recorded */
1299 retry_time = cfs_time_add(
1300 lli->lli_contention_time,
1301 cfs_time_seconds(sbi->ll_contention_time));
1302 if (cfs_time_after(cur_time, retry_time)) {
1303 ll_clear_file_contended(inode);
/*
 * Try to take a tree lock (client-side DLM extent lock) for an I/O on
 * [start, end].  Appending writes always lock; otherwise locking is
 * skipped when the file is marked contended (the caller then does
 * lockless I/O).  On -EUSERS from ll_tree_lock() the file is marked
 * contended for future calls.
 * \retval 1 tree locked, 0 no lock taken, <0 error (per visible RETURN).
 */
1311 static int ll_file_get_tree_lock(struct ll_lock_tree *tree, struct file *file,
1312 const char *buf, size_t count,
1313 loff_t start, loff_t end, int rw)
1316 int tree_locked = 0;
1318 struct inode * inode = file->f_dentry->d_inode;
1321 append = (rw == OBD_BRW_WRITE) && (file->f_flags & O_APPEND);
1323 if (append || !ll_is_file_contended(file)) {
1324 struct ll_lock_tree_node *node;
/* appends may not be denied on contention; O_NONBLOCK must not wait */
1327 ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
1328 if (file->f_flags & O_NONBLOCK)
1329 ast_flags |= LDLM_FL_BLOCK_NOWAIT;
1330 node = ll_node_from_inode(inode, start, end,
1331 (rw == OBD_BRW_WRITE) ? LCK_PW : LCK_PR);
1336 tree->lt_fd = LUSTRE_FPRIVATE(file);
1337 rc = ll_tree_lock(tree, node, buf, count, ast_flags);
1340 else if (rc == -EUSERS)
1341 ll_set_file_contended(inode);
1345 RETURN(tree_locked);
1351 * Checks if requested extent lock is compatible with a lock under a page.
1353 * Checks if the lock under \a page is compatible with a read or write lock
1354 * (specified by \a rw) for an extent [\a start , \a end].
1356 * \param page the page under which lock is considered
1357 * \param rw OBD_BRW_READ if requested for reading,
1358 * OBD_BRW_WRITE if requested for writing
1359 * \param start start of the requested extent
1360 * \param end end of the requested extent
1361 * \param cookie transparent parameter for passing locking context
1363 * \post result == 1, *cookie == context, appropriate lock is referenced or
1366 * \retval 1 owned lock is reused for the request
1367 * \retval 0 no lock reused for the request
1369 * \see ll_release_short_lock
/* Delegates the compatibility check to obd_reget_short_lock() using the
 * llap cookie stored in the page's ll_async_page private data. */
1371 static int ll_reget_short_lock(struct page *page, int rw,
1372 obd_off start, obd_off end,
1375 struct ll_async_page *llap;
1376 struct obd_export *exp;
1377 struct inode *inode = page->mapping->host;
1381 exp = ll_i2dtexp(inode);
1385 llap = llap_cast_private(page);
1389 RETURN(obd_reget_short_lock(exp, ll_i2info(inode)->lli_smd,
1390 &llap->llap_cookie, rw, start, end,
1395 * Releases a reference to a lock taken in a "fast" way.
1397 * Releases a read or a write (specified by \a rw) lock
1398 * referenced by \a cookie.
1400 * \param inode inode to which data belong
1401 * \param end end of the locked extent
1402 * \param rw OBD_BRW_READ if requested for reading,
1403 * OBD_BRW_WRITE if requested for writing
1404 * \param cookie transparent parameter for passing locking context
1406 * \post appropriate lock is dereferenced
1408 * \see ll_reget_short_lock
/* Thin wrapper over obd_release_short_lock(); an unlock failure is only
 * logged — there is nothing more the caller could do about it here. */
1410 static void ll_release_short_lock(struct inode *inode, obd_off end,
1411 void *cookie, int rw)
1413 struct obd_export *exp;
1416 exp = ll_i2dtexp(inode);
1420 rc = obd_release_short_lock(exp, ll_i2info(inode)->lli_smd, end,
1423 CERROR("unlock failed (%d)\n", rc);
1427 * Checks if requested extent lock is compatible
1428 * with a lock under a page in page cache.
1430 * Checks if a lock under some \a page is compatible with a read or write lock
1431 * (specified by \a rw) for an extent [\a start , \a end].
1433 * \param file the file under which lock is considered
1434 * \param rw OBD_BRW_READ if requested for reading,
1435 * OBD_BRW_WRITE if requested for writing
1436 * \param ppos start of the requested extent
1437 * \param end end of the requested extent
1438 * \param cookie transparent parameter for passing locking context
1439 * \param buf userspace buffer for the data
1441 * \post result == 1, *cookie == context, appropriate lock is referenced
1444 * \retval 1 owned lock is reused for the request
1445 * \retval 0 no lock reused for the request
1447 * \see ll_file_put_fast_lock
/* Fast path: only attempted when the user buffer is not itself mmapped
 * (ll_region_mapped check), by probing the cached page at ppos. */
1449 static inline int ll_file_get_fast_lock(struct file *file,
1450 obd_off ppos, obd_off end,
1451 char *buf, void **cookie, int rw)
1458 if (!ll_region_mapped((unsigned long)buf, end - ppos)) {
1459 page = find_lock_page(file->f_dentry->d_inode->i_mapping,
1460 ppos >> CFS_PAGE_SHIFT);
1462 if (ll_reget_short_lock(page, rw, ppos, end, cookie))
/* drop the reference taken by find_lock_page() */
1466 page_cache_release(page);
1474 * Releases a reference to a lock taken in a "fast" way.
1476 * Releases a read or a write (specified by \a rw) lock
1477 * referenced by \a cookie.
1479 * \param inode inode to which data belong
1480 * \param end end of the locked extent
1481 * \param rw OBD_BRW_READ if requested for reading,
1482 * OBD_BRW_WRITE if requested for writing
1483 * \param cookie transparent parameter for passing locking context
1485 * \post appropriate lock is dereferenced
1487 * \see ll_file_get_fast_lock
/* Symmetric release for ll_file_get_fast_lock(); just forwards to
 * ll_release_short_lock(). */
1489 static inline void ll_file_put_fast_lock(struct inode *inode, obd_off end,
1490 void *cookie, int rw)
1492 ll_release_short_lock(inode, end, cookie, rw);
/* How a lock was obtained for an I/O: not at all (lockless/server-side),
 * via the fast per-page path, or via the lock tree. */
1495 enum ll_lock_style {
1496 LL_LOCK_STYLE_NOLOCK = 0,
1497 LL_LOCK_STYLE_FASTLOCK = 1,
1498 LL_LOCK_STYLE_TREELOCK = 2
1502 * Checks if requested extent lock is compatible with a lock
1503 * under a page cache page.
1505 * Checks if the lock under \a page is compatible with a read or write lock
1506 * (specified by \a rw) for an extent [\a start , \a end].
1508 * \param file file under which I/O is processed
1509 * \param rw OBD_BRW_READ if requested for reading,
1510 * OBD_BRW_WRITE if requested for writing
1511 * \param ppos start of the requested extent
1512 * \param end end of the requested extent
1513 * \param cookie transparent parameter for passing locking context
1514 * (only used with LL_LOCK_STYLE_FASTLOCK)
1515 * \param tree lock tree (only used with LL_LOCK_STYLE_TREELOCK)
1516 * \param buf userspace buffer for the data
1518 * \retval LL_LOCK_STYLE_FASTLOCK owned lock is reused through fast lock
1519 * \retval LL_LOCK_STYLE_TREELOCK got a lock through tree lock
1520 * \retval LL_LOCK_STYLE_NOLOCK got no lock
1522 * \see ll_file_put_lock
/* Tries the fast per-page lock first, then falls back to the tree lock;
 * NOTE(review): the negative-error return path is partly elided here. */
1524 static inline int ll_file_get_lock(struct file *file, obd_off ppos,
1525 obd_off end, char *buf, void **cookie,
1526 struct ll_lock_tree *tree, int rw)
1532 if (ll_file_get_fast_lock(file, ppos, end, buf, cookie, rw))
1533 RETURN(LL_LOCK_STYLE_FASTLOCK);
1535 rc = ll_file_get_tree_lock(tree, file, buf, ppos - end, ppos, end, rw);
1536 /* rc: 1 for tree lock, 0 for no lock, <0 for error */
1539 RETURN(LL_LOCK_STYLE_TREELOCK);
1541 RETURN(LL_LOCK_STYLE_NOLOCK);
1544 /* an error happened if we reached this point, rc = -errno here */
1549 * Drops the lock taken by ll_file_get_lock.
1551 * Releases a read or a write (specified by \a rw) lock
1552 * referenced by \a tree or \a cookie.
1554 * \param inode inode to which data belong
1555 * \param end end of the locked extent
1556 * \param lockstyle facility through which the lock was taken
1557 * \param rw OBD_BRW_READ if requested for reading,
1558 * OBD_BRW_WRITE if requested for writing
1559 * \param cookie transparent parameter for passing locking context
1560 * (only used with LL_LOCK_STYLE_FASTLOCK)
1561 * \param tree lock tree (only used with LL_LOCK_STYLE_TREELOCK)
1563 * \post appropriate lock is dereferenced
1565 * \see ll_file_get_lock
/* Dispatch on the lock style recorded by ll_file_get_lock(); an unknown
 * style is a programming error and is reported via CERROR. */
1567 static inline void ll_file_put_lock(struct inode *inode, obd_off end,
1568 enum ll_lock_style lock_style,
1569 void *cookie, struct ll_lock_tree *tree,
1573 switch (lock_style) {
1574 case LL_LOCK_STYLE_TREELOCK:
1575 ll_tree_unlock(tree);
1577 case LL_LOCK_STYLE_FASTLOCK:
1578 ll_file_put_fast_lock(inode, end, cookie, rw);
1581 CERROR("invalid locking style (%d)\n", lock_style);
/*
 * Read from a file through the page cache.
 *
 * Per-chunk loop (chunk bounded by sbi->ll_max_rw_chunk and stripe
 * boundaries): take a lock via ll_file_get_lock(), reconcile i_size with
 * the known-minimum-size (kms) from the merged LVB, then either
 * generic_file_read() (locked path, with llite's own read-ahead window) or
 * ll_file_lockless_io() (contended path).  Files with no objects are
 * served as zero-filled data up to i_size (bug 6243).
 * NOTE(review): loop structure, some declarations and error paths are
 * elided in this excerpt; comments describe only the visible code.
 */
1585 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1588 struct inode *inode = file->f_dentry->d_inode;
1589 struct ll_inode_info *lli = ll_i2info(inode);
1590 struct lov_stripe_md *lsm = lli->lli_smd;
1591 struct ll_sb_info *sbi = ll_i2sbi(inode);
1592 struct ll_lock_tree tree;
1594 struct ll_ra_read bead;
1597 ssize_t retval, chunk, sum = 0;
1603 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1604 inode->i_ino, inode->i_generation, inode, count, *ppos);
1605 /* "If nbyte is 0, read() will return 0 and have no other results."
1606 * -- Single Unix Spec */
1610 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1613 /* Read on file with no objects should return zero-filled
1614 * buffers up to file size (we can get non-zero sizes with
1615 * mknod + truncate, then opening file for read. This is a
1616 * common pattern in NFS case, it seems). Bug 6243 */
1618 /* Since there are no objects on OSTs, we have nothing to get
1619 * lock on and so we are forced to access inode->i_size
1622 /* Read beyond end of file */
1623 if (*ppos >= i_size_read(inode))
/* clamp the request to the current file size */
1626 if (count > i_size_read(inode) - *ppos)
1627 count = i_size_read(inode) - *ppos;
1628 /* Make sure to correctly adjust the file pos pointer for
1630 notzeroed = clear_user(buf, count);
1638 if (sbi->ll_max_rw_chunk != 0) {
1639 /* first, let's know the end of the current stripe */
1641 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END, &end);
1643 /* correct, the end is beyond the request */
1644 if (end > *ppos + count - 1)
1645 end = *ppos + count - 1;
1647 /* and chunk shouldn't be too large even if striping is wide */
1648 if (end - *ppos > sbi->ll_max_rw_chunk)
1649 end = *ppos + sbi->ll_max_rw_chunk - 1;
1651 end = *ppos + count - 1;
1654 lock_style = ll_file_get_lock(file, (obd_off)(*ppos), end,
1655 buf, &cookie, &tree, OBD_BRW_READ);
1657 GOTO(out, retval = lock_style);
1659 ll_inode_size_lock(inode, 1);
1661 * Consistency guarantees: following possibilities exist for the
1662 * relation between region being read and real file size at this
1665 * (A): the region is completely inside of the file;
1667 * (B-x): x bytes of region are inside of the file, the rest is
1670 * (C): the region is completely outside of the file.
1672 * This classification is stable under DLM lock acquired by
1673 * ll_tree_lock() above, because to change class, other client has to
1674 * take DLM lock conflicting with our lock. Also, any updates to
1675 * ->i_size by other threads on this client are serialized by
1676 * ll_inode_size_lock(). This guarantees that short reads are handled
1677 * correctly in the face of concurrent writes and truncates.
1679 inode_init_lvb(inode, &lvb);
1680 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1682 if (*ppos + count - 1 > kms) {
1683 /* A glimpse is necessary to determine whether we return a
1684 * short read (B) or some zeroes at the end of the buffer (C) */
1685 ll_inode_size_unlock(inode, 1);
1686 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1688 if (lock_style != LL_LOCK_STYLE_NOLOCK)
1689 ll_file_put_lock(inode, end, lock_style,
1690 cookie, &tree, OBD_BRW_READ);
1694 /* region is within kms and, hence, within real file size (A).
1695 * We need to increase i_size to cover the read region so that
1696 * generic_file_read() will do its job, but that doesn't mean
1697 * the kms size is _correct_, it is only the _minimum_ size.
1698 * If someone does a stat they will get the correct size which
1699 * will always be >= the kms value here. b=11081 */
1700 if (i_size_read(inode) < kms)
1701 i_size_write(inode, kms);
1702 ll_inode_size_unlock(inode, 1);
1705 chunk = end - *ppos + 1;
1706 CDEBUG(D_INODE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1707 inode->i_ino, chunk, *ppos, i_size_read(inode));
1709 if (lock_style != LL_LOCK_STYLE_NOLOCK) {
1710 /* turn off the kernel's read-ahead */
1711 file->f_ra.ra_pages = 0;
1713 /* initialize read-ahead window once per syscall */
1716 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1717 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1718 ll_ra_read_in(file, &bead);
1722 file_accessed(file);
1723 retval = generic_file_read(file, buf, chunk, ppos);
1724 ll_file_put_lock(inode, end, lock_style, cookie, &tree,
/* contended/no-lock path: do the I/O with server-side locking */
1727 retval = ll_file_lockless_io(file, buf, chunk, ppos, READ);
1730 ll_rw_stats_tally(sbi, current->pid, file, chunk, 0);
/* a full chunk was read and more remains: continue with the next chunk */
1736 if (retval == chunk && count > 0)
1742 ll_ra_read_ex(file, &bead);
1743 retval = (sum > 0) ? sum : retval;
1748 * Write to a file (through the page cache).
/*
 * Chunked write path, serialized per-inode by lli_write_sem.  For O_APPEND
 * the whole [0, EOF] range is locked and *ppos is reset to i_size under
 * that lock; otherwise the chunk is bounded by stripe end and
 * sbi->ll_max_rw_chunk.  Writes past ll_file_maxbytes() raise SIGXFSZ and
 * fail with -EFBIG.  With a tree lock the data goes through
 * generic_file_write(), otherwise through ll_file_lockless_io().
 * NOTE(review): the surrounding loop and some declarations are elided in
 * this excerpt.
 */
1750 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1753 struct inode *inode = file->f_dentry->d_inode;
1754 struct ll_sb_info *sbi = ll_i2sbi(inode);
1755 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1756 struct ll_lock_tree tree;
1757 loff_t maxbytes = ll_file_maxbytes(inode);
1758 loff_t lock_start, lock_end, end;
1759 ssize_t retval, chunk, sum = 0;
1763 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1764 inode->i_ino, inode->i_generation, inode, count, *ppos);
1766 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1768 /* POSIX, but surprised the VFS doesn't check this already */
1772 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1773 * called on the file, don't fail the below assertion (bug 2388). */
1774 if (file->f_flags & O_LOV_DELAY_CREATE &&
1775 ll_i2info(inode)->lli_smd == NULL)
1778 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* one writer at a time per inode on this client */
1780 down(&ll_i2info(inode)->lli_write_sem);
1783 chunk = 0; /* just to fix gcc's warning */
1784 end = *ppos + count - 1;
1786 if (file->f_flags & O_APPEND) {
1788 lock_end = OBD_OBJECT_EOF;
1789 } else if (sbi->ll_max_rw_chunk != 0) {
1790 /* first, let's know the end of the current stripe */
1792 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1795 /* correct, the end is beyond the request */
1796 if (end > *ppos + count - 1)
1797 end = *ppos + count - 1;
1799 /* and chunk shouldn't be too large even if striping is wide */
1800 if (end - *ppos > sbi->ll_max_rw_chunk)
1801 end = *ppos + sbi->ll_max_rw_chunk - 1;
1806 lock_end = *ppos + count - 1;
1809 tree_locked = ll_file_get_tree_lock(&tree, file, buf, count,
1810 lock_start, lock_end, OBD_BRW_WRITE);
1811 if (tree_locked < 0)
1812 GOTO(out, retval = tree_locked);
1814 /* This is ok, g_f_w will overwrite this under i_sem if it races
1815 * with a local truncate, it just makes our maxbyte checking easier.
1816 * The i_size value gets updated in ll_extent_lock() as a consequence
1817 * of the [0,EOF] extent lock we requested above. */
1818 if (file->f_flags & O_APPEND) {
1819 *ppos = i_size_read(inode);
1820 end = *ppos + count - 1;
1823 if (*ppos >= maxbytes) {
1824 send_sig(SIGXFSZ, current, 0);
1825 GOTO(out_unlock, retval = -EFBIG);
/* clamp the chunk so it never crosses the per-file size limit */
1827 if (end > maxbytes - 1)
1830 /* generic_file_write handles O_APPEND after getting i_mutex */
1831 chunk = end - *ppos + 1;
1832 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1833 inode->i_ino, chunk, *ppos);
1835 retval = generic_file_write(file, buf, chunk, ppos);
1837 retval = ll_file_lockless_io(file, (char*)buf, chunk,
1839 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
1843 ll_tree_unlock(&tree);
/* a full chunk was written and more remains: continue looping */
1850 if (retval == chunk && count > 0)
1854 up(&ll_i2info(inode)->lli_write_sem);
1856 retval = (sum > 0) ? sum : retval;
1857 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1858 retval > 0 ? retval : 0);
1863 * Send file content (through pagecache) somewhere with helper
/*
 * sendfile(2) entry point: takes a PR tree lock over the requested range,
 * reconciles i_size with kms exactly like ll_file_read() (see the
 * consistency comment below), sets up llite's read-ahead window, and
 * delegates the transfer to generic_file_sendfile().  Files with no
 * objects skip locking entirely.
 */
1865 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1866 read_actor_t actor, void *target)
1868 struct inode *inode = in_file->f_dentry->d_inode;
1869 struct ll_inode_info *lli = ll_i2info(inode);
1870 struct lov_stripe_md *lsm = lli->lli_smd;
1871 struct ll_lock_tree tree;
1872 struct ll_lock_tree_node *node;
1874 struct ll_ra_read bead;
1879 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1880 inode->i_ino, inode->i_generation, inode, count, *ppos);
1882 /* "If nbyte is 0, read() will return 0 and have no other results."
1883 * -- Single Unix Spec */
1887 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1888 /* turn off the kernel's read-ahead */
1889 in_file->f_ra.ra_pages = 0;
1891 /* File with no objects, nothing to lock */
1893 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1895 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1897 RETURN(PTR_ERR(node));
1899 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1900 rc = ll_tree_lock(&tree, node, NULL, count,
1901 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1905 ll_clear_file_contended(inode);
1906 ll_inode_size_lock(inode, 1);
1908 * Consistency guarantees: following possibilities exist for the
1909 * relation between region being read and real file size at this
1912 * (A): the region is completely inside of the file;
1914 * (B-x): x bytes of region are inside of the file, the rest is
1917 * (C): the region is completely outside of the file.
1919 * This classification is stable under DLM lock acquired by
1920 * ll_tree_lock() above, because to change class, other client has to
1921 * take DLM lock conflicting with our lock. Also, any updates to
1922 * ->i_size by other threads on this client are serialized by
1923 * ll_inode_size_lock(). This guarantees that short reads are handled
1924 * correctly in the face of concurrent writes and truncates.
1926 inode_init_lvb(inode, &lvb);
1927 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1929 if (*ppos + count - 1 > kms) {
1930 /* A glimpse is necessary to determine whether we return a
1931 * short read (B) or some zeroes at the end of the buffer (C) */
1932 ll_inode_size_unlock(inode, 1);
1933 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1937 /* region is within kms and, hence, within real file size (A) */
1938 i_size_write(inode, kms);
1939 ll_inode_size_unlock(inode, 1);
1942 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1943 inode->i_ino, count, *ppos, i_size_read(inode));
/* set up llite's own read-ahead window for this transfer */
1945 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1946 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1947 ll_ra_read_in(in_file, &bead);
1949 file_accessed(in_file);
1950 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1951 ll_ra_read_ex(in_file, &bead);
1954 ll_tree_unlock(&tree);
/*
 * LL_IOC_RECREATE_OBJ ioctl helper: re-create a missing OST object for
 * this file.  Copies a struct ll_recreate_obj from userspace, builds an
 * obdo carrying the object id/group/ost index with OBD_FL_RECREATE_OBJS
 * set, and calls obd_create() on a scratch copy of the stripe metadata.
 * Requires CAP_SYS_ADMIN; runs under lli_size_sem.
 */
1958 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1961 struct ll_inode_info *lli = ll_i2info(inode);
1962 struct obd_export *exp = ll_i2dtexp(inode);
1963 struct ll_recreate_obj ucreatp;
1964 struct obd_trans_info oti = { 0 };
1965 struct obdo *oa = NULL;
1968 struct lov_stripe_md *lsm, *lsm2;
1971 if (!capable (CAP_SYS_ADMIN))
1974 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1975 sizeof(struct ll_recreate_obj));
1983 down(&lli->lli_size_sem);
1986 GOTO(out, rc = -ENOENT);
1987 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1988 (lsm->lsm_stripe_count));
1990 OBD_ALLOC(lsm2, lsm_size);
1992 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the OST index for the recreate request */
1994 oa->o_id = ucreatp.lrc_id;
1995 oa->o_gr = ucreatp.lrc_group;
1996 oa->o_nlink = ucreatp.lrc_ost_idx;
1997 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1998 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1999 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2000 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
/* work on a copy of the stripe md so the in-core lsm is untouched */
2002 memcpy(lsm2, lsm, lsm_size);
2003 rc = obd_create(exp, oa, &lsm2, &oti);
2005 OBD_FREE(lsm2, lsm_size);
2008 up(&lli->lli_size_sem);
/*
 * Apply user-supplied striping (lov_user_md) to a file by replaying an
 * IT_OPEN intent carrying the EA, via ll_intent_file_open().  Fails with
 * -EEXIST-style early return if the file already has stripe metadata
 * (checked under lli_size_sem).  On success the transient open handle is
 * closed again through ll_release_openhandle().
 */
2013 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
2014 int flags, struct lov_user_md *lum, int lum_size)
2016 struct ll_inode_info *lli = ll_i2info(inode);
2017 struct lov_stripe_md *lsm;
2018 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
2022 down(&lli->lli_size_sem);
2025 up(&lli->lli_size_sem);
2026 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
2031 rc = ll_intent_file_open(file, lum, lum_size, &oit);
2034 if (it_disposition(&oit, DISP_LOOKUP_NEG))
2035 GOTO(out_req_free, rc = -ENOENT);
2036 rc = oit.d.lustre.it_status;
2038 GOTO(out_req_free, rc);
2040 ll_release_openhandle(file->f_dentry, &oit);
2043 up(&lli->lli_size_sem);
2044 ll_intent_release(&oit);
2047 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch a named file's LOV EA from the MDS (md_getattr_name with
 * OBD_MD_FLEASIZE|OBD_MD_FLDIREA), byte-swap it to host endianness if
 * needed, and return it to the caller via *lmmp / *lmm_size.  The request
 * is returned through *request so the caller owns its lifetime.  For
 * LOV_MAGIC_JOIN files the on-wire lov_mds_md is converted into a
 * lov_user_md_join with per-stripe extent boundaries filled in from the
 * unpacked stripe metadata.
 */
2051 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2052 struct lov_mds_md **lmmp, int *lmm_size,
2053 struct ptlrpc_request **request)
2055 struct ll_sb_info *sbi = ll_i2sbi(inode);
2056 struct mdt_body *body;
2057 struct lov_mds_md *lmm = NULL;
2058 struct ptlrpc_request *req = NULL;
2059 struct obd_capa *oc;
2062 rc = ll_get_max_mdsize(sbi, &lmmsize);
2066 oc = ll_mdscapa_get(inode);
2067 rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
2068 oc, filename, strlen(filename) + 1,
2069 OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
2070 ll_i2suppgid(inode), &req);
2073 CDEBUG(D_INFO, "md_getattr_name failed "
2074 "on %s: rc %d\n", filename, rc);
2078 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2079 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2081 lmmsize = body->eadatasize;
/* no EA present (or empty) means the file has no striping to report */
2083 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2085 GOTO(out, rc = -ENODATA);
2088 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2089 LASSERT(lmm != NULL);
2092 * This is coming from the MDS, so is probably in
2093 * little endian. We convert it to host endian before
2094 * passing it to userspace.
2096 if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
2097 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
2098 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
2099 } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
2100 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
2103 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
2104 struct lov_stripe_md *lsm;
2105 struct lov_user_md_join *lmj;
2106 int lmj_size, i, aindex = 0;
2108 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
2110 GOTO(out, rc = -ENOMEM);
2111 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
2113 GOTO(out_free_memmd, rc);
2115 lmj_size = sizeof(struct lov_user_md_join) +
2116 lsm->lsm_stripe_count *
2117 sizeof(struct lov_user_ost_data_join);
2118 OBD_ALLOC(lmj, lmj_size);
2120 GOTO(out_free_memmd, rc = -ENOMEM);
2122 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
2123 for (i = 0; i < lsm->lsm_stripe_count; i++) {
/* walk the extent array; aindex tracks the extent covering stripe i */
2124 struct lov_extent *lex =
2125 &lsm->lsm_array->lai_ext_array[aindex];
2127 if (lex->le_loi_idx + lex->le_stripe_count <= i)
2129 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
2130 LPU64" len %d\n", aindex, i,
2131 lex->le_start, (int)lex->le_len);
2132 lmj->lmm_objects[i].l_extent_start =
/* le_len == -1 denotes an open-ended (to-EOF) extent */
2135 if ((int)lex->le_len == -1)
2136 lmj->lmm_objects[i].l_extent_end = -1;
2138 lmj->lmm_objects[i].l_extent_end =
2139 lex->le_start + lex->le_len;
2140 lmj->lmm_objects[i].l_object_id =
2141 lsm->lsm_oinfo[i]->loi_id;
2142 lmj->lmm_objects[i].l_object_gr =
2143 lsm->lsm_oinfo[i]->loi_gr;
2144 lmj->lmm_objects[i].l_ost_gen =
2145 lsm->lsm_oinfo[i]->loi_ost_gen;
2146 lmj->lmm_objects[i].l_ost_idx =
2147 lsm->lsm_oinfo[i]->loi_ost_idx;
2149 lmm = (struct lov_mds_md *)lmj;
2152 obd_free_memmd(sbi->ll_dt_exp, &lsm);
2156 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA ioctl: set the file's striping EA verbatim from a
 * user-supplied lov_user_md (one lov_user_ost_data entry).  Admin-only
 * (CAP_SYS_ADMIN); the kernel copy is freed on every path.
 */
2161 static int ll_lov_setea(struct inode *inode, struct file *file,
2164 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2165 struct lov_user_md *lump;
2166 int lum_size = sizeof(struct lov_user_md) +
2167 sizeof(struct lov_user_ost_data);
2171 if (!capable (CAP_SYS_ADMIN))
2174 OBD_ALLOC(lump, lum_size);
2178 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
2180 OBD_FREE(lump, lum_size);
2184 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
2186 OBD_FREE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl: copy the lov_user_md from userspace, apply
 * it via ll_lov_setstripe_ea_info(), then echo the resulting striping
 * back to the caller's buffer through LL_IOC_LOV_GETSTRIPE (after zeroing
 * lmm_stripe_count so the copy-out size is computed fresh).
 */
2190 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2193 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
2195 int flags = FMODE_WRITE;
2198 /* Bug 1152: copy properly when this is no longer true */
2199 LASSERT(sizeof(lum) == sizeof(*lump));
2200 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
2201 rc = copy_from_user(&lum, lump, sizeof(lum));
2205 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
2207 put_user(0, &lump->lmm_stripe_count);
2208 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
2209 0, ll_i2info(inode)->lli_smd, lump);
/*
 * LL_IOC_LOV_GETSTRIPE ioctl: return the file's in-core stripe metadata
 * to userspace via obd_iocontrol() on the data export.
 */
2214 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
2216 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2221 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
2228 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2229 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
2230 .end = OBD_OBJECT_EOF}};
2231 struct lustre_handle lockh = { 0 };
2232 struct ll_inode_info *lli = ll_i2info(inode);
2233 struct lov_stripe_md *lsm = lli->lli_smd;
2237 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2241 policy.l_extent.gid = arg;
2242 if (file->f_flags & O_NONBLOCK)
2243 flags = LDLM_FL_BLOCK_NOWAIT;
2245 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
2249 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
2251 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/*
 * LL_IOC_GROUP_UNLOCK: drop the group lock taken by ll_get_grouplock().
 * Rejects an unlock when no group lock is held or when the caller's group
 * id does not match the one recorded in fd_gid; clears the fd flags and
 * the saved lock handle on the way out.
 */
2256 static int ll_put_grouplock(struct inode *inode, struct file *file,
2259 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2260 struct ll_inode_info *lli = ll_i2info(inode);
2261 struct lov_stripe_md *lsm = lli->lli_smd;
2265 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2266 /* Ugh, it's already unlocked. */
2270 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
2273 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
2275 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
2280 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/*
 * Validate a file-join request before any locking is done: the server
 * must advertise LL_SBI_JOIN, both inodes must be regular files, a file
 * may not be joined to itself, and the head's size must be a multiple of
 * JOIN_FILE_ALIGN (64K).
 */
2285 static int join_sanity_check(struct inode *head, struct inode *tail)
2288 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2289 CERROR("server do not support join \n");
2292 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2293 CERROR("tail ino %lu and ino head %lu must be regular\n",
2294 head->i_ino, tail->i_ino);
2297 if (head->i_ino == tail->i_ino) {
2298 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2301 if (i_size_read(head) % JOIN_FILE_ALIGN) {
2302 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
/*
 * Issue the join to the MDS: an IT_OPEN intent with O_JOIN_FILE and an
 * LDLM_IBITS/LCK_CW enqueue naming the tail file, carrying the head's
 * current size as intent data.  Any lock granted with the reply is
 * released immediately, as is the transient open handle.
 */
2308 static int join_file(struct inode *head_inode, struct file *head_filp,
2309 struct file *tail_filp)
2311 struct dentry *tail_dentry = tail_filp->f_dentry;
2312 struct lookup_intent oit = {.it_op = IT_OPEN,
2313 .it_flags = head_filp->f_flags|O_JOIN_FILE};
2314 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
2315 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL };
2317 struct lustre_handle lockh;
2318 struct md_op_data *op_data;
2323 tail_dentry = tail_filp->f_dentry;
/* the head's size is passed to the MDS as the join offset */
2325 data = i_size_read(head_inode);
2326 op_data = ll_prep_md_op_data(NULL, head_inode,
2327 tail_dentry->d_parent->d_inode,
2328 tail_dentry->d_name.name,
2329 tail_dentry->d_name.len, 0,
2330 LUSTRE_OPC_ANY, &data);
2331 if (IS_ERR(op_data))
2332 RETURN(PTR_ERR(op_data));
2334 rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
2335 op_data, &lockh, NULL, 0, 0);
2337 ll_finish_md_op_data(op_data);
2341 rc = oit.d.lustre.it_status;
2343 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2344 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2345 ptlrpc_req_finished((struct ptlrpc_request *)
2346 oit.d.lustre.it_data);
2350 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2352 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2353 oit.d.lustre.it_lock_mode = 0;
2355 ll_release_openhandle(head_filp->f_dentry, &oit);
2357 ll_intent_release(&oit);
/*
 * LL_IOC_JOIN: append the file named \a filename_tail to \a head.  Opens
 * the tail, EX-locks both files in ascending-inode order to avoid
 * deadlock, runs join_sanity_check(), then performs the join via
 * join_file().  Cleanup unwinds through cleanup_phase: unlock both trees,
 * cancel unused locks, close the tail, and on success drop the head's now
 * stale in-core stripe metadata so it is refetched.
 */
2361 static int ll_file_join(struct inode *head, struct file *filp,
2362 char *filename_tail)
2364 struct inode *tail = NULL, *first = NULL, *second = NULL;
2365 struct dentry *tail_dentry;
2366 struct file *tail_filp, *first_filp, *second_filp;
2367 struct ll_lock_tree first_tree, second_tree;
2368 struct ll_lock_tree_node *first_node, *second_node;
2369 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2370 int rc = 0, cleanup_phase = 0;
2373 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2374 head->i_ino, head->i_generation, head, filename_tail);
2376 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2377 if (IS_ERR(tail_filp)) {
2378 CERROR("Can not open tail file %s", filename_tail);
2379 rc = PTR_ERR(tail_filp);
2382 tail = igrab(tail_filp->f_dentry->d_inode);
2384 tlli = ll_i2info(tail);
2385 tail_dentry = tail_filp->f_dentry;
2386 LASSERT(tail_dentry);
2389 /*reorder the inode for lock sequence*/
2390 first = head->i_ino > tail->i_ino ? head : tail;
2391 second = head->i_ino > tail->i_ino ? tail : head;
2392 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2393 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2395 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2396 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
/* EX-lock "first" then "second" over the whole [0, EOF] range */
2397 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2398 if (IS_ERR(first_node)){
2399 rc = PTR_ERR(first_node);
2402 first_tree.lt_fd = first_filp->private_data;
2403 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2408 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2409 if (IS_ERR(second_node)){
2410 rc = PTR_ERR(second_node);
2413 second_tree.lt_fd = second_filp->private_data;
2414 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2419 rc = join_sanity_check(head, tail);
2423 rc = join_file(head, filp, tail_filp);
/* unwind: fall through from the deepest phase reached (no breaks above) */
2427 switch (cleanup_phase) {
2429 ll_tree_unlock(&second_tree);
2430 obd_cancel_unused(ll_i2dtexp(second),
2431 ll_i2info(second)->lli_smd, 0, NULL);
2433 ll_tree_unlock(&first_tree);
2434 obd_cancel_unused(ll_i2dtexp(first),
2435 ll_i2info(first)->lli_smd, 0, NULL);
2437 filp_close(tail_filp, 0);
2440 if (head && rc == 0) {
/* the head's stripe layout changed on the MDS; drop the cached copy */
2441 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2443 hlli->lli_smd = NULL;
2448 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/*
 * Release an MDS open handle that was created by an intent open but never
 * consumed by ll_file_open() (e.g. a lookup intent that opened the file
 * only to read its attributes).
 *
 * NOTE(review): this extract is missing lines (opening brace, local
 * declarations, the early-return paths and the final RETURN); comments
 * describe only the code that is visible here.
 */
2454 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2456         struct inode *inode = dentry->d_inode;
2457         struct obd_client_handle *och;
/* The filesystem root never holds a stray open handle — nothing to do. */
2463         /* Root ? Do nothing. */
2464         if (dentry->d_inode->i_sb->s_root == dentry)
2467         /* No open handle to close? Move away */
2468         if (!it_disposition(it, DISP_OPEN_OPEN))
/* By this point the open must have succeeded on the MDS side. */
2471         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2473         OBD_ALLOC(och, sizeof(*och));
2475                 GOTO(out, rc = -ENOMEM);
/* Fill the client handle from the intent, then close it on the MDS. */
2477         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2478                     ll_i2info(inode), it, och);
2480         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Drop the request reference ll_file_open() would otherwise have owned. */
2483         /* this one is in place of ll_file_open */
2484         ptlrpc_req_finished(it->d.lustre.it_data);
2485         it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * ioctl entry point for regular files: dispatches Lustre-private ioctls
 * (striping, flags, group locks, join, statfs, ...), forwards ext3
 * compatibility ioctls, and finally tries dynamically registered handlers
 * via ll_iocontrol_call() before falling back to obd_iocontrol().
 *
 * NOTE(review): several lines of the original are missing from this
 * extract (the opening brace, some case labels, RETURN statements and the
 * closing of the switch); comments cover only what is visible.
 */
2489 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2492         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2496         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2497                inode->i_generation, inode, cmd);
2498         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
/* tty ioctls can reach us through bad fds; reject them outright. */
2500         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2501         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2505         case LL_IOC_GETFLAGS:
2506                 /* Get the current value of the file flags */
2507                 return put_user(fd->fd_flags, (int *)arg);
2508         case LL_IOC_SETFLAGS:
2509         case LL_IOC_CLRFLAGS:
2510                 /* Set or clear specific file flags */
2511                 /* XXX This probably needs checks to ensure the flags are
2512                  * not abused, and to handle any flag side effects.
2514                 if (get_user(flags, (int *) arg))
/* LL_FILE_IGNORE_LOCK is only meaningful for O_DIRECT I/O. */
2517                 if (cmd == LL_IOC_SETFLAGS) {
2518                         if ((flags & LL_FILE_IGNORE_LOCK) &&
2519                             !(file->f_flags & O_DIRECT)) {
2520                                 CERROR("%s: unable to disable locking on "
2521                                        "non-O_DIRECT file\n", current->comm);
2525                         fd->fd_flags |= flags;
2527                         fd->fd_flags &= ~flags;
2530         case LL_IOC_LOV_SETSTRIPE:
2531                 RETURN(ll_lov_setstripe(inode, file, arg));
2532         case LL_IOC_LOV_SETEA:
2533                 RETURN(ll_lov_setea(inode, file, arg));
2534         case LL_IOC_LOV_GETSTRIPE:
2535                 RETURN(ll_lov_getstripe(inode, arg));
2536         case LL_IOC_RECREATE_OBJ:
2537                 RETURN(ll_lov_recreate_obj(inode, file, arg));
/* ext3 flag/version ioctls are supported for tool compatibility. */
2538         case EXT3_IOC_GETFLAGS:
2539         case EXT3_IOC_SETFLAGS:
2540                 RETURN(ll_iocontrol(inode, file, cmd, arg));
2541         case EXT3_IOC_GETVERSION_OLD:
2542         case EXT3_IOC_GETVERSION:
2543                 RETURN(put_user(inode->i_generation, (int *)arg));
/* Presumably the LL_IOC_JOIN case label sits on a missing line here. */
2548                 ftail = getname((const char *)arg);
2550                         RETURN(PTR_ERR(ftail));
2551                 rc = ll_file_join(inode, file, ftail);
2555         case LL_IOC_GROUP_LOCK:
2556                 RETURN(ll_get_grouplock(inode, file, arg));
2557         case LL_IOC_GROUP_UNLOCK:
2558                 RETURN(ll_put_grouplock(inode, file, arg));
2559         case IOC_OBD_STATFS:
2560                 RETURN(ll_obd_statfs(inode, (void *)arg));
2562         /* We need to special case any other ioctls we want to handle,
2563          * to send them to the MDS/OST as appropriate and to properly
2564          * network encode the arg field.
2565         case EXT3_IOC_SETVERSION_OLD:
2566         case EXT3_IOC_SETVERSION:
2568         case LL_IOC_FLUSHCTX:
2569                 RETURN(ll_flush_ctx(inode));
/* Unknown command: give dynamically registered handlers a chance first. */
2574                     ll_iocontrol_call(inode, file, cmd, arg, &err))
2577                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek entry point.  For SEEK_END the cluster-wide file size must be
 * fetched (glimpse) before the offset can be computed; SEEK_CUR/SEEK_SET
 * are purely local.
 *
 * NOTE(review): the opening brace, some declarations and the error/return
 * tail are missing from this extract.
 */
2583 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2585         struct inode *inode = file->f_dentry->d_inode;
2586         struct ll_inode_info *lli = ll_i2info(inode);
2587         struct lov_stripe_md *lsm = lli->lli_smd;
/* retval here is only used for the trace message below. */
2590         retval = offset + ((origin == 2) ? i_size_read(inode) :
2591                            (origin == 1) ? file->f_pos : 0);
2592         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2593                inode->i_ino, inode->i_generation, inode, retval, retval,
2594                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2595         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2597         if (origin == 2) { /* SEEK_END */
2598                 int nonblock = 0, rc;
/* O_NONBLOCK seeks must not wait for a DLM lock during the glimpse. */
2600                 if (file->f_flags & O_NONBLOCK)
2601                         nonblock = LDLM_FL_BLOCK_NOWAIT;
2604                 rc = ll_glimpse_size(inode, nonblock);
/* Read i_size under the inode size lock for a consistent value. */
2609                 ll_inode_size_lock(inode, 0);
2610                 offset += i_size_read(inode);
2611                 ll_inode_size_unlock(inode, 0);
2612         } else if (origin == 1) { /* SEEK_CUR */
2613                 offset += file->f_pos;
/* Accept only offsets within the client-visible maximum file size. */
2617         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2618                 if (offset != file->f_pos) {
2619                         file->f_pos = offset;
2620 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2622                         file->f_version = ++event;
/*
 * fsync entry point: wait for in-flight page I/O, surface any recorded
 * async write errors, sync metadata via the MDS (md_sync) and, for the
 * visible data path, sync objects on the OSTs via obd_sync().
 *
 * NOTE(review): multiple lines are missing from this extract (opening
 * brace, rc/err declarations, the OBDO_ALLOC call preceding line 2674,
 * cleanup and the final RETURN).
 */
2631 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2633         struct inode *inode = dentry->d_inode;
2634         struct ll_inode_info *lli = ll_i2info(inode);
2635         struct lov_stripe_md *lsm = lli->lli_smd;
2636         struct ptlrpc_request *req;
2637         struct obd_capa *oc;
2640         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2641                inode->i_generation, inode);
2642         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2644         /* fsync's caller has already called _fdata{sync,write}, we want
2645          * that IO to finish before calling the osc and mdc sync methods */
2646         rc = filemap_fdatawait(inode->i_mapping);
2648         /* catch async errors that were recorded back when async writeback
2649          * failed for pages in this mapping. */
/* Consume-and-clear: the stored async rc is reported exactly once. */
2650         err = lli->lli_async_rc;
2651         lli->lli_async_rc = 0;
2655                 err = lov_test_and_clear_async_rc(lsm);
/* Metadata sync on the MDS, authorized by the inode's MDS capability. */
2660         oc = ll_mdscapa_get(inode);
2661         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2667                 ptlrpc_req_finished(req);
/* NOTE(review): an OBDO_ALLOC(oa) presumably precedes this check. */
2674                         RETURN(rc ? rc : -ENOMEM);
2676                 oa->o_id = lsm->lsm_object_id;
2677                 oa->o_gr = lsm->lsm_object_gr;
2678                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2679                 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2680                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME |
/* Data sync on the OSTs over the whole object range [0, EOF). */
2683                 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2684                 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2685                                0, OBD_OBJECT_EOF, oc);
/*
 * flock()/fcntl() file locking entry point for -o flock mounts: maps the
 * VFS lock request onto an LDLM_FLOCK enqueue against the MDS, then
 * mirrors the result into the local lock tables so the VFS bookkeeping
 * stays consistent.
 *
 * NOTE(review): case labels of both switch statements and several RETURN
 * paths are missing from this extract; comments cover visible code only.
 */
2695 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2697         struct inode *inode = file->f_dentry->d_inode;
2698         struct ll_sb_info *sbi = ll_i2sbi(inode);
/* Resource id is derived from the file's FID (seq/oid/ver). */
2699         struct ldlm_res_id res_id =
2700                 { .name = { fid_seq(ll_inode2fid(inode)),
2701                             fid_oid(ll_inode2fid(inode)),
2702                             fid_ver(ll_inode2fid(inode)),
2704         struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
2705                 ldlm_flock_completion_ast, NULL, file_lock };
2706         struct lustre_handle lockh = {0};
2707         ldlm_policy_data_t flock;
2712         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2713                inode->i_ino, file_lock);
2715         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* flock(2) arrives without range/pid — fill in whole-file semantics. */
2717         if (file_lock->fl_flags & FL_FLOCK) {
2718                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2719                 /* set missing params for flock() calls */
2720                 file_lock->fl_end = OFFSET_MAX;
2721                 file_lock->fl_pid = current->tgid;
2723         flock.l_flock.pid = file_lock->fl_pid;
2724         flock.l_flock.start = file_lock->fl_start;
2725         flock.l_flock.end = file_lock->fl_end;
/* Map VFS lock type to LDLM mode (read->PR, write->PW, unlock->NL). */
2727         switch (file_lock->fl_type) {
2729                 einfo.ei_mode = LCK_PR;
2732                 /* An unlock request may or may not have any relation to
2733                  * existing locks so we may not be able to pass a lock handle
2734                  * via a normal ldlm_lock_cancel() request. The request may even
2735                  * unlock a byte range in the middle of an existing lock. In
2736                  * order to process an unlock request we need all of the same
2737                  * information that is given with a normal read or write record
2738                  * lock request. To avoid creating another ldlm unlock (cancel)
2739                  * message we'll treat a LCK_NL flock request as an unlock. */
2740                 einfo.ei_mode = LCK_NL;
2743                 einfo.ei_mode = LCK_PW;
2746                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* Non-blocking set (F_SETLK) must fail rather than wait on conflict. */
2761                 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK path: only test for conflicts, don't take the lock. */
2767                 flags = LDLM_FL_TEST_LOCK;
2768                 /* Save the old mode so that if the mode in the lock changes we
2769                  * can decrement the appropriate reader or writer refcount. */
2770                 file_lock->fl_type = einfo.ei_mode;
2773                 CERROR("unknown fcntl lock command: %d\n", cmd);
2777         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2778                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2779                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2781         rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &einfo, &res_id,
2782                               &flock, &flags, NULL, 0, NULL, &lockh, 0);
/* Mirror a successful grant (or unlock) into the local VFS lock lists. */
2783         if ((file_lock->fl_flags & FL_FLOCK) &&
2784             (rc == 0 || file_lock->fl_type == F_UNLCK))
2785                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2786 #ifdef HAVE_F_OP_FLOCK
2787         if ((file_lock->fl_flags & FL_POSIX) &&
2788             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2789             !(flags & LDLM_FL_TEST_LOCK))
2790                 posix_lock_file_wait(file, file_lock);
2796 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * Test (without taking a reference) whether this client already holds a
 * granted MDS inodebits lock covering the given \a bits on \a inode, in
 * any of the CR/CW/PR/PW modes.  LDLM_FL_TEST_LOCK makes the match
 * side-effect free.
 *
 * NOTE(review): the opening brace, fid/flags declarations and the
 * RETURN statements are missing from this extract.
 */
2803 int ll_have_md_lock(struct inode *inode, __u64 bits)
2805         struct lustre_handle lockh;
2806         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2814         fid = &ll_i2info(inode)->lli_fid;
2815         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2817         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2818         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2819                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
/*
 * Like ll_have_md_lock(), but actually takes a reference on a matching
 * granted MDS inodebits lock (no LDLM_FL_TEST_LOCK) and returns the
 * matched mode, handing the lock handle back through \a lockh.
 *
 * NOTE(review): the opening brace, fid/flags/rc declarations and the
 * final RETURN are missing from this extract.
 */
2825 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2826                             struct lustre_handle *lockh)
2828         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2834         fid = &ll_i2info(inode)->lli_fid;
2835         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2837         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2838         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2839                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * Common tail for revalidation: translate a getattr/intent result code.
 * -ENOENT on an already-open inode means the file was unlinked remotely;
 * that is treated as success (only nlink needs updating) rather than an
 * error.  Other failures are logged.
 *
 * NOTE(review): the nlink update, the success return and the error
 * return are on lines missing from this extract.
 */
2843 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2844         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2845                               * and return success */
2847                 /* This path cannot be hit for regular files unless in
2848                  * case of obscure races, so no need to to validate
/* Only non-regular, non-directory inodes are expected here. */
2850                 if (!S_ISREG(inode->i_mode) &&
2851                     !S_ISDIR(inode->i_mode))
2856                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * Revalidate a dentry/inode against the MDS.  Two strategies:
 *  - if the server supports OBD_CONNECT_ATTRFID, issue an IT_GETATTR
 *    intent lock by FID (no name lookup needed);
 *  - otherwise, if no UPDATE|LOOKUP inodebits lock is cached locally,
 *    do a plain md_getattr and refresh the inode from the reply.
 * Finally glimpse the file size from the OSTs unless no objects have
 * been allocated yet.
 *
 * NOTE(review): numerous lines are missing from this extract (opening
 * brace, rc declaration, several GOTO/RETURN paths, request handling
 * around the md_getattr reply); comments cover visible code only.
 */
2864 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2866         struct inode *inode = dentry->d_inode;
2867         struct ptlrpc_request *req = NULL;
2868         struct ll_sb_info *sbi;
2869         struct obd_export *exp;
/* Presumably a NULL-inode guard precedes this diagnostic. */
2874                 CERROR("REPORT THIS LINE TO PETER\n");
2877         sbi = ll_i2sbi(inode);
2879         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2880                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2882         exp = ll_i2mdexp(inode);
/* Strategy 1: getattr-by-FID via an intent lock. */
2884         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2885                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2886                 struct md_op_data *op_data;
2888                 /* Call getattr by fid, so do not provide name at all. */
2889                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2890                                              dentry->d_inode, NULL, 0, 0,
2891                                              LUSTRE_OPC_ANY, NULL);
2892                 if (IS_ERR(op_data))
2893                         RETURN(PTR_ERR(op_data));
/* O_CHECK_STALE asks the MDS to verify the inode is still current. */
2895                 oit.it_flags |= O_CHECK_STALE;
2896                 rc = md_intent_lock(exp, op_data, NULL, 0,
2897                                     /* we are not interested in name
2900                                     ll_md_blocking_ast, 0);
2901                 ll_finish_md_op_data(op_data);
2902                 oit.it_flags &= ~O_CHECK_STALE;
2904                         rc = ll_inode_revalidate_fini(inode, rc);
2908                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2910                         ll_intent_release(&oit);
2914                 /* Unlinked? Unhash dentry, so it is not picked up later by
2915                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2916                    here to preserve get_cwd functionality on 2.6.
2918                 if (!dentry->d_inode->i_nlink) {
2919                         spin_lock(&dcache_lock);
2920                         ll_drop_dentry(dentry);
2921                         spin_unlock(&dcache_lock);
2924                 ll_lookup_finish_locks(&oit, dentry);
/* Strategy 2: plain getattr, only when no covering DLM lock is cached. */
2925         } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE |
2926                                     MDS_INODELOCK_LOOKUP)) {
2927                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2928                 obd_valid valid = OBD_MD_FLGETATTR;
2929                 struct obd_capa *oc;
/* Regular files also need striping EA data sized to the MDS maximum. */
2932                 if (S_ISREG(inode->i_mode)) {
2933                         rc = ll_get_max_mdsize(sbi, &ealen);
2936                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2938                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2939                  * capa for this inode. Because we only keep capas of dirs
2941                 oc = ll_mdscapa_get(inode);
2942                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2946                         rc = ll_inode_revalidate_fini(inode, rc);
2950                 rc = ll_prep_inode(&inode, req, NULL);
2955         /* if object not yet allocated, don't validate size */
2956         if (ll_i2info(inode)->lli_smd == NULL)
2959         /* ll_glimpse_size will prefer locally cached writes if they extend
2961         rc = ll_glimpse_size(inode, 0);
2964         ptlrpc_req_finished(req);
/*
 * getattr with an explicit lookup intent: revalidate the inode against
 * the MDS first, then fill \a stat from the (now fresh) inode fields,
 * taking the inode size lock around size/blocks for consistency.
 *
 * NOTE(review): the opening brace, res/rc handling and the final return
 * are on lines missing from this extract.
 */
2968 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2969                   struct lookup_intent *it, struct kstat *stat)
2971         struct inode *inode = de->d_inode;
2974         res = ll_inode_revalidate_it(de, it);
2975         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2980         stat->dev = inode->i_sb->s_dev;
2981         stat->ino = inode->i_ino;
2982         stat->mode = inode->i_mode;
2983         stat->nlink = inode->i_nlink;
2984         stat->uid = inode->i_uid;
2985         stat->gid = inode->i_gid;
2986         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2987         stat->atime = inode->i_atime;
2988         stat->mtime = inode->i_mtime;
2989         stat->ctime = inode->i_ctime;
/* Older kernels expose i_blksize directly; newer derive it from bits. */
2990 #ifdef HAVE_INODE_BLKSIZE
2991         stat->blksize = inode->i_blksize;
2993         stat->blksize = 1 << inode->i_blkbits;
/* size/blocks are read under the size lock to get a coherent pair. */
2996         ll_inode_size_lock(inode, 0);
2997         stat->size = i_size_read(inode);
2998         stat->blocks = inode->i_blocks;
2999         ll_inode_size_unlock(inode, 0);
3003 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3005 struct lookup_intent it = { .it_op = IT_GETATTR };
3007 return ll_getattr_it(mnt, de, &it, stat);
/*
 * POSIX ACL permission check callback (used by generic_permission()):
 * duplicates the cached ACL under lli_lock and evaluates it against
 * \a mask.  Without CONFIG_FS_POSIX_ACL the (missing) fallback branch
 * presumably declines the check.
 *
 * NOTE(review): the opening brace, rc declaration, the NULL-acl path,
 * the RETURN and the #else branch are missing from this extract.
 */
3011 int lustre_check_acl(struct inode *inode, int mask)
3013 #ifdef CONFIG_FS_POSIX_ACL
3014         struct ll_inode_info *lli = ll_i2info(inode);
3015         struct posix_acl *acl;
/* Duplicate the cached ACL under the spinlock so it can be used after
 * the lock is dropped; posix_acl_release() drops that reference. */
3019         spin_lock(&lli->lli_lock);
3020         acl = posix_acl_dup(lli->lli_posix_acl);
3021         spin_unlock(&lli->lli_lock);
3026         rc = posix_acl_permission(inode, acl, mask);
3027         posix_acl_release(acl);
3035 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
/*
 * Permission check, 2.6.10+ kernels: remote-client mounts go through the
 * server-side permission check; otherwise defer to generic_permission()
 * with lustre_check_acl as the ACL callback.
 */
3036 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3038         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
3039                inode->i_ino, inode->i_generation, inode, mask);
3040         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3041                 return lustre_check_remote_perm(inode, mask);
3043         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3044         return generic_permission(inode, mask, lustre_check_acl);
/*
 * Pre-2.6.10 fallback: open-coded generic_permission() — owner bits,
 * ACL, group bits, other bits, then capability overrides.
 * NOTE(review): several lines (the other/group checks' returns, the
 * check_capabilities label, final return) are missing from this extract.
 */
3047 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3049         int mode = inode->i_mode;
3052         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
3053                inode->i_ino, inode->i_generation, inode, mask);
3055         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3056                 return lustre_check_remote_perm(inode, mask);
3058         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* Writes to read-only or immutable inodes are rejected up front. */
3060         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
3061             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
3063         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
/* Owner: check user bits; on mismatch fall back to the ACL. */
3065         if (current->fsuid == inode->i_uid) {
3068                 if (((mode >> 3) & mask & S_IRWXO) != mask)
3070                         rc = lustre_check_acl(inode, mask);
3074                                 goto check_capabilities;
3078         if (in_group_p(inode->i_gid))
3081         if ((mode & mask & S_IRWXO) == mask)
/* CAP_DAC_OVERRIDE: exec still requires some x bit or a directory. */
3085         if (!(mask & MAY_EXEC) ||
3086             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
3087                 if (capable(CAP_DAC_OVERRIDE))
3090         if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
3091             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
3098 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no ->flock/->lock entries, so the
 * kernel's built-in local locking applies (cluster-unaware). */
3099 struct file_operations ll_file_operations = {
3100         .read           = ll_file_read,
3101         .write          = ll_file_write,
3102         .ioctl          = ll_file_ioctl,
3103         .open           = ll_file_open,
3104         .release        = ll_file_release,
3105         .mmap           = ll_file_mmap,
3106         .llseek         = ll_file_seek,
3107         .sendfile       = ll_file_sendfile,
/* file_operations for -o flock mounts: cluster-coherent locking via
 * ll_file_flock; ->flock exists only on kernels with HAVE_F_OP_FLOCK,
 * otherwise ->lock covers both POSIX and flock-style requests. */
3111 struct file_operations ll_file_operations_flock = {
3112         .read           = ll_file_read,
3113         .write          = ll_file_write,
3114         .ioctl          = ll_file_ioctl,
3115         .open           = ll_file_open,
3116         .release        = ll_file_release,
3117         .mmap           = ll_file_mmap,
3118         .llseek         = ll_file_seek,
3119         .sendfile       = ll_file_sendfile,
3121 #ifdef HAVE_F_OP_FLOCK
3122         .flock          = ll_file_flock,
3124         .lock           = ll_file_flock
3127 /* These are for -o noflock - to return ENOSYS on flock calls */
/* file_operations for -o noflock mounts: lock entry points are wired to
 * ll_file_noflock, which reports locking as unsupported. */
3128 struct file_operations ll_file_operations_noflock = {
3129         .read           = ll_file_read,
3130         .write          = ll_file_write,
3131         .ioctl          = ll_file_ioctl,
3132         .open           = ll_file_open,
3133         .release        = ll_file_release,
3134         .mmap           = ll_file_mmap,
3135         .llseek         = ll_file_seek,
3136         .sendfile       = ll_file_sendfile,
3138 #ifdef HAVE_F_OP_FLOCK
3139         .flock          = ll_file_noflock,
3141         .lock           = ll_file_noflock
/* inode_operations for regular files: attribute get/set, permission
 * checking and extended attributes.  With the vendor VFS intent patches,
 * setattr_raw is provided in addition to the standard hooks. */
3144 struct inode_operations ll_file_inode_operations = {
3145 #ifdef HAVE_VFS_INTENT_PATCHES
3146         .setattr_raw    = ll_setattr_raw,
3148         .setattr        = ll_setattr,
3149         .truncate       = ll_truncate,
3150         .getattr        = ll_getattr,
3151         .permission     = ll_inode_permission,
3152         .setxattr       = ll_setxattr,
3153         .getxattr       = ll_getxattr,
3154         .listxattr      = ll_listxattr,
3155         .removexattr    = ll_removexattr,
3158 /* dynamic ioctl number support routins */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by a read/write semaphore.  Readers
 * (dispatch) take ioc_sem shared; register/unregister take it
 * exclusively. */
3159 static struct llioc_ctl_data {
3160         struct rw_semaphore     ioc_sem;
3161         struct list_head        ioc_head;
/* NOTE(review): the variable name and '= {' of the static initializer
 * are on a line missing from this extract; the initializer refers to
 * the singleton as 'llioc'. */
3163         __RWSEM_INITIALIZER(llioc.ioc_sem),
3164         CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl numbers it accepts.
 * iocd_cmd is a trailing variable-length array sized by iocd_count;
 * iocd_size records the full allocation size for OBD_FREE. */
3169         struct list_head        iocd_list;
3170         unsigned int            iocd_size;
3171         llioc_callback_t        iocd_cb;
3172         unsigned int            iocd_count;
3173         unsigned int            iocd_cmd[0];
/*
 * Register a dynamic ioctl handler: \a cb will be offered the \a count
 * ioctl numbers in \a cmd by ll_iocontrol_call().  Returns an opaque
 * cookie (the allocation itself) to pass to ll_iocontrol_unregister(),
 * or presumably NULL on bad arguments / allocation failure (the RETURN
 * lines are missing from this extract).
 */
3176 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3179         struct llioc_data *in_data = NULL;
/* Reject NULL callback/commands and out-of-range counts. */
3182         if (cb == NULL || cmd == NULL ||
3183             count > LLIOC_MAX_CMD || count < 0)
/* Allocation includes the trailing iocd_cmd[] array. */
3186         size = sizeof(*in_data) + count * sizeof(unsigned int);
3187         OBD_ALLOC(in_data, size);
3188         if (in_data == NULL)
3191         memset(in_data, 0, sizeof(*in_data));
3192         in_data->iocd_size = size;
3193         in_data->iocd_cb = cb;
3194         in_data->iocd_count = count;
3195         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the writer side of the registry semaphore. */
3197         down_write(&llioc.ioc_sem);
3198         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3199         up_write(&llioc.ioc_sem);
/*
 * Unregister a handler previously returned by ll_iocontrol_register():
 * find the entry whose address matches \a magic, unlink and free it.
 * Warns if the cookie is unknown.
 *
 * NOTE(review): the NULL-magic guard, the magic comparison and the
 * returns after up_write() are on lines missing from this extract.
 */
3204 void ll_iocontrol_unregister(void *magic)
3206         struct llioc_data *tmp;
3211         down_write(&llioc.ioc_sem);
3212         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Presumably guarded by 'if (tmp == magic)' on a missing line. */
3214                         unsigned int size = tmp->iocd_size;
3216                         list_del(&tmp->iocd_list);
3217                         up_write(&llioc.ioc_sem);
3219                         OBD_FREE(tmp, size);
3223         up_write(&llioc.ioc_sem);
3225         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3228 EXPORT_SYMBOL(ll_iocontrol_register);
3229 EXPORT_SYMBOL(ll_iocontrol_unregister);
3231 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3232 unsigned int cmd, unsigned long arg, int *rcp)
3234 enum llioc_iter ret = LLIOC_CONT;
3235 struct llioc_data *data;
3236 int rc = -EINVAL, i;
3238 down_read(&llioc.ioc_sem);
3239 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3240 for (i = 0; i < data->iocd_count; i++) {
3241 if (cmd != data->iocd_cmd[i])
3244 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3248 if (ret == LLIOC_STOP)
3251 up_read(&llioc.ioc_sem);