1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * This code is issued under the GNU General Public License.
7 * See the file COPYING in this distribution
9 * Copyright (C) 1992, 1993, 1994, 1995
10 * Remy Card (card@masi.ibp.fr)
11 * Laboratoire MASI - Institut Blaise Pascal
12 * Universite Pierre et Marie Curie (Paris VI)
16 * linux/fs/minix/file.c
18 * Copyright (C) 1991, 1992 Linus Torvalds
20 * ext2 fs regular file handling primitives
22 * 64-bit file support on 64-bit platforms by Jakub Jelinek
23 * (jj@sunsite.ms.mff.cuni.cz)
26 #define DEBUG_SUBSYSTEM S_LLITE
28 #include <linux/lustre_dlm.h>
29 #include <linux/lustre_lite.h>
30 #include <linux/obd_lov.h> /* for lov_mds_md_size() in lov_setstripe() */
31 #include <linux/random.h>
// Prototypes for attribute-setting helpers that are not defined in the
// visible part of this file. ll_inode_setattr() is called by
// ll_update_atime() further down with do_trunc = 0.
33 int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc);
34 extern int ll_setattr(struct dentry *de, struct iattr *attr);
// ll_mdc_open - allocate the per-open ll_file_data and open the inode on the
// MDS through mdc_open().
//
// mdc_conn:  connection handle handed straight to mdc_open().
// inode:     supplies the inode number and mode sent to the MDS.
// file:      must have no private_data yet (asserted); receives the new
//            ll_file_data in file->private_data.
// lmm, lmm_size: stripe metadata buffer and size forwarded unchanged to
//            mdc_open().
//
// The tail of the function poisons the handle cookie and frees fd; this is
// presumably the error exit - confirm against the full source.
36 static int ll_mdc_open(struct lustre_handle *mdc_conn, struct inode *inode,
37 struct file *file, struct lov_mds_md *lmm, int lmm_size)
39 struct ptlrpc_request *req = NULL;
40 struct ll_file_data *fd;
44 LASSERT(!file->private_data);
46 fd = kmem_cache_alloc(ll_file_data_slab, SLAB_KERNEL);
50 memset(fd, 0, sizeof(*fd));
// Seed the MDS handle with the file pointer and a random cookie. The check
// after mdc_open() compares addr against this seed to detect whether
// mdc_open() actually filled in the handle.
51 fd->fd_mdshandle.addr = (__u64)(unsigned long)file;
52 get_random_bytes(&fd->fd_mdshandle.cookie,
53 sizeof(fd->fd_mdshandle.cookie));
55 rc = mdc_open(mdc_conn, inode->i_ino, S_IFREG | inode->i_mode,
56 file->f_flags, lmm, lmm_size, &fd->fd_mdshandle, &req);
58 /* This is the "reply" refcount. */
59 ptlrpc_req_finished(req);
65 file->private_data = fd;
67 if (!fd->fd_mdshandle.addr ||
68 fd->fd_mdshandle.addr == (__u64)(unsigned long)file) {
69 CERROR("hmm, mdc_open didn't assign fd_mdshandle?\n");
70 /* XXX handle this how, abort or is it non-fatal? */
// The open has reached the MDS, so the delayed-create request flag is dropped.
73 file->f_flags &= ~O_LOV_DELAY_CREATE;
// Mark the handle dead before returning the structure to the slab.
77 fd->fd_mdshandle.cookie = DEAD_HANDLE_MAGIC;
78 kmem_cache_free(ll_file_data_slab, fd);
// ll_mdc_close - close the MDS open handle stored in file->private_data and
// release the ll_file_data.
//
// Visible sequence: mdc_close() is issued for fd->fd_mdshandle, a failure is
// logged, and the close request reference is dropped. The saved open request
// fd->fd_req is then handled under imp->imp_lock: with a transno the
// PTL_RPC_FL_REPLAY flag is cleared, without one our reference is dropped.
// Finally the handle cookie is poisoned, file->private_data is cleared and fd
// goes back to the slab.
//
// NOTE(review): the mdc_close() call uses ll_i2sbi(inode)->ll_mdc_conn; the
// mdc_conn parameter is not referenced in the visible lines.
// NOTE(review): fd and fd->fd_req are dereferenced in the imp initializer with
// no visible NULL check - callers presumably guarantee both; confirm.
83 static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode,
86 struct ll_file_data *fd = file->private_data;
87 struct ptlrpc_request *req = NULL;
89 struct obd_import *imp = fd->fd_req->rq_import;
92 /* Complete the open request and remove it from replay list */
93 DEBUG_REQ(D_HA, fd->fd_req, "matched open req %p", fd->fd_req);
94 rc = mdc_close(&ll_i2sbi(inode)->ll_mdc_conn, inode->i_ino,
95 inode->i_mode, &fd->fd_mdshandle, &req);
98 CERROR("inode %lu close failed: rc = %d\n", inode->i_ino, rc);
99 ptlrpc_req_finished(req);
101 spin_lock_irqsave(&imp->imp_lock, flags);
102 if (fd->fd_req->rq_transno) {
103 /* This caused an EA to be written, need to replay as a normal
104 * transaction now. Our reference is now effectively owned
105 * by the imp_replay_list, and we'll be committed just like
106 * other transno-having requests now.
108 fd->fd_req->rq_flags &= ~PTL_RPC_FL_REPLAY;
109 spin_unlock_irqrestore(&imp->imp_lock, flags);
111 /* No transno means that we can just drop our ref. */
112 spin_unlock_irqrestore(&imp->imp_lock, flags);
113 ptlrpc_req_finished(fd->fd_req);
115 fd->fd_mdshandle.cookie = DEAD_HANDLE_MAGIC;
116 file->private_data = NULL;
117 kmem_cache_free(ll_file_data_slab, fd);
// ll_osc_open - open the objects described by lsm through obd_open() and
// remember the resulting handle in the per-open data.
//
// conn:  OBD connection used for obd_open().
// inode: receives size and block count from the returned obdo; its
//        lli_open_count is incremented.
// file:  file->private_data must already hold the ll_file_data, because the
//        object handle is stored in its fd_osthandle.
// lsm:   stripe metadata; lsm_object_id becomes the obdo object id.
//
// The obdo (oa) allocation, error checks and release are not visible here.
122 static int ll_osc_open(struct lustre_handle *conn, struct inode *inode,
123 struct file *file, struct lov_stripe_md *lsm)
125 struct ll_file_data *fd;
133 oa->o_id = lsm->lsm_object_id;
134 oa->o_mode = S_IFREG;
135 oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
137 rc = obd_open(conn, oa, lsm);
141 obdo_to_inode(inode, oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
143 fd = file->private_data;
144 obd_oa2handle(&fd->fd_osthandle, oa);
146 atomic_inc(&ll_i2info(inode)->lli_open_count);
152 /* Caller must hold lli_open_sem to protect lli->lli_smd from changing and
153 * duplicate objects from being created. We only install lsm to lli_smd if
154 * the mdc open was successful (hence stored stripe MD on MDS), otherwise
155 * other nodes could try to create different objects for the same file.
// ll_create_open_obj - create the objects for inode, pack the resulting
// stripe metadata and do the MDS open with it.
//
// conn:  OBD connection used for create, pack, free and destroy calls.
// inode: its i_ino is used as the requested object id.
// file:  passed on to ll_mdc_open().
// lsm:   stripe metadata passed by address to obd_create(); ll_file_open()
//        passes NULL, ll_lov_setstripe() passes the one built by the ioctl.
//
// Visible flow: obd_create() -> obd_packmd() -> ll_mdc_open() ->
// obd_free_wiremd(). Failures of the pack or the MDS open jump to
// out_destroy, where obd_destroy() and obd_free_memmd() undo the create.
// The code that installs lsm into lli->lli_smd on success is not visible.
157 static int ll_create_open_obj(struct lustre_handle *conn, struct inode *inode,
158 struct file *file, struct lov_stripe_md *lsm)
160 struct ll_inode_info *lli = ll_i2info(inode);
161 struct lov_mds_md *lmm = NULL;
171 oa->o_mode = S_IFREG | 0600;
172 oa->o_id = inode->i_ino;
173 /* Keep these 0 for now, because chown/chgrp does not change the
174 * ownership on the OST, and we don't want to allow BA OST NFS
175 * users to access these objects by mistake.
179 oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE |
180 OBD_MD_FLUID | OBD_MD_FLGID;
182 rc = obd_create(conn, oa, &lsm);
184 CERROR("error creating objects for inode %lu: rc = %d\n",
189 LASSERT(lsm && lsm->lsm_object_id);
190 rc = obd_packmd(conn, &lmm, lsm);
192 GOTO(out_destroy, rc);
196 rc = ll_mdc_open(&ll_i2sbi(inode)->ll_mdc_conn,inode,file,lmm,lmm_size);
198 obd_free_wiremd(conn, &lmm);
200 /* If we couldn't complete mdc_open() and store the stripe MD on the
201 * MDS, we need to destroy the objects now or they will be leaked.
204 CERROR("error MDS opening %lu with delayed create: rc %d\n",
206 GOTO(out_destroy, rc);
216 obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
217 oa->o_id = lsm->lsm_object_id;
218 oa->o_valid |= OBD_MD_FLID;
219 err = obd_destroy(conn, oa, lsm);
220 obd_free_memmd(conn, &lsm);
222 CERROR("error uncreating inode %lu objects: rc %d\n",
227 /* Open a file, and (for the very first open) create objects on the OSTs at
228 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
229 * creation or open until ll_lov_setstripe() ioctl is called. We grab
230 * lli_open_sem to ensure no other process will create objects, send the
231 * stripe MD to the MDS, or try to destroy the objects if that fails.
233 * If we already have the stripe MD locally, we don't request it in
234 * mdc_open() by passing a lmm_size = 0.
236 * It is up to the application to ensure no other processes open this file
237 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
238 * used. We might be able to avoid races of that sort by getting lli_open_sem
239 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
240 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
// ll_file_open - open entry point: MDS open plus object open.
//
// Calls visible in this function:
//  - with O_LOV_DELAY_CREATE set, object creation is postponed (debug message
//    only; the return is not visible);
//  - under lli_open_sem, ll_create_open_obj() is called with a NULL lsm, or,
//    on the branch that logs that the stripe is already set, a plain
//    ll_mdc_open() is done after releasing the semaphore;
//  - a further ll_mdc_open() with no stripe MD requested (lmm NULL, size 0);
//  - ll_osc_open() on the stripe metadata, and ll_mdc_close() afterwards,
//    presumably to undo the MDS open when the object open fails - confirm.
//
// The conditions selecting between these branches are not visible here.
242 static int ll_file_open(struct inode *inode, struct file *file)
244 struct ll_sb_info *sbi = ll_i2sbi(inode);
245 struct ll_inode_info *lli = ll_i2info(inode);
246 struct lustre_handle *conn = ll_i2obdconn(inode);
247 struct lov_stripe_md *lsm;
253 if (file->f_flags & O_LOV_DELAY_CREATE) {
254 CDEBUG(D_INODE, "delaying object creation\n");
258 down(&lli->lli_open_sem);
260 rc = ll_create_open_obj(conn, inode, file, NULL);
261 up(&lli->lli_open_sem);
263 CERROR("stripe already set on ino %lu\n", inode->i_ino);
264 up(&lli->lli_open_sem);
265 rc = ll_mdc_open(&sbi->ll_mdc_conn, inode, file,NULL,0);
269 rc = ll_mdc_open(&sbi->ll_mdc_conn, inode, file, NULL, 0);
274 rc = ll_osc_open(conn, inode, file, lsm);
279 ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
// ll_size_lock - enqueue an extent lock from start up to OBD_OBJECT_EOF.
//
// inode: used to find the superblock info and passed as callback data.
// lsm:   stripe metadata of the object being locked.
// start: first offset covered by the lock.
// mode:  lock mode handed to obd_enqueue() (callers here use LCK_PW).
// lockh: receives the lock handle from obd_enqueue().
//
// When LL_SBI_NOLCK is set on the superblock the enqueue is bypassed; the
// value returned in that case is not visible in this section.
283 int ll_size_lock(struct inode *inode, struct lov_stripe_md *lsm, obd_off start,
284 int mode, struct lustre_handle *lockh)
286 struct ll_sb_info *sbi = ll_i2sbi(inode);
287 struct ldlm_extent extent;
291 /* XXX phil: can we do this? won't it screw the file size up? */
292 if (sbi->ll_flags & LL_SBI_NOLCK)
295 extent.start = start;
296 extent.end = OBD_OBJECT_EOF;
298 rc = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT, &extent,
299 sizeof(extent), mode, &flags, ll_lock_callback,
300 inode, sizeof(*inode), lockh);
// ll_size_unlock - release a lock taken by ll_size_lock().
//
// Cancels lockh in the given mode through obd_cancel() and logs any result
// other than ELDLM_OK. As in ll_size_lock(), the LL_SBI_NOLCK superblock flag
// bypasses the call; the value returned in that case is not visible here.
304 int ll_size_unlock(struct inode *inode, struct lov_stripe_md *lsm, int mode,
305 struct lustre_handle *lockh)
307 struct ll_sb_info *sbi = ll_i2sbi(inode);
311 /* XXX phil: can we do this? won't it screw the file size up? */
312 if (sbi->ll_flags & LL_SBI_NOLCK)
315 rc = obd_cancel(&sbi->ll_osc_conn, lsm, mode, lockh);
316 if (rc != ELDLM_OK) {
317 CERROR("lock cancel: %d\n", rc);
// ll_file_size - refresh inode size and block count from the object store.
//
// Builds a zeroed obdo for lsm->lsm_object_id, asks for id, type, size and
// blocks through obd_getattr() and copies size and blocks into the inode with
// obdo_to_inode().
//
// The ll_size_lock()/ll_size_unlock() calls and the lockh declaration appear
// to be commented out (see the XXX remarks), so the size is read without a
// size lock.
//
// NOTE(review): the CDEBUG passes inode->i_size for both %Lu fields; the
// second was possibly meant to be a different value - confirm.
324 int ll_file_size(struct inode *inode, struct lov_stripe_md *lsm)
326 struct ll_sb_info *sbi = ll_i2sbi(inode);
327 //struct lustre_handle lockh = { 0, 0 };
336 /* XXX do not yet need size lock - OST size always correct (sync write)
337 rc = ll_size_lock(inode, lsm, 0, LCK_PR, &lockh);
338 if (rc != ELDLM_OK) {
339 CERROR("lock enqueue: %d\n", rc);
344 memset(&oa, 0, sizeof oa);
345 oa.o_id = lsm->lsm_object_id;
347 oa.o_valid = OBD_MD_FLID|OBD_MD_FLTYPE|OBD_MD_FLSIZE|OBD_MD_FLBLOCKS;
348 rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm);
350 obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
351 CDEBUG(D_INODE, LPX64" size %Lu/%Lu\n",
352 lsm->lsm_object_id, inode->i_size, inode->i_size);
354 /* XXX do not need size lock, because OST size always correct (sync write)
355 err = ll_size_unlock(inode, lsm, LCK_PR, &lockh);
356 if (err != ELDLM_OK) {
357 CERROR("lock cancel: %d\n", err);
365 /* While this returns an error code, fput() the caller does not, so we need
366 * to make every effort to clean up all of our state here. Also, applications
367 * rarely check close errors and even if an error is returned they will not
368 * re-try the close call.
// ll_file_release - release entry point: close the object handle, close the
// MDS handle and, on the last close, cancel unused locks.
//
// Visible sequence:
//  1. A NULL file->private_data (no open after an mcreate) takes an early
//     exit whose action is not visible here.
//  2. obd_close() is called with an obdo carrying the object id and the
//     handle saved in fd->fd_osthandle; a failure is logged.
//  3. ll_mdc_close() closes the MDS side; its result goes to rc2.
//  4. If lli_open_count drops to zero, obd_cancel_unused() is called.
//
// NOTE(review): lsm is read from lli->lli_smd and dereferenced for the object
// id with no visible NULL check - presumably guaranteed once fd exists.
370 static int ll_file_release(struct inode *inode, struct file *file)
372 struct ll_file_data *fd;
374 struct ll_sb_info *sbi = ll_i2sbi(inode);
375 struct ll_inode_info *lli = ll_i2info(inode);
376 struct lov_stripe_md *lsm = lli->lli_smd;
381 fd = (struct ll_file_data *)file->private_data;
382 if (!fd) /* no process opened the file after an mcreate */
385 memset(&oa, 0, sizeof(oa));
386 oa.o_id = lsm->lsm_object_id;
388 oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID;
389 obd_handle2oa(&oa, &fd->fd_osthandle);
390 rc = obd_close(&sbi->ll_osc_conn, &oa, lsm);
392 CERROR("inode %lu object close failed: rc = %d\n",
395 rc2 = ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
399 if (atomic_dec_and_test(&lli->lli_open_count)) {
400 CDEBUG(D_INFO, "last close, cancelling unused locks\n");
401 rc2 = obd_cancel_unused(&sbi->ll_osc_conn, lsm, 0);
404 CERROR("obd_cancel_unused: %d\n", rc);
407 CDEBUG(D_INFO, "not last close, not cancelling unused locks\n");
// ll_remove_suid - strip setuid/setgid bits from inode->i_mode unless the
// caller has CAP_FSETID.
//
// The multiplication by (S_ISGID/S_IXGRP) moves the group-execute bit to the
// setgid bit position, so S_ISGID is only selected when S_IXGRP is set;
// S_ISUID is always selected. The selection is then masked with the bits
// actually present in i_mode, and whatever remains is cleared.
412 static inline void ll_remove_suid(struct inode *inode)
416 /* select S_ISGID if S_IXGRP is set, and always select S_ISUID */
417 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
419 /* were any of the selected setuid/setgid bits actually set? */
420 mode &= inode->i_mode;
421 if (mode && !capable(CAP_FSETID)) {
422 inode->i_mode &= ~mode;
423 // XXX careful here - we cannot change the size
// ll_update_atime - set the inode access time to CURRENT_TIME.
//
// Builds an attribute request with only ATTR_ATIME valid and hands it to
// ll_inode_setattr() with do_trunc = 0. Nothing is done when the access time
// already equals the new value, or when the inode is read-only or marked
// noatime. The return value of ll_inode_setattr() is ignored.
427 static void ll_update_atime(struct inode *inode)
431 attr.ia_atime = CURRENT_TIME;
432 attr.ia_valid = ATTR_ATIME;
434 if (inode->i_atime == attr.ia_atime) return;
435 if (IS_RDONLY(inode)) return;
436 if (IS_NOATIME(inode)) return;
438 /* ll_inode_setattr() sets inode->i_atime from attr.ia_atime */
439 ll_inode_setattr(inode, &attr, 0);
// ll_lock_callback - lock callback registered with every obd_enqueue() call
// in this file.
//
// lock:     the lock the callback is about.
// new:      lock descriptor; not referenced in the visible lines.
// data:     callback data; enqueue callers pass the inode.
// data_len: must equal sizeof(struct inode); the mismatch action is not
//           visible here.
// flag:     presumably the value switched on (the switch line is not
//           visible); cases LDLM_CB_BLOCKING and LDLM_CB_CANCELING are
//           handled.
//
// LDLM_CB_BLOCKING: turn the lock into a handle and cancel it through
// ldlm_cli_cancel(), logging a failure.
// LDLM_CB_CANCELING: drop all cached pages of the inode.
442 int ll_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
443 void *data, __u32 data_len, int flag)
445 struct inode *inode = data;
446 struct lustre_handle lockh = { 0, 0 };
450 if (data_len != sizeof(struct inode))
457 case LDLM_CB_BLOCKING:
458 ldlm_lock2handle(lock, &lockh);
459 rc = ldlm_cli_cancel(&lockh);
461 CERROR("ldlm_cli_cancel failed: %d\n", rc);
463 case LDLM_CB_CANCELING:
464 CDEBUG(D_INODE, "invalidating obdo/inode %lu\n", inode->i_ino);
465 /* FIXME: do something better than throwing away everything */
466 //down(&inode->i_sem);
467 ll_invalidate_inode_pages(inode);
// ll_file_read - read entry point: refresh the size, take a read extent
// lock, read through generic_file_read(), update atime, drop the lock.
//
// Visible sequence:
//  1. ll_file_size() refreshes inode size; a failure is logged.
//  2. Unless the per-file LL_FILE_IGNORE_LOCK flag or the superblock
//     LL_SBI_NOLCK flag is set, an LCK_PR extent lock is enqueued.
//  3. generic_file_read() does the transfer and ll_update_atime() is called.
//  4. Under the same flag test as step 2 the lock is cancelled.
//
// NOTE(review): extent.end is start + count, which is one past the last byte
// read if extent ends are inclusive - TODO confirm the intended convention.
477 static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
480 struct ll_file_data *fd = (struct ll_file_data *)filp->private_data;
481 struct inode *inode = filp->f_dentry->d_inode;
482 struct ll_sb_info *sbi = ll_i2sbi(inode);
483 struct lustre_handle lockh = { 0, 0 };
484 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
490 /* If we don't refresh the file size, generic_file_read may not even
492 retval = ll_file_size(inode, lsm);
494 CERROR("ll_file_size: "LPSZ"\n", retval);
498 if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) &&
499 !(sbi->ll_flags & LL_SBI_NOLCK)) {
500 struct ldlm_extent extent;
501 extent.start = *ppos;
502 extent.end = *ppos + count;
503 CDEBUG(D_INFO, "Locking inode %lu, start "LPU64" end "LPU64"\n",
504 inode->i_ino, extent.start, extent.end);
506 err = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT,
507 &extent, sizeof(extent), LCK_PR, &flags,
508 ll_lock_callback, inode, sizeof(*inode),
510 if (err != ELDLM_OK) {
511 CERROR("lock enqueue: err: %d\n", err);
516 CDEBUG(D_INFO, "Reading inode %lu, "LPSZ" bytes, offset %Ld\n",
517 inode->i_ino, count, *ppos);
518 retval = generic_file_read(filp, buf, count, ppos);
521 ll_update_atime(inode);
523 if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) &&
524 !(sbi->ll_flags & LL_SBI_NOLCK)) {
525 err = obd_cancel(&sbi->ll_osc_conn, lsm, LCK_PR, &lockh);
526 if (err != ELDLM_OK) {
527 CERROR("lock cancel: err: %d\n", err);
536 * Write to a file (through the page cache).
// ll_file_write - write entry point: optional append handling, write extent
// lock, write through generic_file_write(), unlock.
//
// Visible sequence:
//  1. For O_APPEND on a non-block inode a size lock (LCK_PW, from offset 0)
//     is taken with ll_size_lock() and the object attributes are fetched
//     with obd_getattr() and copied into the inode, so the append offset is
//     based on the current size.
//  2. Unless the per-file LL_FILE_IGNORE_LOCK flag or the superblock
//     LL_SBI_NOLCK flag is set, an LCK_PW extent lock is enqueued.
//  3. generic_file_write() does the transfer.
//  4. The extent lock is cancelled under exactly the same flag test as in
//     step 2.
//  5. The O_APPEND size lock is released with ll_size_unlock().
//
// Fix: the cancel test used to read
//     !(fd_flags & LL_FILE_IGNORE_LOCK) || (ll_flags & LL_SBI_NOLCK)
// which does not match the enqueue test. With LL_SBI_NOLCK set no lock was
// enqueued, yet obd_cancel() was still called on the zero-initialised lockh.
// The test now mirrors the enqueue test, as ll_file_read() already does.
539 ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
541 struct ll_file_data *fd = (struct ll_file_data *)file->private_data;
542 struct inode *inode = file->f_dentry->d_inode;
543 struct ll_sb_info *sbi = ll_i2sbi(inode);
544 struct lustre_handle lockh = { 0, 0 }, eof_lockh = { 0, 0 };
545 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
551 if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) {
558 err = ll_size_lock(inode, lsm, 0, LCK_PW, &eof_lockh);
564 oa->o_id = lsm->lsm_object_id;
565 oa->o_mode = inode->i_mode;
566 oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
568 obd_handle2oa(oa, &fd->fd_osthandle);
569 retval = obd_getattr(&sbi->ll_osc_conn, oa, lsm);
572 GOTO(out_eof, retval);
576 obdo_to_inode(inode, oa, oa->o_valid);
580 if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) &&
581 !(sbi->ll_flags & LL_SBI_NOLCK)) {
582 struct ldlm_extent extent;
583 extent.start = *ppos;
584 extent.end = *ppos + count;
585 CDEBUG(D_INFO, "Locking inode %lu, start "LPU64" end "LPU64"\n",
586 inode->i_ino, extent.start, extent.end);
588 err = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT,
589 &extent, sizeof(extent), LCK_PW, &flags,
590 ll_lock_callback, inode, sizeof(*inode),
592 if (err != ELDLM_OK) {
593 CERROR("lock enqueue: err: %d\n", err);
594 GOTO(out_eof, retval = err);
598 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
599 inode->i_ino, count, *ppos);
601 retval = generic_file_write(file, buf, count, ppos);
// Cancel only when the lock was actually enqueued above (same condition).
603 if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) &&
604 !(sbi->ll_flags & LL_SBI_NOLCK)) {
605 err = obd_cancel(&sbi->ll_osc_conn, lsm, LCK_PW, &lockh);
606 if (err != ELDLM_OK) {
607 CERROR("lock cancel: err: %d\n", err);
608 GOTO(out_eof, retval = err);
614 if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) {
615 err = ll_size_unlock(inode, lsm, LCK_PW, &eof_lockh);
// ll_lov_setstripe - handler for the LL_IOC_LOV_SETSTRIPE ioctl.
//
// Runs under lli_open_sem. Visible branches:
//  - stripe already set: release the semaphore, log the condition and, if
//    the file still carries O_LOV_DELAY_CREATE (open not done yet), perform
//    the postponed ll_file_open();
//  - otherwise: let obd_iocontrol() build lsm from the user argument, create
//    and MDS-open the objects with ll_create_open_obj(), release the
//    semaphore, free the in-memory lsm with obd_free_memmd() and finally
//    open the objects with ll_osc_open() using lli->lli_smd.
//
// The test that detects an existing stripe and the error checks between the
// steps are not visible here.
623 static int ll_lov_setstripe(struct inode *inode, struct file *file,
626 struct ll_inode_info *lli = ll_i2info(inode);
627 struct lustre_handle *conn;
628 struct lov_stripe_md *lsm;
632 down(&lli->lli_open_sem);
635 up(&lli->lli_open_sem);
636 CERROR("stripe already set for ino %lu\n", inode->i_ino);
637 /* If we haven't already done the open, do so now */
638 if (file->f_flags & O_LOV_DELAY_CREATE) {
639 int rc2 = ll_file_open(inode, file);
647 conn = ll_i2obdconn(inode);
649 rc = obd_iocontrol(LL_IOC_LOV_SETSTRIPE, conn, 0, &lsm, (void *)arg);
651 rc = ll_create_open_obj(conn, inode, file, lsm);
652 up(&lli->lli_open_sem);
655 obd_free_memmd(conn, &lsm);
658 rc = ll_osc_open(conn, inode, file, lli->lli_smd);
// ll_lov_getstripe - handler for the LL_IOC_LOV_GETSTRIPE ioctl.
//
// Forwards the stripe metadata of the inode (lli_smd) and the user argument
// to obd_iocontrol() and returns its result. Any check between the
// declarations and the return (for example for a missing lsm) is not
// visible here.
662 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
664 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
665 struct lustre_handle *conn = ll_i2obdconn(inode);
670 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, conn, 0, lsm, (void *)arg);
// ll_file_ioctl - ioctl entry point for regular files.
//
// Commands handled in the visible lines:
//  LL_IOC_GETFLAGS       copy fd->fd_flags to user space with put_user().
//  LL_IOC_SETFLAGS /
//  LL_IOC_CLRFLAGS       read a flag word with get_user() and OR it into,
//                        or mask it out of, fd->fd_flags.
//  LL_IOC_LOV_SETSTRIPE  delegate to ll_lov_setstripe().
//  LL_IOC_LOV_GETSTRIPE  delegate to ll_lov_getstripe().
//  anything else         pass cmd and arg to obd_iocontrol() unchanged.
//
// The EXT2_IOC_* case labels appear to sit inside a comment (disabled) -
// confirm against the full source.
// NOTE(review): fd comes from file->private_data and is dereferenced by the
// flag commands with no visible NULL check.
673 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
676 struct ll_file_data *fd = (struct ll_file_data *)file->private_data;
677 struct lustre_handle *conn;
681 case LL_IOC_GETFLAGS:
682 /* Get the current value of the file flags */
683 return put_user(fd->fd_flags, (int *)arg);
684 case LL_IOC_SETFLAGS:
685 case LL_IOC_CLRFLAGS:
686 /* Set or clear specific file flags */
687 /* XXX This probably needs checks to ensure the flags are
688 * not abused, and to handle any flag side effects.
690 if (get_user(flags, (int *) arg))
693 if (cmd == LL_IOC_SETFLAGS)
694 fd->fd_flags |= flags;
696 fd->fd_flags &= ~flags;
698 case LL_IOC_LOV_SETSTRIPE:
699 return ll_lov_setstripe(inode, file, arg);
700 case LL_IOC_LOV_GETSTRIPE:
701 return ll_lov_getstripe(inode, arg);
703 /* We need to special case any other ioctls we want to handle,
704 * to send them to the MDS/OST as appropriate and to properly
705 * network encode the arg field.
706 case EXT2_IOC_GETFLAGS:
707 case EXT2_IOC_SETFLAGS:
708 case EXT2_IOC_GETVERSION_OLD:
709 case EXT2_IOC_GETVERSION_NEW:
710 case EXT2_IOC_SETVERSION_OLD:
711 case EXT2_IOC_SETVERSION_NEW:
714 conn = ll_i2obdconn(inode);
715 return obd_iocontrol(cmd, conn, 0, NULL, (void *)arg);
// ll_file_seek - llseek entry point.
//
// Two adjustments of offset are visible: one adds inode->i_size after the
// size has been refreshed with ll_file_size(), the other adds file->f_pos.
// They are presumably the end-relative and current-relative cases of origin;
// the switch itself is not visible. The resulting offset is accepted only
// when it lies in [0, s_maxbytes]; if it differs from f_pos the position is
// updated, and on kernels older than 2.5.0 f_version is bumped as well.
719 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
721 struct inode *inode = file->f_dentry->d_inode;
727 struct ll_inode_info *lli = ll_i2info(inode);
729 retval = ll_file_size(inode, lli->lli_smd);
733 offset += inode->i_size;
737 offset += file->f_pos;
740 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
741 if (offset != file->f_pos) {
742 file->f_pos = offset;
743 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
746 file->f_version = ++event;
753 /* XXX this does not need to do anything for data, it _does_ need to
// ll_fsync - takes the file, its dentry and a data flag; presumably the
// fsync file operation. The body is not visible in this section.
755 int ll_fsync(struct file *file, struct dentry *dentry, int data)
// ll_inode_revalidate - refresh inode attributes from the MDS and the file
// size from the object store.
//
// Visible sequence:
//  1. An error message is logged for an unexpected condition (presumably a
//     dentry without inode; the test is not visible).
//  2. If ll_have_md_lock() reports no metadata lock for the dentry,
//     mdc_getattr() is issued. For regular files the request also asks for
//     the stripe EA size (OBD_MD_FLEASIZE), with datalen taken from
//     obd_size_wiremd(). On failure the error is logged and the request
//     released; on success the reply body is applied with ll_update_inode()
//     and the request released.
//  3. If the inode has stripe metadata, the result of ll_file_size() is
//     returned; without it (objects not yet allocated) size validation is
//     skipped.
760 static int ll_inode_revalidate(struct dentry *dentry)
762 struct inode *inode = dentry->d_inode;
763 struct lov_stripe_md *lsm;
767 CERROR("REPORT THIS LINE TO PETER\n");
771 if (!ll_have_md_lock(dentry)) {
772 struct ptlrpc_request *req = NULL;
773 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
774 struct mds_body *body;
775 unsigned long valid = 0;
779 if (S_ISREG(inode->i_mode)) {
780 datalen = obd_size_wiremd(&sbi->ll_osc_conn, NULL);
781 valid |= OBD_MD_FLEASIZE;
783 rc = mdc_getattr(&sbi->ll_mdc_conn, inode->i_ino,
784 inode->i_mode, valid, datalen, &req);
786 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
787 ptlrpc_req_finished(req);
791 body = lustre_msg_buf(req->rq_repmsg, 0);
792 ll_update_inode(inode, body);
793 ptlrpc_req_finished(req);
796 lsm = ll_i2info(inode)->lli_smd;
797 if (!lsm) /* object not yet allocated, don't validate size */
800 RETURN(ll_file_size(inode, lsm));
// ll_getattr - only built for kernels newer than 2.5.0; a thin wrapper that
// revalidates the dentry through ll_inode_revalidate() and returns its
// result. The mnt argument is not used in the visible lines.
803 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
804 static int ll_getattr(struct vfsmount *mnt, struct dentry *de,
807 return ll_inode_revalidate(de);
// ll_file_operations - file operation table wiring the handlers defined
// above (write, ioctl, release, llseek) plus generic_file_mmap for mmap.
// Uses the GNU field: value initializer form. Further entries may exist on
// lines not visible in this section.
811 struct file_operations ll_file_operations = {
813 write: ll_file_write,
814 ioctl: ll_file_ioctl,
816 release: ll_file_release,
817 mmap: generic_file_mmap,
818 llseek: ll_file_seek,
822 struct inode_operations ll_file_inode_operations = {
824 truncate: ll_truncate,
825 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
828 revalidate: ll_inode_revalidate,