1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <linux/pagemap.h>
29 #include <linux/file.h>
30 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
31 #include <linux/lustre_compat25.h>
33 #include "llite_internal.h"
35 /* also used by llite/special.c:ll_special_open() */
/* Allocate a per-open-file ll_file_data from the ll_file_data_slab cache.
 * Also used by llite/special.c:ll_special_open().
 * NOTE(review): lines are missing from this view (no braces or RETURN
 * visible); allocation-failure handling cannot be confirmed from here. */
36 struct ll_file_data *ll_file_data_get(void)
38 struct ll_file_data *fd;
40 OBD_SLAB_ALLOC(fd, ll_file_data_slab, SLAB_KERNEL, sizeof *fd);
/* Release an ll_file_data back to the ll_file_data_slab cache.
 * Counterpart of ll_file_data_get(). */
44 static void ll_file_data_put(struct ll_file_data *fd)
47 OBD_SLAB_FREE(fd, ll_file_data_slab, sizeof *fd);
/* Close one MDS open handle for @inode: fill an obdo with the inode's
 * id/attributes, send mdc_close(), destroy any OST objects listed in the
 * close reply (ll_objects_destroy), and clear the open-replay data on @och.
 * On forced umount (obd->obd_no_recov) mdc_close() is skipped, since the
 * import is already deactivated and the RPC could not succeed.
 * NOTE(review): this view is missing lines (oa allocation, locals, error
 * labels, RETURN); comments cover only the visible code. */
50 static int ll_close_inode_openhandle(struct inode *inode,
51 struct obd_client_handle *och)
53 struct ptlrpc_request *req = NULL;
54 struct obd_device *obd;
59 obd = class_exp2obd(ll_i2mdcexp(inode));
61 CERROR("Invalid MDC connection handle "LPX64"\n",
62 ll_i2mdcexp(inode)->exp_handle.h_cookie);
67 * here we check if this is forced umount. If so this is called on
68 * canceling "open lock" and we do not call mdc_close() in this case, as
69 * it will not be successful, as import is already deactivated.
71 if (obd->obd_no_recov)
76 RETURN(-ENOMEM); // XXX We leak openhandle and request here.
78 oa->o_id = inode->i_ino;
79 oa->o_valid = OBD_MD_FLID;
/* copy type/mode/size/blocks/times from the VFS inode into the obdo */
80 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
81 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
82 OBD_MD_FLATIME | OBD_MD_FLMTIME |
/* dirty-inode path is compiled out; kept as documentation of intent */
84 if (0 /* ll_is_inode_dirty(inode) */) {
85 oa->o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
86 oa->o_valid |= OBD_MD_FLFLAGS;
89 rc = mdc_close(ll_i2mdcexp(inode), oa, och, &req);
91 /* We are the last writer, so the MDS has instructed us to get
92 * the file size and any write cookies, then close again. */
93 //ll_queue_done_writing(inode);
96 CERROR("inode %lu mdc close failed: rc = %d\n",
/* destroy objects the MDS told us are orphaned by this close */
103 rc = ll_objects_destroy(req, inode);
105 CERROR("inode %lu ll_objects destroy: rc = %d\n",
109 ptlrpc_req_finished(req); /* This is close request */
112 mdc_clear_open_replay_data(och);
/* Really close the cached MDS open handle of the mode selected by @flags
 * (write / exec / read) for @inode, but only if no file descriptors still
 * use it (och_usecount reaches zero).  The handle pointer and its use count
 * are protected by lli_och_sem.
 * NOTE(review): lines are missing from this view (the code that snapshots
 * and NULLs *och_p under the semaphore is not visible). */
117 int ll_mdc_real_close(struct inode *inode, int flags)
119 struct ll_inode_info *lli = ll_i2info(inode);
121 struct obd_client_handle **och_p;
122 struct obd_client_handle *och;
/* pick the handle slot and use count matching the open mode */
127 if (flags & FMODE_WRITE) {
128 och_p = &lli->lli_mds_write_och;
129 och_usecount = &lli->lli_open_fd_write_count;
130 } else if (flags & FMODE_EXEC) {
131 och_p = &lli->lli_mds_exec_och;
132 och_usecount = &lli->lli_open_fd_exec_count;
134 LASSERT(flags & FMODE_READ);
135 och_p = &lli->lli_mds_read_och;
136 och_usecount = &lli->lli_open_fd_read_count;
139 down(&lli->lli_och_sem);
140 if (*och_usecount) { /* There are still users of this handle, so
142 up(&lli->lli_och_sem);
147 up(&lli->lli_och_sem);
149 if (och) { /* There might be a race and somebody have freed this och
151 rc = ll_close_inode_openhandle(inode, och);
/* poison the cookie so stale uses of the freed handle are detectable */
152 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
153 OBD_FREE(och, sizeof *och);
/* Per-file-descriptor close path: drop a group extent lock if this fd held
 * one, decrement the per-mode MDS open-handle use count, and if no OPEN DLM
 * lock still covers the file, do the real MDS close.  Finally detach and
 * free the ll_file_data.
 * NOTE(review): lines are missing from this view (lockmode setup, the
 * group-lock handle argument, RETURN); comments cover visible code only. */
159 int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
162 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
163 struct ll_inode_info *lli = ll_i2info(inode);
167 /* clear group lock, if present */
168 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
169 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
170 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
171 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
175 /* Let's see if we have good enough OPEN lock on the file and if
176 we can skip talking to MDS */
177 if (file->f_dentry->d_inode) { /* Can this ever be false? */
179 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
180 struct lustre_handle lockh;
181 struct inode *inode = file->f_dentry->d_inode;
182 struct ldlm_res_id file_res_id = {.name={inode->i_ino,
183 inode->i_generation}};
184 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* lli_och_sem guards the per-mode open-fd counters */
186 down(&lli->lli_och_sem);
187 if (fd->fd_omode & FMODE_WRITE) {
189 LASSERT(lli->lli_open_fd_write_count);
190 lli->lli_open_fd_write_count--;
191 } else if (fd->fd_omode & FMODE_EXEC) {
193 LASSERT(lli->lli_open_fd_exec_count);
194 lli->lli_open_fd_exec_count--;
197 LASSERT(lli->lli_open_fd_read_count);
198 lli->lli_open_fd_read_count--;
200 up(&lli->lli_och_sem);
/* no cached OPEN ibits lock -> must close the MDS handle for real */
202 if (!ldlm_lock_match(mdc_exp->exp_obd->obd_namespace, flags,
203 &file_res_id, LDLM_IBITS, &policy,lockmode,
205 rc = ll_mdc_real_close(file->f_dentry->d_inode,
209 CERROR("Releasing a file %p with negative dentry %p. Name %s",
210 file, file->f_dentry, file->f_dentry->d_name.name);
213 LUSTRE_FPRIVATE(file) = NULL;
214 ll_file_data_put(fd);
219 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
221 /* While this returns an error code, fput() the caller does not, so we need
222 * to make every effort to clean up all of our state here. Also, applications
223 * rarely check close errors and even if an error is returned they will not
224 * re-try the close call.
/* VFS ->release() for regular files: account the operation, clear any
 * pending async write error on the stripes, and close the MDS handle via
 * ll_mdc_close().  The root dentry is explicitly exempted.
 * NOTE(review): lines are missing from this view (fd NULL check, use of
 * lli_async_rc, RETURN); comments cover visible code only. */
226 int ll_file_release(struct inode *inode, struct file *file)
228 struct ll_file_data *fd;
229 struct ll_sb_info *sbi = ll_i2sbi(inode);
230 struct ll_inode_info *lli = ll_i2info(inode);
231 struct lov_stripe_md *lsm = lli->lli_smd;
235 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
236 inode->i_generation, inode);
237 ll_vfs_ops_tally(sbi, VFS_OPS_RELEASE);
239 /* don't do anything for / */
240 if (inode->i_sb->s_root == file->f_dentry)
243 lprocfs_counter_incr(sbi->ll_stats, LPROC_LL_RELEASE);
245 fd = LUSTRE_FPRIVATE(file);
/* drop any stored async write error; close errors are rarely checked */
249 lov_test_and_clear_async_rc(lsm);
250 lli->lli_async_rc = 0;
252 rc = ll_mdc_close(sbi->ll_mdc_exp, inode, file);
/* Send an explicit IT_OPEN intent to the MDS for @file (used when no
 * disposition was cached by lookup, e.g. the NFSD path).  Requests an OPEN
 * DLM lock unless @lmm/@lmmsize indicate we are only setting stripe info.
 * On success, binds the returned lock to the inode and preps the inode
 * from the reply.
 * NOTE(review): lines are missing from this view (locals, error labels,
 * RETURN); comments cover visible code only. */
256 static int ll_intent_file_open(struct file *file, void *lmm,
257 int lmmsize, struct lookup_intent *itp)
259 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
260 struct mdc_op_data data;
261 struct dentry *parent = file->f_dentry->d_parent;
262 const char *name = file->f_dentry->d_name.name;
263 const int len = file->f_dentry->d_name.len;
264 struct inode *inode = file->f_dentry->d_inode;
265 struct ptlrpc_request *req;
271 ll_prepare_mdc_op_data(&data, parent->d_inode, inode, name, len, O_RDWR);
273 /* Usually we come here only for NFSD, and we want open lock.
274 But we can also get here with pre 2.6.15 patchless kernels, and in
275 that case that lock is also ok */
276 /* We can also get here if there was cached open handle in revalidate_it
277 * but it disappeared while we were getting from there to ll_file_open.
278 * But this means this file was closed and immediately opened which
279 * makes a good candidate for using OPEN lock */
280 /* If lmmsize & lmm are not 0, we are just setting stripe info
281 * parameters. No need for the open lock */
282 if (!lmm && !lmmsize)
283 itp->it_flags |= MDS_OPEN_LOCK;
285 rc = mdc_intent_lock(sbi->ll_mdc_exp, &data, lmm, lmmsize, itp,
286 0 /*unused */, &req, ll_mdc_blocking_ast, 0);
288 /* reason to keep our own exit path - don't flood the log
289 * with messages with -ESTALE errors.
291 if (!it_disposition(itp, DISP_OPEN_OPEN))
293 ll_release_openhandle(file->f_dentry, itp);
298 CERROR("lock enqueue: err: %d\n", rc);
/* attach the granted lock (if any) to the inode for later matching */
302 if (itp->d.lustre.it_lock_mode)
303 mdc_set_lock_data(&itp->d.lustre.it_lock_handle,
306 rc = ll_prep_inode(sbi->ll_osc_exp, &file->f_dentry->d_inode,
307 req, DLM_REPLY_REC_OFF, NULL);
309 ptlrpc_req_finished(itp->d.lustre.it_data);
312 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
313 ll_intent_drop_lock(itp);
/* Fill an obd_client_handle from the MDS open reply carried in @it:
 * copy the file handle, set the magic, record the io epoch on the inode,
 * and register the handle for open replay. */
319 static void ll_och_fill(struct ll_inode_info *lli, struct lookup_intent *it,
320 struct obd_client_handle *och)
322 struct ptlrpc_request *req = it->d.lustre.it_data;
323 struct mds_body *body;
327 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
328 LASSERT(body != NULL); /* reply already checked out */
329 LASSERT_REPSWABBED(req, DLM_REPLY_REC_OFF); /* and swabbed in mdc_enqueue */
331 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
332 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
333 lli->lli_io_epoch = body->io_epoch;
/* needed so the open can be replayed after MDS recovery */
335 mdc_set_open_replay_data(och, it->d.lustre.it_data);
/* Finish the client-local part of an open: optionally fill @och from the
 * intent reply, attach @fd as the file's private data, initialize
 * readahead state, and record the open mode.
 * NOTE(review): lines are missing from this view (the condition guarding
 * ll_och_fill when och may be NULL, and the return). */
338 int ll_local_open(struct file *file, struct lookup_intent *it,
339 struct ll_file_data *fd, struct obd_client_handle *och)
343 LASSERT(!LUSTRE_FPRIVATE(file));
348 ll_och_fill(ll_i2info(file->f_dentry->d_inode), it, och);
349 LUSTRE_FPRIVATE(file) = fd;
350 ll_readahead_init(file->f_dentry->d_inode, &fd->fd_ras);
351 fd->fd_omode = it->it_flags;
356 /* Open a file, and (for the very first open) create objects on the OSTs at
357 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
358 * creation or open until ll_lov_setstripe() ioctl is called. We grab
359 * lli_open_sem to ensure no other process will create objects, send the
360 * stripe MD to the MDS, or try to destroy the objects if that fails.
362 * If we already have the stripe MD locally then we don't request it in
363 * mdc_open(), by passing a lmm_size = 0.
365 * It is up to the application to ensure no other processes open this file
366 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
367 * used. We might be able to avoid races of that sort by getting lli_open_sem
368 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
369 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open().  Reuses a cached per-mode MDS open handle when one exists;
 * otherwise allocates one, sends an IT_OPEN intent if lookup did not leave
 * a disposition, and completes the local open.  See the larger design
 * comment above this function in the original file.
 * NOTE(review): many lines are missing from this view (och_usecount
 * declaration, several branches, GOTO targets, RETURN); comments cover
 * visible code only. */
371 int ll_file_open(struct inode *inode, struct file *file)
373 struct ll_inode_info *lli = ll_i2info(inode);
374 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
375 .it_flags = file->f_flags };
376 struct lov_stripe_md *lsm;
377 struct ptlrpc_request *req = NULL;
378 struct obd_client_handle **och_p;
380 struct ll_file_data *fd;
384 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
385 inode->i_generation, inode, file->f_flags);
386 ll_vfs_ops_tally(ll_i2sbi(inode), VFS_OPS_OPEN);
388 /* don't do anything for / */
389 if (inode->i_sb->s_root == file->f_dentry)
392 #ifdef LUSTRE_KERNEL_VERSION
/* intent left behind by the lookup path, if any */
395 it = file->private_data; /* XXX: compat macro */
396 file->private_data = NULL; /* prevent ll_local_open assertion */
399 fd = ll_file_data_get();
/* no cached intent/disposition: build our own IT_OPEN in 'oit' */
403 if (!it || !it->d.lustre.it_disposition) {
404 /* Convert f_flags into access mode. We cannot use file->f_mode,
405 * because everything but O_ACCMODE mask was stripped from it */
406 if ((oit.it_flags + 1) & O_ACCMODE)
408 if (file->f_flags & O_TRUNC)
409 oit.it_flags |= FMODE_WRITE;
411 /* kernel only call f_op->open in dentry_open. filp_open calls
412 * dentry_open after call to open_namei that checks permissions.
413 * Only nfsd_open call dentry_open directly without checking
414 * permissions and because of that this code below is safe. */
415 if (oit.it_flags & FMODE_WRITE)
416 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
418 /* We do not want O_EXCL here, presumably we opened the file
419 * already? XXX - NFS implications? */
420 oit.it_flags &= ~O_EXCL;
425 /* Let's see if we have file open on MDS already. */
426 if (it->it_flags & FMODE_WRITE) {
427 och_p = &lli->lli_mds_write_och;
428 och_usecount = &lli->lli_open_fd_write_count;
429 } else if (it->it_flags & FMODE_EXEC) {
430 och_p = &lli->lli_mds_exec_och;
431 och_usecount = &lli->lli_open_fd_exec_count;
433 och_p = &lli->lli_mds_read_och;
434 och_usecount = &lli->lli_open_fd_read_count;
437 LASSERTF(it->it_flags != 0, "f_it %p dist %d \n", file->f_it,
438 file->f_it->d.lustre.it_disposition);
440 down(&lli->lli_och_sem);
441 if (*och_p) { /* Open handle is present */
442 if (it_disposition(it, DISP_OPEN_OPEN)) {
443 /* Well, there's extra open request that we do not need,
444 let's close it somehow. This will decref request. */
445 ll_release_openhandle(file->f_dentry, it);
/* reuse existing handle: local open only, no MDS traffic */
449 rc = ll_local_open(file, it, fd, NULL);
451 LASSERTF(rc == 0, "rc = %d\n", rc);
453 LASSERT(*och_usecount == 0);
454 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
456 ll_file_data_put(fd);
457 GOTO(out_och_free, rc = -ENOMEM);
460 if (!it->d.lustre.it_disposition) {
461 rc = ll_intent_file_open(file, NULL, 0, it);
463 ll_file_data_put(fd);
464 GOTO(out_och_free, rc);
467 /* Got some error? Release the request */
468 if (it->d.lustre.it_status < 0) {
469 req = it->d.lustre.it_data;
470 ptlrpc_req_finished(req);
472 mdc_set_lock_data(&it->d.lustre.it_lock_handle,
473 file->f_dentry->d_inode);
475 req = it->d.lustre.it_data;
477 /* mdc_intent_lock() didn't get a request ref if there was an
478 * open error, so don't do cleanup on the request here
480 /* XXX (green): Should not we bail out on any error here, not
481 * just open error? */
482 rc = it_open_error(DISP_OPEN_OPEN, it);
484 ll_file_data_put(fd);
485 GOTO(out_och_free, rc);
488 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN);
489 rc = ll_local_open(file, it, fd, *och_p);
490 LASSERTF(rc == 0, "rc = %d\n", rc);
492 up(&lli->lli_och_sem);
494 /* Must do this outside lli_och_sem lock to prevent deadlock where
495 different kind of OPEN lock for this same inode gets cancelled
496 by ldlm_cancel_lru */
497 if (!S_ISREG(inode->i_mode))
/* O_LOV_DELAY_CREATE or read-only open: OST objects not created yet */
502 if (file->f_flags & O_LOV_DELAY_CREATE ||
503 !(file->f_mode & FMODE_WRITE)) {
504 CDEBUG(D_INODE, "object creation was delayed\n");
508 file->f_flags &= ~O_LOV_DELAY_CREATE;
511 ptlrpc_req_finished(req);
513 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
515 ll_open_complete(inode);
/* error path: free the handle we allocated above */
519 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
520 *och_p = NULL; /* OBD_FREE writes some magic there */
523 up(&lli->lli_och_sem);
528 /* Fills the obdo with the attributes for the inode defined by lsm */
/* Async getattr across all stripes of @lsm via a ptlrpc request set; on
 * return the caller's obdo (oa, not visible in this view) carries the
 * merged size/blocks/time attributes.
 * NOTE(review): lines are missing (oa parameter, oinfo setup, RETURNs). */
529 int ll_lsm_getattr(struct obd_export *exp, struct lov_stripe_md *lsm,
532 struct ptlrpc_request_set *set;
533 struct obd_info oinfo = { { { 0 } } };
537 LASSERT(lsm != NULL);
539 memset(oa, 0, sizeof *oa);
542 oa->o_id = lsm->lsm_object_id;
543 oa->o_mode = S_IFREG;
544 oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
545 OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
548 set = ptlrpc_prep_set();
552 rc = obd_getattr_async(exp, &oinfo, set);
554 rc = ptlrpc_set_wait(set);
555 ptlrpc_set_destroy(set);
/* keep only the fields the OSTs are authoritative for */
560 oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
561 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
/* Clear the setuid bit (and setgid when group-exec is set) on @inode after
 * a write by an unprivileged (no CAP_FSETID) process, mirroring the usual
 * kernel remove-suid semantics. */
565 static inline void ll_remove_suid(struct inode *inode)
569 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
570 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
572 /* was any of the uid bits set? */
573 mode &= inode->i_mode;
574 if (mode && !capable(CAP_FSETID)) {
575 inode->i_mode &= ~mode;
576 // XXX careful here - we cannot change the size
/* Map a DLM extent @lock back to the stripe index it covers within the
 * inode's LOV stripe_md, by asking the LOV layer via obd_get_info with a
 * "lock_to_stripe" key.  Sanity-checks that the lock's resource matches
 * the stripe's object id/group.
 * NOTE(review): lines are missing (the 'check:' label, success RETURN);
 * the single-stripe shortcut (stripe 0) skips the obd_get_info call. */
580 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
582 struct ll_inode_info *lli = ll_i2info(inode);
583 struct lov_stripe_md *lsm = lli->lli_smd;
584 struct obd_export *exp = ll_i2obdexp(inode);
587 struct ldlm_lock *lock;
588 struct lov_stripe_md *lsm;
589 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
590 __u32 stripe, vallen = sizeof(stripe);
594 if (lsm->lsm_stripe_count == 1)
595 GOTO(check, stripe = 0);
597 /* get our offset in the lov */
598 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
600 CERROR("obd_get_info: rc = %d\n", rc);
603 LASSERT(stripe < lsm->lsm_stripe_count);
/* verify the DLM resource really names this stripe's object */
606 if (lsm->lsm_oinfo[stripe].loi_id != lock->l_resource->lr_name.name[0]||
607 lsm->lsm_oinfo[stripe].loi_gr != lock->l_resource->lr_name.name[1]){
608 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
609 lsm->lsm_oinfo[stripe].loi_id,
610 lsm->lsm_oinfo[stripe].loi_gr);
611 RETURN(-ELDLM_NO_LOCK_DATA);
617 /* Flush the page cache for an extent as its canceled. When we're on an LOV,
618 * we get a lock cancellation for each stripe, so we have to map the obd's
619 * region back onto the stripes in the file that it held.
621 * No one can dirty the extent until we've finished our work and they can
622 * enqueue another lock. The DLM protects us from ll_file_read/write here,
623 * but other kernel actors could have pages locked.
625 * Called with the DLM lock held. */
/* Evict (write back then truncate, or discard) all cached pages covered by
 * a cancelled extent @lock on stripe @stripe of @lsm, mapping the per-object
 * extent back to file page indices.  See the design comment above this
 * function in the original file.
 * NOTE(review): many lines are missing from this view (page local, several
 * continue/break/goto statements, loop tails); comments cover visible code
 * only. */
626 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
627 struct ldlm_lock *lock, __u32 stripe)
629 ldlm_policy_data_t tmpex;
630 unsigned long start, end, count, skip, i, j;
632 int rc, rc2, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
633 struct lustre_handle lockh;
636 memcpy(&tmpex, &lock->l_policy_data, sizeof(tmpex));
637 CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
638 inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
641 /* our locks are page granular thanks to osc_enqueue, we invalidate the
643 if ((tmpex.l_extent.start & ~CFS_PAGE_MASK) != 0 ||
644 ((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) != 0)
645 LDLM_ERROR(lock, "lock not aligned on CFS_PAGE_SIZE %lu", CFS_PAGE_SIZE);
646 LASSERT((tmpex.l_extent.start & ~CFS_PAGE_MASK) == 0);
647 LASSERT(((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) == 0);
/* convert the per-stripe extent into file-wide page indices */
651 start = tmpex.l_extent.start >> CFS_PAGE_SHIFT;
652 end = tmpex.l_extent.end >> CFS_PAGE_SHIFT;
653 if (lsm->lsm_stripe_count > 1) {
654 count = lsm->lsm_stripe_size >> CFS_PAGE_SHIFT;
655 skip = (lsm->lsm_stripe_count - 1) * count;
656 start += start/count * skip + stripe * count;
658 end += end/count * skip + stripe * count;
/* clamp 'end' on unsigned overflow of the mapping above */
660 if (end < tmpex.l_extent.end >> CFS_PAGE_SHIFT)
663 i = inode->i_size ? (inode->i_size - 1) >> CFS_PAGE_SHIFT : 0;
667 CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
668 "count: %lu skip: %lu end: %lu%s\n", start, start % count,
669 count, skip, end, discard ? " (DISCARDING)" : "");
671 /* walk through the vmas on the inode and tear down mmaped pages that
672 * intersect with the lock. this stops immediately if there are no
673 * mmap()ed regions of the file. This is not efficient at all and
674 * should be short lived. We'll associate mmap()ed pages with the lock
675 * and will be able to find them directly */
676 for (i = start; i <= end; i += (j + skip)) {
677 j = min(count - (i % count), end - i + 1);
679 LASSERT(inode->i_mapping);
680 if (ll_teardown_mmaps(inode->i_mapping,
681 (__u64)i << CFS_PAGE_SHIFT,
682 ((__u64)(i+j) << CFS_PAGE_SHIFT) - 1) )
686 /* this is the simplistic implementation of page eviction at
687 * cancelation. It is careful to get races with other page
688 * lockers handled correctly. fixes from bug 20 will make it
689 * more efficient by associating locks with pages and with
690 * batching writeback under the lock explicitly. */
691 for (i = start, j = start % count; i <= end;
692 j++, i++, tmpex.l_extent.start += CFS_PAGE_SIZE) {
694 CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
700 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
701 LPU64" >= "LPU64" start %lu i %lu end %lu\n",
702 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
705 if (!mapping_has_pages(inode->i_mapping)) {
706 CDEBUG(D_INODE|D_PAGE, "nothing left\n");
712 page = find_get_page(inode->i_mapping, i);
715 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
716 i, tmpex.l_extent.start);
719 /* page->mapping to check with racing against teardown */
720 if (!discard && clear_page_dirty_for_io(page)) {
721 rc = ll_call_writepage(inode, page);
723 CERROR("writepage of page %p failed: %d\n",
725 /* either waiting for io to complete or reacquiring
726 * the lock that the failed writepage released */
730 tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1;
731 /* check to see if another DLM lock covers this page b=2765 */
732 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
733 LDLM_FL_BLOCK_GRANTED|LDLM_FL_CBPENDING |
735 &lock->l_resource->lr_name, LDLM_EXTENT,
736 &tmpex, LCK_PR | LCK_PW, &lockh);
737 if (rc2 == 0 && page->mapping != NULL) {
738 struct ll_async_page *llap = llap_cast_private(page);
739 // checking again to account for writeback's lock_page()
740 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
742 ll_ra_accounting(llap, inode->i_mapping);
743 ll_truncate_complete_page(page);
746 page_cache_release(page);
748 LASSERTF(tmpex.l_extent.start <=
749 (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
750 lock->l_policy_data.l_extent.end + 1),
751 "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
752 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
/* DLM blocking/cancel callback for OSC extent locks.  On BLOCKING we cancel
 * our own lock; on CANCELING we evict covered pages via
 * ll_pgcache_remove_extent() and shrink the stripe's known-minimum-size
 * (KMS) to reflect the lost coverage.
 * NOTE(review): lines are missing from this view (switch header, lsm
 * assignment, iput, RETURN); comments cover visible code only. */
757 static int ll_extent_lock_callback(struct ldlm_lock *lock,
758 struct ldlm_lock_desc *new, void *data,
761 struct lustre_handle lockh = { 0 };
/* small non-NULL data values indicate corruption, not a real pointer */
765 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
766 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
771 case LDLM_CB_BLOCKING:
772 ldlm_lock2handle(lock, &lockh);
773 rc = ldlm_cli_cancel(&lockh);
775 CERROR("ldlm_cli_cancel failed: %d\n", rc);
777 case LDLM_CB_CANCELING: {
779 struct ll_inode_info *lli;
780 struct lov_stripe_md *lsm;
784 /* This lock wasn't granted, don't try to evict pages */
785 if (lock->l_req_mode != lock->l_granted_mode)
788 inode = ll_inode_from_lock(lock);
791 lli = ll_i2info(inode);
794 if (lli->lli_smd == NULL)
798 stripe = ll_lock_to_stripe_offset(inode, lock);
802 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
/* lov_stripe_lock + lock_res_and_lock guard the per-stripe KMS update */
804 lov_stripe_lock(lsm);
805 lock_res_and_lock(lock);
806 kms = ldlm_extent_shift_kms(lock,
807 lsm->lsm_oinfo[stripe].loi_kms);
809 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
810 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
811 lsm->lsm_oinfo[stripe].loi_kms, kms);
812 lsm->lsm_oinfo[stripe].loi_kms = kms;
813 unlock_res_and_lock(lock);
814 lov_stripe_unlock(lsm);
815 //ll_try_done_writing(inode);
/* Completion AST for client-side async extent enqueues.  Blocked states are
 * not expected (LBUG).  On grant, updates the stripe's RSS from the lock's
 * LVB and raises KMS accordingly, then wakes waiters and drops the PR
 * reference taken at enqueue time.
 * NOTE(review): lines are missing (locals such as lvb/kms/stripe decls,
 * RETURN); comments cover visible code only. */
828 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
830 /* XXX ALLOCATE - 160 bytes */
831 struct inode *inode = ll_inode_from_lock(lock);
832 struct ll_inode_info *lli = ll_i2info(inode);
833 struct lustre_handle lockh = { 0 };
838 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
839 LDLM_FL_BLOCK_CONV)) {
840 LBUG(); /* not expecting any blocked async locks yet */
841 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
843 ldlm_lock_dump(D_OTHER, lock, 0);
844 ldlm_reprocess_all(lock->l_resource);
848 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
850 stripe = ll_lock_to_stripe_offset(inode, lock);
/* a non-empty LVB carries the server's view of the object size */
854 if (lock->l_lvb_len) {
855 struct lov_stripe_md *lsm = lli->lli_smd;
857 lvb = lock->l_lvb_data;
858 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
860 LOCK_INODE_MUTEX(inode);
861 lock_res_and_lock(lock);
862 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
863 kms = ldlm_extent_shift_kms(NULL, kms);
864 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
865 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
866 lsm->lsm_oinfo[stripe].loi_kms, kms);
867 lsm->lsm_oinfo[stripe].loi_kms = kms;
868 unlock_res_and_lock(lock);
869 UNLOCK_INODE_MUTEX(inode);
874 wake_up(&lock->l_waitq);
/* drop the PR reference taken when the async enqueue was issued */
876 ldlm_lock2handle(lock, &lockh);
877 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: another client wants to know this object's size.  Packs an
 * LVB reply carrying our KMS for the covered stripe plus the inode times.
 * -ELDLM_NO_LOCK_DATA paths are normal races (inode/lsm gone), answered
 * with an empty reply rather than a console error.
 * NOTE(review): lines are missing (lvb/stripe/rc declarations, out/iput
 * labels, the final RETURN); comments cover visible code only. */
882 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
884 struct ptlrpc_request *req = reqp;
885 struct inode *inode = ll_inode_from_lock(lock);
886 struct ll_inode_info *lli;
887 struct lov_stripe_md *lsm;
890 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
894 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
895 lli = ll_i2info(inode);
897 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
900 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
902 /* First, find out which stripe index this lock corresponds to. */
903 stripe = ll_lock_to_stripe_offset(inode, lock);
905 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
907 rc = lustre_pack_reply(req, 2, size, NULL);
909 CERROR("lustre_pack_reply: %d\n", rc);
/* report our known-minimum-size for the stripe, plus inode timestamps */
913 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
914 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe].loi_kms;
915 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
916 lvb->lvb_atime = LTIME_S(inode->i_atime);
917 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
919 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
920 " atime "LPU64", mtime "LPU64", ctime "LPU64,
921 inode->i_size, stripe, lvb->lvb_size, lvb->lvb_mtime,
922 lvb->lvb_atime, lvb->lvb_ctime);
927 /* These errors are normal races, so we don't want to fill the console
928 * with messages by calling ptlrpc_error() */
929 if (rc == -ELDLM_NO_LOCK_DATA)
930 lustre_pack_reply(req, 1, NULL, NULL);
/* Glimpse the size of an arbitrary @lsm (ioctl path) and fill the caller's
 * stat buffer (st) from the merged LVB.  Uses a PR intent enqueue with
 * LDLM_FL_HAS_INTENT so no conflicting locks are revoked.
 * NOTE(review): lines are missing (st parameter, lvb local, oi_md
 * assignment, RETURN); comments cover visible code only. */
936 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
939 struct lustre_handle lockh = { 0 };
940 struct obd_enqueue_info einfo = { 0 };
941 struct obd_info oinfo = { { { 0 } } };
947 einfo.ei_type = LDLM_EXTENT;
948 einfo.ei_mode = LCK_PR;
949 einfo.ei_flags = LDLM_FL_HAS_INTENT;
950 einfo.ei_cb_bl = ll_extent_lock_callback;
951 einfo.ei_cb_cp = ldlm_completion_ast;
952 einfo.ei_cb_gl = ll_glimpse_callback;
953 einfo.ei_cbdata = NULL;
955 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
956 oinfo.oi_lockh = &lockh;
959 rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
963 CERROR("obd_enqueue returned rc %d, "
964 "returning -EIO\n", rc);
965 RETURN(rc > 0 ? -EIO : rc);
/* merge per-stripe LVBs into one size/blocks/time view under stripe lock */
968 lov_stripe_lock(lsm);
969 memset(&lvb, 0, sizeof(lvb));
970 obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 0);
971 st->st_size = lvb.lvb_size;
972 st->st_blocks = lvb.lvb_blocks;
973 st->st_mtime = lvb.lvb_mtime;
974 st->st_atime = lvb.lvb_atime;
975 st->st_ctime = lvb.lvb_ctime;
976 lov_stripe_unlock(lsm);
981 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
982 * file (because it prefers KMS over RSS when larger) */
/* Glimpse the authoritative size of @inode from the OSTs and update the
 * VFS inode (i_size, i_blocks, times) from the merged LVB under the inode
 * size lock.  See the NOTE below: the enqueue is a glimpse, not a real
 * conflicting lock request.
 * NOTE(review): lines are missing (lvb local, lli_smd NULL-check branch,
 * oi_policy start, RETURN); comments cover visible code only. */
983 int ll_glimpse_size(struct inode *inode, int ast_flags)
985 struct ll_inode_info *lli = ll_i2info(inode);
986 struct ll_sb_info *sbi = ll_i2sbi(inode);
987 struct lustre_handle lockh = { 0 };
988 struct obd_enqueue_info einfo = { 0 };
989 struct obd_info oinfo = { { { 0 } } };
994 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
997 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1001 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1002 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1003 * won't revoke any conflicting DLM locks held. Instead,
1004 * ll_glimpse_callback() will be called on each client
1005 * holding a DLM lock against this file, and resulting size
1006 * will be returned for each stripe. DLM lock on [0, EOF] is
1007 * acquired only if there were no conflicting locks. */
1008 einfo.ei_type = LDLM_EXTENT;
1009 einfo.ei_mode = LCK_PR;
1010 einfo.ei_flags = ast_flags | LDLM_FL_HAS_INTENT;
1011 einfo.ei_cb_bl = ll_extent_lock_callback;
1012 einfo.ei_cb_cp = ldlm_completion_ast;
1013 einfo.ei_cb_gl = ll_glimpse_callback;
1014 einfo.ei_cbdata = inode;
1016 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1017 oinfo.oi_lockh = &lockh;
1018 oinfo.oi_md = lli->lli_smd;
1020 rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
1024 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1025 RETURN(rc > 0 ? -EIO : rc);
/* publish the merged per-stripe view into the VFS inode atomically */
1028 ll_inode_size_lock(inode, 1);
1029 inode_init_lvb(inode, &lvb);
1030 obd_merge_lvb(sbi->ll_osc_exp, lli->lli_smd, &lvb, 0);
1031 inode->i_size = lvb.lvb_size;
1032 inode->i_blocks = lvb.lvb_blocks;
1033 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1034 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1035 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1036 ll_inode_size_unlock(inode, 1);
1038 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n",
1039 inode->i_size, inode->i_blocks);
/* Take a DLM extent lock of @mode over @policy on @lsm for I/O, then
 * refresh the inode from the merged LVB.  Skipped entirely (no lock taken)
 * for LL_FILE_IGNORE_LOCK fds and nolock mounts.  i_size is only updated
 * for a full-file [0, EOF] lock — see the ordering comment below.
 * NOTE(review): lines are missing (ast_flags parameter, lvb local,
 * error/exit paths, RETURN); comments cover visible code only. */
1044 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1045 struct lov_stripe_md *lsm, int mode,
1046 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1049 struct ll_sb_info *sbi = ll_i2sbi(inode);
1051 struct obd_enqueue_info einfo = { 0 };
1052 struct obd_info oinfo = { { { 0 } } };
1056 LASSERT(!lustre_handle_is_used(lockh));
1057 LASSERT(lsm != NULL);
1059 /* don't drop the mmapped file to LRU */
1060 if (mapping_mapped(inode->i_mapping))
1061 ast_flags |= LDLM_FL_NO_LRU;
1063 /* XXX phil: can we do this? won't it screw the file size up? */
1064 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1065 (sbi->ll_flags & LL_SBI_NOLCK))
1068 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1069 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1071 einfo.ei_type = LDLM_EXTENT;
1072 einfo.ei_mode = mode;
1073 einfo.ei_flags = ast_flags;
1074 einfo.ei_cb_bl = ll_extent_lock_callback;
1075 einfo.ei_cb_cp = ldlm_completion_ast;
1076 einfo.ei_cb_gl = ll_glimpse_callback;
1077 einfo.ei_cbdata = inode;
1079 oinfo.oi_policy = *policy;
1080 oinfo.oi_lockh = lockh;
1083 rc = obd_enqueue(sbi->ll_osc_exp, &oinfo, &einfo);
/* the enqueue may have widened the extent; hand it back to the caller */
1084 *policy = oinfo.oi_policy;
1088 ll_inode_size_lock(inode, 1);
1089 inode_init_lvb(inode, &lvb);
1090 obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 1);
1092 if (policy->l_extent.start == 0 &&
1093 policy->l_extent.end == OBD_OBJECT_EOF) {
1094 /* vmtruncate()->ll_truncate() first sets the i_size and then
1095 * the kms under both a DLM lock and the
1096 * ll_inode_size_lock(). If we don't get the
1097 * ll_inode_size_lock() here we can match the DLM lock and
1098 * reset i_size from the kms before the truncating path has
1099 * updated the kms. generic_file_write can then trust the
1100 * stale i_size when doing appending writes and effectively
1101 * cancel the result of the truncate. Getting the
1102 * ll_inode_size_lock() after the enqueue maintains the DLM
1103 * -> ll_inode_size_lock() acquiring order. */
1104 inode->i_size = lvb.lvb_size;
1105 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1106 inode->i_ino, inode->i_size);
1110 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1111 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1112 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1114 ll_inode_size_unlock(inode, 1);
/* Release an extent lock taken by ll_extent_lock().  Mirrors its early
 * exit: no-op for LL_FILE_IGNORE_LOCK fds and nolock mounts.
 * NOTE(review): the early-exit RETURN line is not visible in this view. */
1119 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1120 struct lov_stripe_md *lsm, int mode,
1121 struct lustre_handle *lockh)
1123 struct ll_sb_info *sbi = ll_i2sbi(inode);
1127 /* XXX phil: can we do this? won't it screw the file size up? */
1128 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1129 (sbi->ll_flags & LL_SBI_NOLCK))
1132 rc = obd_cancel(sbi->ll_osc_exp, lsm, mode, lockh);
/*
 * ll_file_read(): the client read(2) path.
 *
 * Visible flow: take a PR extent DLM lock over the region being read
 * (chunked by ll_max_rw_chunk so a single syscall never holds a lock
 * wider than one stripe chunk), pin a consistent file size under
 * ll_inode_size_lock(), then hand the page-cache work to
 * generic_file_read().  Files with no OST objects are serviced from
 * i_size alone with clear_user() zero-fill (mknod+truncate case, bug 6243).
 *
 * NOTE(review): this excerpt elides many lines (loop back-edge for the
 * chunked read, error paths, RETURN); comments below only describe what
 * is visible.
 */
1137 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1140 struct inode *inode = file->f_dentry->d_inode;
1141 struct ll_inode_info *lli = ll_i2info(inode);
1142 struct lov_stripe_md *lsm = lli->lli_smd;
1143 struct ll_sb_info *sbi = ll_i2sbi(inode);
1144 struct ll_lock_tree tree;
1145 struct ll_lock_tree_node *node;
1147 struct ll_ra_read bead;
1150 ssize_t retval, chunk, sum = 0;
1154 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1155 inode->i_ino, inode->i_generation, inode, count, *ppos);
1156 ll_vfs_ops_tally(sbi, VFS_OPS_READ);
1158 /* "If nbyte is 0, read() will return 0 and have no other results."
1159 * -- Single Unix Spec */
1163 lprocfs_counter_add(sbi->ll_stats, LPROC_LL_READ_BYTES, count);
1166 /* Read on file with no objects should return zero-filled
1167 * buffers up to file size (we can get non-zero sizes with
1168 * mknod + truncate, then opening file for read. This is a
1169 * common pattern in NFS case, it seems). Bug 6243 */
1171 /* Since there are no objects on OSTs, we have nothing to get
1172 * lock on and so we are forced to access inode->i_size
1175 /* Read beyond end of file */
1176 if (*ppos >= inode->i_size)
1179 if (count > inode->i_size - *ppos)
1180 count = inode->i_size - *ppos;
1181 /* Make sure to correctly adjust the file pos pointer for
1183 notzeroed = clear_user(buf, count);
/* Chunked-locking path: bound the locked extent to the current stripe
 * end and to ll_max_rw_chunk so wide reads don't take huge locks. */
1192 if (sbi->ll_max_rw_chunk != 0) {
1193 /* first, let's know the end of the current stripe */
1195 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,
1198 /* correct, the end is beyond the request */
1199 if (end > *ppos + count - 1)
1200 end = *ppos + count - 1;
1202 /* and chunk shouldn't be too large even if striping is wide */
1203 if (end - *ppos > sbi->ll_max_rw_chunk)
1204 end = *ppos + sbi->ll_max_rw_chunk - 1;
1206 end = *ppos + count - 1;
1209 node = ll_node_from_inode(inode, *ppos, end, LCK_PR);
1210 tree.lt_fd = LUSTRE_FPRIVATE(file);
1211 rc = ll_tree_lock(&tree, node, buf, count,
1212 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1214 GOTO(out, retval = rc);
1216 ll_inode_size_lock(inode, 1);
1218 * Consistency guarantees: following possibilities exist for the
1219 * relation between region being read and real file size at this
1222 * (A): the region is completely inside of the file;
1224 * (B-x): x bytes of region are inside of the file, the rest is
1227 * (C): the region is completely outside of the file.
1229 * This classification is stable under DLM lock acquired by
1230 * ll_tree_lock() above, because to change class, other client has to
1231 * take DLM lock conflicting with our lock. Also, any updates to
1232 * ->i_size by other threads on this client are serialized by
1233 * ll_inode_size_lock(). This guarantees that short reads are handled
1234 * correctly in the face of concurrent writes and truncates.
1236 inode_init_lvb(inode, &lvb);
1237 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
1239 if (*ppos + count - 1 > kms) {
1240 /* A glimpse is necessary to determine whether we return a
1241 * short read (B) or some zeroes at the end of the buffer (C) */
1242 ll_inode_size_unlock(inode, 1);
1243 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1245 ll_tree_unlock(&tree);
1249 /* region is within kms and, hence, within real file size (A).
1250 * We need to increase i_size to cover the read region so that
1251 * generic_file_read() will do its job, but that doesn't mean
1252 * the kms size is _correct_, it is only the _minimum_ size.
1253 * If someone does a stat they will get the correct size which
1254 * will always be >= the kms value here. b=11081 */
1255 if (inode->i_size < kms)
1256 inode->i_size = kms;
1257 ll_inode_size_unlock(inode, 1);
1260 chunk = end - *ppos + 1;
1261 CDEBUG(D_INODE,"Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1262 inode->i_ino, chunk, *ppos, inode->i_size);
1264 /* turn off the kernel's read-ahead */
1265 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1268 file->f_ra.ra_pages = 0;
1270 /* initialize read-ahead window once per syscall */
/* "bead" describes this read to Lustre's own read-ahead engine,
 * replacing the kernel read-ahead disabled above. */
1273 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1274 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1275 ll_ra_read_in(file, &bead);
1279 file_accessed(file);
1280 retval = generic_file_read(file, buf, chunk, ppos);
1281 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 0);
1283 ll_tree_unlock(&tree);
/* Full chunk consumed with bytes remaining: presumably loops back for
 * the next chunk -- loop structure not visible in this excerpt. */
1289 if (retval == chunk && count > 0)
1295 ll_ra_read_ex(file, &bead);
1296 retval = (sum > 0) ? sum : retval;
1301 * Write to a file (through the page cache).
/*
 * ll_file_write(): the client write(2) path (through the page cache).
 *
 * Visible flow: serialize writers on lli_write_sem, take a PW extent DLM
 * lock over the region (lock to EOF for O_APPEND; otherwise chunked by
 * stripe end and ll_max_rw_chunk like the read path), enforce maxbytes
 * (SIGXFSZ/-EFBIG per POSIX), then call generic_file_write().  On
 * O_SYNC/IS_SYNC the written range is flushed with ll_sync_page_range().
 *
 * NOTE(review): excerpt elides lines (lock_start assignment, chunk loop
 * back-edge, RETURN); comments describe only the visible statements.
 */
1303 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1306 struct inode *inode = file->f_dentry->d_inode;
1307 struct ll_sb_info *sbi = ll_i2sbi(inode);
1308 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1309 struct ll_lock_tree tree;
1310 struct ll_lock_tree_node *node;
1311 loff_t maxbytes = ll_file_maxbytes(inode);
1312 loff_t lock_start, lock_end, end;
1313 ssize_t retval, chunk, sum = 0;
1317 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1318 inode->i_ino, inode->i_generation, inode, count, *ppos);
1319 ll_vfs_ops_tally(sbi, VFS_OPS_WRITE);
1321 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1323 /* POSIX, but surprised the VFS doesn't check this already */
1327 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1328 * called on the file, don't fail the below assertion (bug 2388). */
1329 if (file->f_flags & O_LOV_DELAY_CREATE &&
1330 ll_i2info(inode)->lli_smd == NULL)
1333 LASSERT(ll_i2info(inode)->lli_smd != NULL);
1335 down(&ll_i2info(inode)->lli_write_sem);
1338 chunk = 0; /* just to fix gcc's warning */
1339 end = *ppos + count - 1;
/* O_APPEND must lock to EOF: the actual write offset is only known
 * once the lock is held and i_size is stable. */
1341 if (file->f_flags & O_APPEND) {
1343 lock_end = OBD_OBJECT_EOF;
1344 } else if (sbi->ll_max_rw_chunk != 0) {
1345 /* first, let's know the end of the current stripe */
1347 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,
1350 /* correct, the end is beyond the request */
1351 if (end > *ppos + count - 1)
1352 end = *ppos + count - 1;
1354 /* and chunk shouldn't be too large even if striping is wide */
1355 if (end - *ppos > sbi->ll_max_rw_chunk)
1356 end = *ppos + sbi->ll_max_rw_chunk - 1;
1361 lock_end = *ppos + count - 1;
1363 node = ll_node_from_inode(inode, lock_start, lock_end, LCK_PW);
1366 GOTO(out, retval = PTR_ERR(node));
1368 tree.lt_fd = LUSTRE_FPRIVATE(file);
1369 rc = ll_tree_lock(&tree, node, buf, count,
1370 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1372 GOTO(out, retval = rc);
1374 /* This is ok, g_f_w will overwrite this under i_sem if it races
1375 * with a local truncate, it just makes our maxbyte checking easier.
1376 * The i_size value gets updated in ll_extent_lock() as a consequence
1377 * of the [0,EOF] extent lock we requested above. */
1378 if (file->f_flags & O_APPEND) {
1379 *ppos = inode->i_size;
1380 end = *ppos + count - 1;
1383 if (*ppos >= maxbytes) {
1384 send_sig(SIGXFSZ, current, 0);
1385 GOTO(out, retval = -EFBIG);
1387 if (*ppos + count > maxbytes)
1388 count = maxbytes - *ppos;
1390 /* generic_file_write handles O_APPEND after getting i_mutex */
1391 chunk = end - *ppos + 1;
1392 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1393 inode->i_ino, chunk, *ppos);
1394 retval = generic_file_write(file, buf, chunk, ppos);
1395 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 1);
1398 ll_tree_unlock(&tree);
/* Full chunk written with bytes left: presumably loops for the next
 * chunk -- loop structure not visible in this excerpt. */
1404 if (retval == chunk && count > 0)
1408 up(&ll_i2info(inode)->lli_write_sem);
1410 retval = (sum > 0) ? sum : retval;
1411 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES,
1412 retval > 0 ? retval : 0);
1414 if (retval > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
1415 rc = ll_sync_page_range(inode, inode->i_mapping, *ppos - retval,
1425 * Send file content (through pagecache) somewhere with helper
1427 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
/*
 * ll_file_sendfile(): sendfile(2) support (2.6 kernels only, per the #if
 * guard above).  Mirrors ll_file_read(): PR extent lock via the lock
 * tree, kms/glimpse size classification under ll_inode_size_lock(), then
 * generic_file_sendfile() with Lustre read-ahead bracketing.  Files with
 * no OST objects bypass locking entirely.
 * NOTE(review): excerpt is non-contiguous; error paths and RETURN are
 * not visible.
 */
1428 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1429 read_actor_t actor, void *target)
1431 struct inode *inode = in_file->f_dentry->d_inode;
1432 struct ll_inode_info *lli = ll_i2info(inode);
1433 struct lov_stripe_md *lsm = lli->lli_smd;
1434 struct ll_lock_tree tree;
1435 struct ll_lock_tree_node *node;
1437 struct ll_ra_read bead;
1442 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1443 inode->i_ino, inode->i_generation, inode, count, *ppos);
1445 /* "If nbyte is 0, read() will return 0 and have no other results."
1446 * -- Single Unix Spec */
1450 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES,
1453 /* turn off the kernel's read-ahead */
1454 in_file->f_ra.ra_pages = 0;
1456 /* File with no objects, nothing to lock */
1458 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1460 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1461 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1462 rc = ll_tree_lock(&tree, node, NULL, count,
1463 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1467 ll_inode_size_lock(inode, 1);
1469 * Consistency guarantees: following possibilities exist for the
1470 * relation between region being read and real file size at this
1473 * (A): the region is completely inside of the file;
1475 * (B-x): x bytes of region are inside of the file, the rest is
1478 * (C): the region is completely outside of the file.
1480 * This classification is stable under DLM lock acquired by
1481 * ll_tree_lock() above, because to change class, other client has to
1482 * take DLM lock conflicting with our lock. Also, any updates to
1483 * ->i_size by other threads on this client are serialized by
1484 * ll_inode_size_lock(). This guarantees that short reads are handled
1485 * correctly in the face of concurrent writes and truncates.
1487 inode_init_lvb(inode, &lvb);
1488 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
1490 if (*ppos + count - 1 > kms) {
1491 /* A glimpse is necessary to determine whether we return a
1492 * short read (B) or some zeroes at the end of the buffer (C) */
1493 ll_inode_size_unlock(inode, 1);
1494 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1498 /* region is within kms and, hence, within real file size (A) */
1499 inode->i_size = kms;
1500 ll_inode_size_unlock(inode, 1);
1503 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1504 inode->i_ino, count, *ppos, inode->i_size);
1506 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1507 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1508 ll_ra_read_in(in_file, &bead);
1510 file_accessed(in_file);
1511 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1512 ll_ra_read_ex(in_file, &bead);
1515 ll_tree_unlock(&tree);
/*
 * ll_lov_recreate_obj(): LL_IOC_RECREATE_OBJ ioctl handler.  Root-only
 * (CAP_SYS_ADMIN): copies a ll_recreate_obj request from userspace and
 * asks the OSC to recreate a lost OST object (OBD_FL_RECREATE_OBJS) for
 * this file, using a temporary copy of the stripe md.  Serialized on
 * lli_open_sem.
 * NOTE(review): excerpt elides oa allocation and error/RETURN paths.
 */
1520 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1523 struct ll_inode_info *lli = ll_i2info(inode);
1524 struct obd_export *exp = ll_i2obdexp(inode);
1525 struct ll_recreate_obj ucreatp;
1526 struct obd_trans_info oti = { 0 };
1527 struct obdo *oa = NULL;
1530 struct lov_stripe_md *lsm, *lsm2;
1533 if (!capable (CAP_SYS_ADMIN))
1536 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1537 sizeof(struct ll_recreate_obj));
1545 down(&lli->lli_open_sem);
1548 GOTO(out, rc = -ENOENT);
1549 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1550 (lsm->lsm_stripe_count));
1552 OBD_ALLOC(lsm2, lsm_size);
1554 GOTO(out, rc = -ENOMEM);
1556 oa->o_id = ucreatp.lrc_id;
1557 oa->o_nlink = ucreatp.lrc_ost_idx;
1558 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1559 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
1560 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1561 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1563 oti.oti_objid = NULL;
1564 memcpy(lsm2, lsm, lsm_size);
1565 rc = obd_create(exp, oa, &lsm2, &oti);
1567 OBD_FREE(lsm2, lsm_size);
1570 up(&lli->lli_open_sem);
/*
 * ll_lov_setstripe_ea_info(): apply a lov_user_md striping EA to a file
 * by re-driving an IT_OPEN intent with the supplied layout.  Fails with
 * EEXIST-style behavior if a stripe already exists (only the CDEBUG is
 * visible here); on success the resulting open handle is closed again
 * via ll_release_openhandle().  Serialized on lli_open_sem.
 */
1575 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1576 int flags, struct lov_user_md *lum,
1579 struct ll_inode_info *lli = ll_i2info(inode);
1580 struct lov_stripe_md *lsm;
1581 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1585 down(&lli->lli_open_sem);
1588 up(&lli->lli_open_sem);
1589 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1594 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1597 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1598 GOTO(out_req_free, rc = -ENOENT);
1599 rc = oit.d.lustre.it_status;
1601 GOTO(out_req_free, rc);
1603 ll_release_openhandle(file->f_dentry, &oit);
1606 up(&lli->lli_open_sem);
1607 ll_intent_release(&oit);
/* out_req_free path: drop the enqueue reply before returning. */
1610 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * ll_lov_getstripe_ea_info(): fetch the LOV EA (striping descriptor) for
 * @filename from the MDS via mdc_getattr_name(), byte-swap it to host
 * endianness if needed, and return it through @lmmp/@lmm_size.  The
 * caller owns *request and must ptlrpc_req_finished() it (the EA points
 * into the reply buffer).
 *
 * The LOV_MAGIC_JOIN branch converts the packed join-file metadata into
 * a lov_user_md_join with explicit per-stripe extents, so userspace sees
 * a uniform format.
 * NOTE(review): excerpt elides several declarations (fid, rc) and error
 * paths; comments describe the visible statements only.
 */
1614 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1615 struct lov_mds_md **lmmp, int *lmm_size,
1616 struct ptlrpc_request **request)
1618 struct ll_sb_info *sbi = ll_i2sbi(inode);
1620 struct mds_body *body;
1621 struct lov_mds_md *lmm = NULL;
1622 struct ptlrpc_request *req = NULL;
1625 ll_inode2fid(&fid, inode);
1627 rc = ll_get_max_mdsize(sbi, &lmmsize);
1631 rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid,
1632 filename, strlen(filename) + 1,
1633 OBD_MD_FLEASIZE | OBD_MD_FLDIREA,
1636 CDEBUG(D_INFO, "mdc_getattr_name failed "
1637 "on %s: rc %d\n", filename, rc);
1641 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
1643 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1644 /* swabbed by mdc_getattr_name */
1645 LASSERT_REPSWABBED(req, REPLY_REC_OFF);
1647 lmmsize = body->eadatasize;
1649 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1651 GOTO(out, rc = -ENODATA);
1654 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1,
1656 LASSERT(lmm != NULL);
1657 LASSERT_REPSWABBED(req, REPLY_REC_OFF + 1);
1660 * This is coming from the MDS, so is probably in
1661 * little endian. We convert it to host endian before
1662 * passing it to userspace.
1664 if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
1665 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
1666 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
1667 } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
1668 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
/* Join file: unpack into an in-memory lsm, then rebuild a user-visible
 * lov_user_md_join with one entry per stripe. */
1671 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1672 struct lov_stripe_md *lsm;
1673 struct lov_user_md_join *lmj;
1674 int lmj_size, i, aindex = 0;
1676 rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
1678 GOTO(out, rc = -ENOMEM);
1679 rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm);
1681 GOTO(out_free_memmd, rc);
1683 lmj_size = sizeof(struct lov_user_md_join) +
1684 lsm->lsm_stripe_count *
1685 sizeof(struct lov_user_ost_data_join);
1686 OBD_ALLOC(lmj, lmj_size);
1688 GOTO(out_free_memmd, rc = -ENOMEM);
1690 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1691 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1692 struct lov_extent *lex =
1693 &lsm->lsm_array->lai_ext_array[aindex];
/* Advance to the extent covering stripe i (aindex increment elided). */
1695 if (lex->le_loi_idx + lex->le_stripe_count <= i)
1697 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1698 LPU64" len %d\n", aindex, i,
1699 lex->le_start, (int)lex->le_len);
1700 lmj->lmm_objects[i].l_extent_start =
1703 if ((int)lex->le_len == -1)
1704 lmj->lmm_objects[i].l_extent_end = -1;
1706 lmj->lmm_objects[i].l_extent_end =
1707 lex->le_start + lex->le_len;
1708 lmj->lmm_objects[i].l_object_id =
1709 lsm->lsm_oinfo[i].loi_id;
1710 lmj->lmm_objects[i].l_object_gr =
1711 lsm->lsm_oinfo[i].loi_gr;
1712 lmj->lmm_objects[i].l_ost_gen =
1713 lsm->lsm_oinfo[i].loi_ost_gen;
1714 lmj->lmm_objects[i].l_ost_idx =
1715 lsm->lsm_oinfo[i].loi_ost_idx;
1717 lmm = (struct lov_mds_md *)lmj;
1720 obd_free_memmd(sbi->ll_osc_exp, &lsm);
1724 *lmm_size = lmmsize;
/*
 * ll_lov_setea(): LL_IOC_LOV_SETEA ioctl.  Root-only: copies a
 * lov_user_md (with one ost_data entry) from userspace and applies it
 * via ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS, i.e. the
 * objects are supplied by the caller rather than allocated.
 */
1728 static int ll_lov_setea(struct inode *inode, struct file *file,
1731 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1732 struct lov_user_md *lump;
1733 int lum_size = sizeof(struct lov_user_md) +
1734 sizeof(struct lov_user_ost_data);
1738 if (!capable (CAP_SYS_ADMIN))
1741 OBD_ALLOC(lump, lum_size);
1745 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
1747 OBD_FREE(lump, lum_size);
1751 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1753 OBD_FREE(lump, lum_size);
/*
 * ll_lov_setstripe(): LL_IOC_LOV_SETSTRIPE ioctl.  Copies the user's
 * lov_user_md, applies it via ll_lov_setstripe_ea_info(), then echoes
 * the resulting layout back to userspace through the GETSTRIPE
 * obd_iocontrol (stripe_count pre-zeroed with put_user).
 */
1757 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1760 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1762 int flags = FMODE_WRITE;
1765 /* Bug 1152: copy properly when this is no longer true */
1766 LASSERT(sizeof(lum) == sizeof(*lump));
1767 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1768 rc = copy_from_user(&lum, lump, sizeof(lum));
1772 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
1774 put_user(0, &lump->lmm_stripe_count);
1775 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode),
1776 0, ll_i2info(inode)->lli_smd, lump);
/*
 * ll_lov_getstripe(): LL_IOC_LOV_GETSTRIPE ioctl.  Hands the cached
 * stripe md to the OSC's iocontrol to pack it into the user buffer
 * (the ENOENT check for a NULL lsm is elided from this excerpt).
 */
1781 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1783 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1788 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode), 0, lsm,
/*
 * ll_get_grouplock(): LL_IOC_GROUP_LOCK ioctl.  Takes a whole-file
 * ([0, EOF]) LCK_GROUP extent lock with group id @arg, records the lock
 * handle in the fd, and marks the fd GROUP_LOCKED|IGNORE_LOCK so normal
 * per-I/O extent locking is bypassed while the group lock is held.
 * Double-locking the same fd is rejected (branch body elided).
 */
1792 static int ll_get_grouplock(struct inode *inode, struct file *file,
1795 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1796 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1797 .end = OBD_OBJECT_EOF}};
1798 struct lustre_handle lockh = { 0 };
1799 struct ll_inode_info *lli = ll_i2info(inode);
1800 struct lov_stripe_md *lsm = lli->lli_smd;
1804 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1808 policy.l_extent.gid = arg;
1809 if (file->f_flags & O_NONBLOCK)
1810 flags = LDLM_FL_BLOCK_NOWAIT;
1812 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1816 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
1818 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/*
 * ll_put_grouplock(): LL_IOC_GROUP_UNLOCK ioctl.  Validates that the fd
 * holds a group lock with matching gid, clears the GROUP_LOCKED and
 * IGNORE_LOCK flags, releases the saved lock handle via
 * ll_extent_unlock(), and zeroes the stored handle.
 */
1823 static int ll_put_grouplock(struct inode *inode, struct file *file,
1826 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1827 struct ll_inode_info *lli = ll_i2info(inode);
1828 struct lov_stripe_md *lsm = lli->lli_smd;
1832 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1833 /* Ugh, it's already unlocked. */
1837 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
1840 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
1842 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
1847 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/*
 * join_sanity_check(): validate a file-join request.  Rejects when the
 * server lacks LL_SBI_JOIN support, when either inode is not a regular
 * file, when head == tail, or when head's size is not a multiple of
 * JOIN_FILE_ALIGN (64K).  Error-return statements are elided from this
 * excerpt.
 */
1852 static int join_sanity_check(struct inode *head, struct inode *tail)
1855 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1856 CERROR("server do not support join \n");
1859 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1860 CERROR("tail ino %lu and ino head %lu must be regular\n",
1861 head->i_ino, tail->i_ino);
1864 if (head->i_ino == tail->i_ino) {
1865 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1868 if (head->i_size % JOIN_FILE_ALIGN) {
1869 CERROR("hsize %llu must be times of 64K\n", head->i_size);
/*
 * join_file(): perform the MDS side of a file join.  Drives an IT_OPEN
 * intent with O_JOIN_FILE through mdc_enqueue(), passing head's size
 * split into 32-bit halves (hsize/tsize) as the join cookie.  Any DLM
 * lock granted by the enqueue is dropped immediately, and the resulting
 * open handle is closed via ll_release_openhandle().
 * NOTE(review): excerpt elides error paths between the visible lines.
 */
1875 static int join_file(struct inode *head_inode, struct file *head_filp,
1876 struct file *tail_filp)
1878 struct inode *tail_inode, *tail_parent;
1879 struct dentry *tail_dentry = tail_filp->f_dentry;
1880 struct lookup_intent oit = {.it_op = IT_OPEN,
1881 .it_flags = head_filp->f_flags|O_JOIN_FILE};
1882 struct lustre_handle lockh;
1883 struct mdc_op_data *op_data;
/* head's i_size is carried to the MDS as two 32-bit words. */
1884 __u32 hsize = head_inode->i_size >> 32;
1885 __u32 tsize = head_inode->i_size;
1889 tail_dentry = tail_filp->f_dentry;
1890 tail_inode = tail_dentry->d_inode;
1891 tail_parent = tail_dentry->d_parent->d_inode;
1893 OBD_ALLOC_PTR(op_data);
1894 if (op_data == NULL) {
1898 ll_prepare_mdc_op_data(op_data, head_inode, tail_parent,
1899 tail_dentry->d_name.name,
1900 tail_dentry->d_name.len, 0);
1901 rc = mdc_enqueue(ll_i2mdcexp(head_inode), LDLM_IBITS, &oit, LCK_PW,
1902 op_data, &lockh, &tsize, 0, ldlm_completion_ast,
1903 ll_mdc_blocking_ast, &hsize, 0);
1908 rc = oit.d.lustre.it_status;
1911 ptlrpc_req_finished((struct ptlrpc_request *)
1912 oit.d.lustre.it_data);
1916 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1918 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1919 oit.d.lustre.it_lock_mode = 0;
1921 ll_release_openhandle(head_filp->f_dentry, &oit);
1924 OBD_FREE_PTR(op_data);
1925 ll_intent_release(&oit);
/*
 * ll_file_join(): join the file named @filename_tail onto @head
 * (LL_IOC_JOIN ioctl).  Opens the tail, takes whole-file LCK_EX tree
 * locks on both inodes in ino order to avoid ABBA deadlock, runs
 * join_sanity_check(), then join_file().  Cleanup uses a phased
 * switch with fall-through: each phase undoes one acquired resource
 * (locks + obd_cancel_unused, tail filp, stale head stripe md).
 * NOTE(review): excerpt elides cleanup_phase increments, case labels,
 * and some error branches; comments describe visible statements only.
 */
1929 static int ll_file_join(struct inode *head, struct file *filp,
1930 char *filename_tail)
1932 struct inode *tail = NULL, *first = NULL, *second = NULL;
1933 struct dentry *tail_dentry;
1934 struct file *tail_filp, *first_filp, *second_filp;
1935 struct ll_lock_tree first_tree, second_tree;
1936 struct ll_lock_tree_node *first_node, *second_node;
1937 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
1938 int rc = 0, cleanup_phase = 0;
1941 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1942 head->i_ino, head->i_generation, head, filename_tail);
1944 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1945 if (IS_ERR(tail_filp)) {
1946 CERROR("Can not open tail file %s", filename_tail);
1947 rc = PTR_ERR(tail_filp);
1950 tail = igrab(tail_filp->f_dentry->d_inode);
1952 tlli = ll_i2info(tail);
1953 tail_dentry = tail_filp->f_dentry;
1954 LASSERT(tail_dentry);
1957 /*reorder the inode for lock sequence*/
1958 first = head->i_ino > tail->i_ino ? head : tail;
1959 second = head->i_ino > tail->i_ino ? tail : head;
1960 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1961 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1963 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1964 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
1965 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1966 if (IS_ERR(first_node)){
1967 rc = PTR_ERR(first_node);
1970 first_tree.lt_fd = first_filp->private_data;
1971 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1976 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1977 if (IS_ERR(second_node)){
1978 rc = PTR_ERR(second_node);
1981 second_tree.lt_fd = second_filp->private_data;
1982 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1987 rc = join_sanity_check(head, tail);
1991 rc = join_file(head, filp, tail_filp);
/* Phased cleanup: fall-through undoes resources in reverse order. */
1995 switch (cleanup_phase) {
1997 ll_tree_unlock(&second_tree);
1998 obd_cancel_unused(ll_i2obdexp(second),
1999 ll_i2info(second)->lli_smd, 0, NULL);
2001 ll_tree_unlock(&first_tree);
2002 obd_cancel_unused(ll_i2obdexp(first),
2003 ll_i2info(first)->lli_smd, 0, NULL);
2005 filp_close(tail_filp, 0);
/* On success, head's cached stripe md is stale after the join. */
2008 if (head && rc == 0) {
2009 obd_free_memmd(ll_i2sbi(head)->ll_osc_exp,
2011 hlli->lli_smd = NULL;
2016 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/*
 * ll_release_openhandle(): close the MDS open handle carried by an open
 * intent that will not be turned into a real file open (root dentry and
 * non-DISP_OPEN_OPEN intents are no-ops).  Fills an obd_client_handle
 * from the intent, closes it, frees it, and drops the request reference
 * that would otherwise have been consumed by ll_file_open().
 */
2022 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2024 struct inode *inode = dentry->d_inode;
2025 struct obd_client_handle *och;
2031 /* Root ? Do nothing. */
2032 if (dentry->d_inode->i_sb->s_root == dentry)
2035 /* No open handle to close? Move away */
2036 if (!it_disposition(it, DISP_OPEN_OPEN))
2039 OBD_ALLOC(och, sizeof(*och));
2041 GOTO(out, rc = -ENOMEM);
2043 ll_och_fill(ll_i2info(inode), it, och);
2045 rc = ll_close_inode_openhandle(inode, och);
2047 OBD_FREE(och, sizeof(*och));
2049 /* this one is in place of ll_file_open */
2050 ptlrpc_req_finished(it->d.lustre.it_data);
2051 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * ll_file_ioctl(): dispatch table for Lustre file ioctls.  tty ioctls
 * are rejected early (asm-ppc quirk noted below); LL_IOC_* commands are
 * routed to their handlers; EXT3-compatible flag/version ioctls are
 * supported; anything unrecognized falls through to obd_iocontrol() so
 * OSC/MDC ioctls work on regular files.
 * NOTE(review): excerpt elides switch(cmd), several RETURNs and the
 * LL_IOC_JOIN case label; comments describe visible lines only.
 */
2055 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2058 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2062 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2063 inode->i_generation, inode, cmd);
2064 ll_vfs_ops_tally(ll_i2sbi(inode), VFS_OPS_IOCTL);
2066 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2067 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2070 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_IOCTL);
2072 case LL_IOC_GETFLAGS:
2073 /* Get the current value of the file flags */
2074 return put_user(fd->fd_flags, (int *)arg);
2075 case LL_IOC_SETFLAGS:
2076 case LL_IOC_CLRFLAGS:
2077 /* Set or clear specific file flags */
2078 /* XXX This probably needs checks to ensure the flags are
2079 * not abused, and to handle any flag side effects.
2081 if (get_user(flags, (int *) arg))
2084 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK is only safe when the page cache is bypassed. */
2085 if ((flags & LL_FILE_IGNORE_LOCK) &&
2086 !(file->f_flags & O_DIRECT)) {
2087 CERROR("%s: unable to disable locking on "
2088 "non-O_DIRECT file\n", current->comm);
2092 fd->fd_flags |= flags;
2094 fd->fd_flags &= ~flags;
2097 case LL_IOC_LOV_SETSTRIPE:
2098 RETURN(ll_lov_setstripe(inode, file, arg));
2099 case LL_IOC_LOV_SETEA:
2100 RETURN(ll_lov_setea(inode, file, arg));
2101 case LL_IOC_LOV_GETSTRIPE:
2102 RETURN(ll_lov_getstripe(inode, arg));
2103 case LL_IOC_RECREATE_OBJ:
2104 RETURN(ll_lov_recreate_obj(inode, file, arg));
2105 case EXT3_IOC_GETFLAGS:
2106 case EXT3_IOC_SETFLAGS:
2107 RETURN(ll_iocontrol(inode, file, cmd, arg));
2108 case EXT3_IOC_GETVERSION_OLD:
2109 case EXT3_IOC_GETVERSION:
2110 RETURN(put_user(inode->i_generation, (int *)arg));
/* LL_IOC_JOIN (case label elided): copy tail filename and join. */
2115 ftail = getname((const char *)arg);
2117 RETURN(PTR_ERR(ftail));
2118 rc = ll_file_join(inode, file, ftail);
2122 case LL_IOC_GROUP_LOCK:
2123 RETURN(ll_get_grouplock(inode, file, arg));
2124 case LL_IOC_GROUP_UNLOCK:
2125 RETURN(ll_put_grouplock(inode, file, arg));
2126 case IOC_OBD_STATFS:
2127 RETURN(ll_obd_statfs(inode, (void *)arg));
2129 /* We need to special case any other ioctls we want to handle,
2130 * to send them to the MDS/OST as appropriate and to properly
2131 * network encode the arg field.
2132 case EXT3_IOC_SETVERSION_OLD:
2133 case EXT3_IOC_SETVERSION:
2136 RETURN(obd_iocontrol(cmd, ll_i2obdexp(inode), 0, NULL,
/*
 * ll_file_seek(): llseek implementation.  SEEK_END must first refresh
 * the cluster-wide file size with ll_glimpse_size() (non-blocking if
 * O_NONBLOCK) and read i_size under ll_inode_size_lock(); SEEK_CUR adds
 * f_pos.  The result is validated against ll_file_maxbytes() before
 * f_pos is updated (2.4 kernels also bump f_version).
 */
2141 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2143 struct inode *inode = file->f_dentry->d_inode;
2144 struct ll_inode_info *lli = ll_i2info(inode);
2145 struct lov_stripe_md *lsm = lli->lli_smd;
2148 retval = offset + ((origin == 2) ? inode->i_size :
2149 (origin == 1) ? file->f_pos : 0);
2150 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2151 inode->i_ino, inode->i_generation, inode, retval, retval,
2152 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2153 ll_vfs_ops_tally(ll_i2sbi(inode), VFS_OPS_SEEK);
2154 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_LLSEEK);
2156 if (origin == 2) { /* SEEK_END */
2157 int nonblock = 0, rc;
2159 if (file->f_flags & O_NONBLOCK)
2160 nonblock = LDLM_FL_BLOCK_NOWAIT;
2163 rc = ll_glimpse_size(inode, nonblock);
2168 ll_inode_size_lock(inode, 0);
2169 offset += inode->i_size;
2170 ll_inode_size_unlock(inode, 0);
2171 } else if (origin == 1) { /* SEEK_CUR */
2172 offset += file->f_pos;
2176 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2177 if (offset != file->f_pos) {
2178 file->f_pos = offset;
2179 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2181 file->f_version = ++event;
/*
 * ll_fsync(): fsync implementation.  Waits for in-flight page-cache
 * writeback, surfaces any recorded async write errors (inode-level
 * lli_async_rc plus per-stripe rc via lov_test_and_clear_async_rc),
 * syncs the MDS inode with mdc_sync(), then syncs OST data with
 * obd_sync() on an obdo describing this file.
 * NOTE(review): excerpt elides the rc/err merging between the visible
 * lines; comments describe visible calls only.
 */
2190 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2192 struct inode *inode = dentry->d_inode;
2193 struct ll_inode_info *lli = ll_i2info(inode);
2194 struct lov_stripe_md *lsm = lli->lli_smd;
2196 struct ptlrpc_request *req;
2199 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2200 inode->i_generation, inode);
2201 ll_vfs_ops_tally(ll_i2sbi(inode), VFS_OPS_FSYNC);
2202 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_FSYNC);
2204 /* fsync's caller has already called _fdata{sync,write}, we want
2205 * that IO to finish before calling the osc and mdc sync methods */
2206 rc = filemap_fdatawait(inode->i_mapping);
2208 /* catch async errors that were recorded back when async writeback
2209 * failed for pages in this mapping. */
2210 err = lli->lli_async_rc;
2211 lli->lli_async_rc = 0;
2215 err = lov_test_and_clear_async_rc(lsm);
2220 ll_inode2fid(&fid, inode);
2221 err = mdc_sync(ll_i2sbi(inode)->ll_mdc_exp, &fid, &req);
2225 ptlrpc_req_finished(req);
2228 struct obdo *oa = obdo_alloc();
2231 RETURN(rc ? rc : -ENOMEM);
2233 oa->o_id = lsm->lsm_object_id;
2234 oa->o_valid = OBD_MD_FLID;
2235 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2236 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
2238 err = obd_sync(ll_i2sbi(inode)->ll_osc_exp, oa, lsm,
/*
 * ll_file_flock(): fcntl()/flock() advisory locking via the MDS.
 * Translates the kernel file_lock (type, range, pid) into an LDLM
 * flock policy and enqueues it on the per-inode LDLM_FLOCK resource.
 * F_GETLK-style commands set LDLM_FL_TEST_LOCK; non-blocking variants
 * set LDLM_FL_BLOCK_NOWAIT.  On success the lock is mirrored into the
 * local kernel lock lists (flock wait / posix_lock_file_wait).
 * NOTE(review): excerpt elides the type/cmd case labels and mode
 * assignments between the visible lines.
 */
2248 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2250 struct inode *inode = file->f_dentry->d_inode;
2251 struct ll_sb_info *sbi = ll_i2sbi(inode);
2252 struct ldlm_res_id res_id =
2253 { .name = {inode->i_ino, inode->i_generation, LDLM_FLOCK} };
2254 struct lustre_handle lockh = {0};
2255 ldlm_policy_data_t flock;
2256 ldlm_mode_t mode = 0;
2261 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2262 inode->i_ino, file_lock);
2263 ll_vfs_ops_tally(ll_i2sbi(inode), VFS_OPS_FLOCK);
2265 if (file_lock->fl_flags & FL_FLOCK) {
2266 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2267 /* set missing params for flock() calls */
2268 file_lock->fl_end = OFFSET_MAX;
2269 file_lock->fl_pid = current->tgid;
2271 flock.l_flock.pid = file_lock->fl_pid;
2272 flock.l_flock.start = file_lock->fl_start;
2273 flock.l_flock.end = file_lock->fl_end;
2275 switch (file_lock->fl_type) {
2280 /* An unlock request may or may not have any relation to
2281 * existing locks so we may not be able to pass a lock handle
2282 * via a normal ldlm_lock_cancel() request. The request may even
2283 * unlock a byte range in the middle of an existing lock. In
2284 * order to process an unlock request we need all of the same
2285 * information that is given with a normal read or write record
2286 * lock request. To avoid creating another ldlm unlock (cancel)
2287 * message we'll treat a LCK_NL flock request as an unlock. */
2294 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2309 flags = LDLM_FL_BLOCK_NOWAIT;
2315 flags = LDLM_FL_TEST_LOCK;
2316 /* Save the old mode so that if the mode in the lock changes we
2317 * can decrement the appropriate reader or writer refcount. */
2318 file_lock->fl_type = mode;
2321 CERROR("unknown fcntl lock command: %d\n", cmd);
2325 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2326 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2327 flags, mode, flock.l_flock.start, flock.l_flock.end);
2329 rc = ldlm_cli_enqueue(sbi->ll_mdc_exp, NULL, res_id,
2330 LDLM_FLOCK, &flock, mode, &flags, NULL,
2331 ldlm_flock_completion_ast, NULL, file_lock,
2332 NULL, 0, NULL, &lockh, 0);
2333 if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2334 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2335 #ifdef HAVE_F_OP_FLOCK
2336 if ((file_lock->fl_flags & FL_POSIX) &&(rc == 0))
2337 posix_lock_file_wait(file, file_lock);
/*
 * ll_file_noflock(): lock/flock handler used when flock support is not
 * enabled; body is elided from this excerpt (presumably returns an
 * error such as -ENOSYS -- TODO confirm against the full source).
 */
2343 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * ll_have_md_lock(): test (without acquiring) whether this client holds
 * a granted MDC inodebits lock covering @bits on @inode, in any of
 * CR/CW/PR modes.  Uses LDLM_FL_TEST_LOCK so the matched lock reference
 * is not taken.  Returns nonzero on match (return paths elided).
 */
2350 int ll_have_md_lock(struct inode *inode, __u64 bits)
2352 struct lustre_handle lockh;
2353 struct ldlm_res_id res_id = { .name = {0} };
2354 struct obd_device *obddev;
2355 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2362 obddev = ll_i2mdcexp(inode)->exp_obd;
2363 res_id.name[0] = inode->i_ino;
2364 res_id.name[1] = inode->i_generation;
2366 CDEBUG(D_INFO, "trying to match res "LPU64"\n", res_id.name[0]);
2368 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2369 if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS,
2370 &policy, LCK_CR|LCK_CW|LCK_PR, &lockh)) {
/*
 * ll_inode_revalidate_fini(): normalize the result of a revalidation
 * RPC.  -ENOENT on an already-unlinked inode is treated as success
 * (nlink update elided); other failures are logged.  Regular files and
 * directories are not expected to hit the ENOENT path outside of
 * obscure races.
 */
2377 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2378 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2379 * and return success */
2381 /* This path cannot be hit for regular files unless in
2382 * case of obscure races, so no need to to validate
2384 if (!S_ISREG(inode->i_mode) &&
2385 !S_ISDIR(inode->i_mode))
2390 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * ll_inode_revalidate_it(): refresh a dentry's inode attributes from
 * the MDS.  If the server supports getattr-by-fid
 * (OBD_CONNECT_ATTRFID), an IT_GETATTR intent lock is used and the
 * dentry is unhashed if the inode turned out to be unlinked; otherwise,
 * if no covering MD lock is cached, a plain mdc_getattr() (with EA
 * sized for regular files) refreshes the inode via ll_prep_inode().
 * Finishes with a glimpse to validate size for striped files.
 * NOTE(review): excerpt elides several error branches and RETURN.
 */
2398 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2400 struct inode *inode = dentry->d_inode;
2401 struct ptlrpc_request *req = NULL;
2402 struct obd_export *exp;
2407 CERROR("REPORT THIS LINE TO PETER\n");
2410 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2411 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2412 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
2413 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_REVALIDATE);
2416 exp = ll_i2mdcexp(inode);
2418 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2419 struct lookup_intent oit = { .it_op = IT_GETATTR };
2420 struct mdc_op_data op_data;
2422 /* Call getattr by fid, so do not provide name at all. */
2423 ll_prepare_mdc_op_data(&op_data, dentry->d_parent->d_inode,
2424 dentry->d_inode, NULL, 0, 0);
2425 rc = mdc_intent_lock(exp, &op_data, NULL, 0,
2426 /* we are not interested in name
2429 ll_mdc_blocking_ast, 0);
2431 rc = ll_inode_revalidate_fini(inode, rc);
2435 rc = revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2437 ll_intent_release(&oit);
2441 /* Unlinked? Unhash dentry, so it is not picked up later by
2442 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2443 here to preserve get_cwd functionality on 2.6.
2445 if (!dentry->d_inode->i_nlink) {
2446 spin_lock(&dcache_lock);
2447 ll_drop_dentry(dentry);
2448 spin_unlock(&dcache_lock);
2451 ll_lookup_finish_locks(&oit, dentry);
2452 } else if (!ll_have_md_lock(dentry->d_inode,
2453 MDS_INODELOCK_UPDATE|MDS_INODELOCK_LOOKUP)) {
2454 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2456 obd_valid valid = OBD_MD_FLGETATTR;
2459 if (S_ISREG(inode->i_mode)) {
2460 rc = ll_get_max_mdsize(sbi, &ealen);
2463 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2465 ll_inode2fid(&fid, inode);
2466 rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req);
2468 rc = ll_inode_revalidate_fini(inode, rc);
2472 rc = ll_prep_inode(sbi->ll_osc_exp, &inode, req, REPLY_REC_OFF,
2478 /* if object not yet allocated, don't validate size */
2479 if (ll_i2info(inode)->lli_smd == NULL)
2482 /* ll_glimpse_size will prefer locally cached writes if they extend
2484 rc = ll_glimpse_size(inode, 0);
2487 ptlrpc_req_finished(req);
2491 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
/*
 * 2.6 ->getattr() worker: first revalidate the inode against the MDS
 * with the supplied intent, then copy the refreshed inode fields into
 * *stat.  i_size and i_blocks are sampled under ll_inode_size_lock()
 * so the pair is mutually consistent.
 *
 * NOTE(review): lines are elided from this listing; the handling of a
 * non-zero revalidate result before the stat fill-in is not visible.
 */
2492 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2493 struct lookup_intent *it, struct kstat *stat)
2495 struct inode *inode = de->d_inode;
2498 res = ll_inode_revalidate_it(de, it);
2499 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETATTR);
2504 stat->dev = inode->i_sb->s_dev;
2505 stat->ino = inode->i_ino;
2506 stat->mode = inode->i_mode;
2507 stat->nlink = inode->i_nlink;
2508 stat->uid = inode->i_uid;
2509 stat->gid = inode->i_gid;
2510 stat->rdev = kdev_t_to_nr(inode->i_rdev);
2511 stat->atime = inode->i_atime;
2512 stat->mtime = inode->i_mtime;
2513 stat->ctime = inode->i_ctime;
2514 #ifdef HAVE_INODE_BLKSIZE
2515 stat->blksize = inode->i_blksize;
/* kernels without i_blksize derive the block size from i_blkbits */
2517 stat->blksize = 1<<inode->i_blkbits;
/* sample size and blocks atomically w.r.t. concurrent size updates */
2520 ll_inode_size_lock(inode, 0);
2521 stat->size = inode->i_size;
2522 stat->blocks = inode->i_blocks;
2523 ll_inode_size_unlock(inode, 0);
/*
 * VFS ->getattr() entry point: account the call in the VFS-op stats and
 * delegate to ll_getattr_it() with a fresh IT_GETATTR intent.
 */
2527 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2529 struct lookup_intent it = { .it_op = IT_GETATTR };
2531 ll_vfs_ops_tally(ll_i2sbi(de->d_inode), VFS_OPS_GETATTR);
2532 return ll_getattr_it(mnt, de, &it, stat);
/*
 * ACL callback used by the permission checks below: evaluate @mask
 * against the POSIX ACL cached on the llite inode.  The cached ACL is
 * duplicated under lli_lock so it can be consulted and released after
 * the spinlock is dropped.
 *
 * NOTE(review): the !CONFIG_FS_POSIX_ACL branch and the NULL-acl case
 * are elided from this listing — presumably they return a value that
 * makes the caller fall back to the mode bits; confirm in full source.
 */
2537 int lustre_check_acl(struct inode *inode, int mask)
2539 #ifdef CONFIG_FS_POSIX_ACL
2540 struct ll_inode_info *lli = ll_i2info(inode);
2541 struct posix_acl *acl;
/* take a private reference to the cached ACL under the inode spinlock */
2545 spin_lock(&lli->lli_lock);
2546 acl = posix_acl_dup(lli->lli_posix_acl);
2547 spin_unlock(&lli->lli_lock);
2552 rc = posix_acl_permission(inode, acl, mask);
2553 posix_acl_release(acl);
2561 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
/*
 * ->permission() for kernels >= 2.6.10, which accept an ACL callback:
 * delegate the whole check to generic_permission(), passing
 * lustre_check_acl() so cached Lustre ACLs are honoured.
 */
2562 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2564 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2565 inode->i_ino, inode->i_generation, inode, mask);
2567 ll_vfs_ops_tally(ll_i2sbi(inode), VFS_OPS_INODE_PERMISSION);
2568 return generic_permission(inode, mask, lustre_check_acl);
2571 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
2572 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* 2.4-style prototype (no nameidata) — the #else separating the two
 * signatures is elided from this listing */
2574 int ll_inode_permission(struct inode *inode, int mask)
/*
 * Open-coded permission check for kernels whose generic_permission()
 * cannot take an ACL callback.  It mirrors the classic VFS algorithm:
 * read-only/immutable write denial, then owner bits, ACL, group bits,
 * other bits, and finally the DAC capability overrides.
 *
 * NOTE(review): lines are elided (the embedded numbering jumps); the
 * early-return statements between these conditions are not visible.
 */
2577 int mode = inode->i_mode;
2580 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2581 inode->i_ino, inode->i_generation, inode, mask);
2582 ll_vfs_ops_tally(ll_i2sbi(inode), VFS_OPS_INODE_PERMISSION);
/* deny writes to files/dirs/symlinks on a read-only filesystem */
2584 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2585 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
/* deny writes to immutable inodes */
2587 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
/* owner: check the user bits (mode shifted so they align with S_IRWXO) */
2589 if (current->fsuid == inode->i_uid) {
2592 if (((mode >> 3) & mask & S_IRWXO) != mask)
/* not the owner: consult the cached POSIX ACL first */
2594 rc = lustre_check_acl(inode, mask);
2598 goto check_capabilities;
/* group membership: fall through to the group permission bits */
2602 if (in_group_p(inode->i_gid))
/* "other" bits as the last mode-based check */
2605 if ((mode & mask & S_IRWXO) == mask)
/* CAP_DAC_OVERRIDE: allowed unless this is an exec of a file with no
 * exec bits at all (directories are always searchable/executable) */
2609 if (!(mask & MAY_EXEC) ||
2610 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2611 if (capable(CAP_DAC_OVERRIDE))
/* CAP_DAC_READ_SEARCH: read-only access, or any non-write dir access */
2614 if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2615 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
/*
 * Default llite file operations table (used when the client is mounted
 * without flock support).  .flock/.lock point at ll_file_noflock — the
 * name suggests lock requests are refused on such mounts; confirm
 * against ll_file_noflock's definition.  Only one of .flock/.lock is
 * compiled in, selected by HAVE_F_OP_FLOCK.
 */
2622 struct file_operations ll_file_operations = {
2623 .read = ll_file_read,
2624 .write = ll_file_write,
2625 .ioctl = ll_file_ioctl,
2626 .open = ll_file_open,
2627 .release = ll_file_release,
2628 .mmap = ll_file_mmap,
2629 .llseek = ll_file_seek,
2630 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2631 .sendfile = ll_file_sendfile,
2634 #ifdef HAVE_F_OP_FLOCK
2635 .flock = ll_file_noflock,
2637 .lock = ll_file_noflock
/*
 * File operations table for flock-enabled mounts: identical to
 * ll_file_operations above except that .flock/.lock dispatch to
 * ll_file_flock instead of ll_file_noflock.
 */
2640 struct file_operations ll_file_operations_flock = {
2641 .read = ll_file_read,
2642 .write = ll_file_write,
2643 .ioctl = ll_file_ioctl,
2644 .open = ll_file_open,
2645 .release = ll_file_release,
2646 .mmap = ll_file_mmap,
2647 .llseek = ll_file_seek,
2648 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2649 .sendfile = ll_file_sendfile,
2652 #ifdef HAVE_F_OP_FLOCK
2653 .flock = ll_file_flock,
2655 .lock = ll_file_flock
2659 struct inode_operations ll_file_inode_operations = {
2660 #ifdef LUSTRE_KERNEL_VERSION
2661 .setattr_raw = ll_setattr_raw,
2663 .setattr = ll_setattr,
2664 .truncate = ll_truncate,
2665 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
2666 .getattr = ll_getattr,
2668 .revalidate_it = ll_inode_revalidate_it,
2670 .permission = ll_inode_permission,
2671 .setxattr = ll_setxattr,
2672 .getxattr = ll_getxattr,
2673 .listxattr = ll_listxattr,
2674 .removexattr = ll_removexattr,