1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <linux/pagemap.h>
29 #include <linux/file.h>
30 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
31 #include <linux/lustre_compat25.h>
33 #include "llite_internal.h"
35 /* also used by llite/special.c:ll_special_open() */
/* Allocate per-open-file private data from the dedicated slab cache.
 * NOTE(review): this capture is elided (braces and the return statement
 * are not visible); code lines kept byte-identical. */
36 struct ll_file_data *ll_file_data_get(void)
38 struct ll_file_data *fd;
40 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Release per-open-file private data back to the slab cache.
 * Counterpart of ll_file_data_get(). */
44 static void ll_file_data_put(struct ll_file_data *fd)
47 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Close an MDS open handle for @inode: pack the inode attributes into an
 * obdo, flag uncommitted writes, send the mdc_close() RPC, destroy any
 * OST objects named in the reply, then free the request and clear open
 * replay data.  NOTE(review): capture elided — error-path braces and some
 * cleanup lines are not visible; code kept byte-identical. */
50 static int ll_close_inode_openhandle(struct inode *inode,
51 struct obd_client_handle *och)
53 struct ptlrpc_request *req = NULL;
54 struct obd_device *obd;
59 obd = class_exp2obd(ll_i2mdcexp(inode));
61 CERROR("Invalid MDC connection handle "LPX64"\n",
62 ll_i2mdcexp(inode)->exp_handle.h_cookie);
67 * here we check if this is forced umount. If so this is called on
68 * canceling "open lock" and we do not call mdc_close() in this case, as
69 * it will not be successful, as import is already deactivated.
76 RETURN(-ENOMEM); // XXX We leak openhandle and request here.
/* Fill the obdo with the identity and current attributes of the inode
 * so the MDS sees the client's view at close time. */
78 oa->o_id = inode->i_ino;
79 oa->o_valid = OBD_MD_FLID;
80 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
81 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
82 OBD_MD_FLATIME | OBD_MD_FLMTIME |
/* Tell the MDS there are still unflushed writes for this inode. */
84 if (ll_is_inode_dirty(inode)) {
85 oa->o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
86 oa->o_valid |= OBD_MD_FLFLAGS;
89 rc = mdc_close(ll_i2mdcexp(inode), oa, och, &req);
91 /* We are the last writer, so the MDS has instructed us to get
92 * the file size and any write cookies, then close again. */
93 ll_queue_done_writing(inode);
96 CERROR("inode %lu mdc close failed: rc = %d\n",
103 rc = ll_objects_destroy(req, inode);
105 CERROR("inode %lu ll_objects destroy: rc = %d\n",
109 ptlrpc_req_finished(req); /* This is close request */
112 mdc_clear_open_replay_data(och);
/* Drop the cached MDS open handle for one open mode (write/exec/read)
 * on @inode, but only when its per-mode use count has reached zero.
 * Handle selection and the use-count check are serialized by
 * lli_och_sem.  NOTE(review): capture elided — the code that swaps
 * *och_p out under the semaphore is not fully visible. */
117 int ll_mdc_real_close(struct inode *inode, int flags)
119 struct ll_inode_info *lli = ll_i2info(inode);
121 struct obd_client_handle **och_p;
122 struct obd_client_handle *och;
/* Pick the handle slot and use counter that match the open mode. */
127 if (flags & FMODE_WRITE) {
128 och_p = &lli->lli_mds_write_och;
129 och_usecount = &lli->lli_open_fd_write_count;
130 } else if (flags & FMODE_EXEC) {
131 och_p = &lli->lli_mds_exec_och;
132 och_usecount = &lli->lli_open_fd_exec_count;
134 LASSERT(flags & FMODE_READ);
135 och_p = &lli->lli_mds_read_och;
136 och_usecount = &lli->lli_open_fd_read_count;
139 down(&lli->lli_och_sem);
140 if (*och_usecount) { /* There are still users of this handle, so
142 up(&lli->lli_och_sem);
147 up(&lli->lli_och_sem);
149 if (och) { /* There might be a race and somebody have freed this och
151 rc = ll_close_inode_openhandle(inode, och);
/* Poison the cookie before freeing so stale users are detectable. */
152 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
153 OBD_FREE(och, sizeof *och);
/* Per-file-descriptor close: release a group lock if one is held,
 * decrement the per-mode MDS open-handle use count, and issue the real
 * MDS close only when no cached OPEN DLM lock still covers the file
 * (probed with LDLM_FL_TEST_LOCK, so the match takes no reference).
 * Finally detaches and frees the ll_file_data.
 * NOTE(review): capture elided — 'lockmode' setup and several closing
 * braces are not visible; code kept byte-identical. */
159 int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
162 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
163 struct ll_inode_info *lli = ll_i2info(inode);
167 /* clear group lock, if present */
168 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
169 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
170 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
171 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
175 /* Let's see if we have good enough OPEN lock on the file and if
176 we can skip talking to MDS */
177 if (file->f_dentry->d_inode) { /* Can this ever be false? */
179 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
180 struct lustre_handle lockh;
181 struct inode *inode = file->f_dentry->d_inode;
182 struct ldlm_res_id file_res_id = {.name={inode->i_ino,
183 inode->i_generation}};
184 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop this fd's contribution to the per-mode open count. */
186 down(&lli->lli_och_sem);
187 if (fd->fd_omode & FMODE_WRITE) {
189 LASSERT(lli->lli_open_fd_write_count);
190 lli->lli_open_fd_write_count--;
191 } else if (fd->fd_omode & FMODE_EXEC) {
193 LASSERT(lli->lli_open_fd_exec_count);
194 lli->lli_open_fd_exec_count--;
197 LASSERT(lli->lli_open_fd_read_count);
198 lli->lli_open_fd_read_count--;
200 up(&lli->lli_och_sem);
/* No matching OPEN lock cached: must really close on the MDS now. */
202 if (!ldlm_lock_match(mdc_exp->exp_obd->obd_namespace, flags,
203 &file_res_id, LDLM_IBITS, &policy,lockmode,
205 rc = ll_mdc_real_close(file->f_dentry->d_inode,
209 CERROR("Releasing a file %p with negative dentry %p. Name %s",
210 file, file->f_dentry, file->f_dentry->d_name.name);
213 LUSTRE_FPRIVATE(file) = NULL;
214 ll_file_data_put(fd);
219 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
221 /* While this returns an error code, fput() the caller does not, so we need
222 * to make every effort to clean up all of our state here. Also, applications
223 * rarely check close errors and even if an error is returned they will not
224 * re-try the close call.
/* VFS ->release() hook: per-fd teardown at last close of this struct
 * file.  Skips the root dentry, stops directory statahead, clears any
 * pending async write error on the stripes, then funnels the MDS-side
 * close through ll_mdc_close().  NOTE(review): capture elided — the
 * early-return for "/" and the final RETURN are not visible. */
226 int ll_file_release(struct inode *inode, struct file *file)
228 struct ll_file_data *fd;
229 struct ll_sb_info *sbi = ll_i2sbi(inode);
230 struct ll_inode_info *lli = ll_i2info(inode);
231 struct lov_stripe_md *lsm = lli->lli_smd;
235 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
236 inode->i_generation, inode);
238 if (S_ISDIR(inode->i_mode))
239 ll_stop_statahead(inode);
241 /* don't do anything for / */
242 if (inode->i_sb->s_root == file->f_dentry)
245 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
246 fd = LUSTRE_FPRIVATE(file);
/* Forget any stored async I/O error; close reports its own status. */
250 lov_test_and_clear_async_rc(lsm);
251 lli->lli_async_rc = 0;
253 rc = ll_mdc_close(sbi->ll_mdc_exp, inode, file);
/* Send an IT_OPEN intent to the MDS for @file (used when no open intent
 * was carried down from lookup, e.g. NFSD or patchless-kernel paths).
 * Requests an OPEN lock unless only stripe info is being set
 * (lmm/lmmsize non-zero), updates the inode from the reply, and stores
 * DLM lock data on success.  NOTE(review): capture elided — GOTO/exit
 * labels and the final RETURN are not visible. */
257 static int ll_intent_file_open(struct file *file, void *lmm,
258 int lmmsize, struct lookup_intent *itp)
260 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
261 struct mdc_op_data data;
262 struct dentry *parent = file->f_dentry->d_parent;
263 const char *name = file->f_dentry->d_name.name;
264 const int len = file->f_dentry->d_name.len;
265 struct inode *inode = file->f_dentry->d_inode;
266 struct ptlrpc_request *req;
273 ll_prepare_mdc_op_data(&data, parent->d_inode, inode,
274 name, len, O_RDWR, NULL);
276 /* Usually we come here only for NFSD, and we want open lock.
277 But we can also get here with pre 2.6.15 patchless kernels, and in
278 that case that lock is also ok */
279 /* We can also get here if there was cached open handle in revalidate_it
280 * but it disappeared while we were getting from there to ll_file_open.
281 * But this means this file was closed and immediatelly opened which
282 * makes a good candidate for using OPEN lock */
283 /* If lmmsize & lmm are not 0, we are just setting stripe info
284 * parameters. No need for the open lock */
285 if (!lmm && !lmmsize)
286 itp->it_flags |= MDS_OPEN_LOCK;
288 rc = mdc_intent_lock(sbi->ll_mdc_exp, &data, lmm, lmmsize, itp,
289 0 /*unused */, &req, ll_mdc_blocking_ast, 0);
291 /* reason for keep own exit path - don`t flood log
292 * with messages with -ESTALE errors.
/* Intent failed or open was refused: drop any server open handle we
 * may have been granted before reporting the error. */
294 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
295 it_open_error(DISP_OPEN_OPEN, itp))
297 ll_release_openhandle(file->f_dentry, itp);
301 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
302 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
303 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
307 if (itp->d.lustre.it_lock_mode)
308 mdc_set_lock_data(&itp->d.lustre.it_lock_handle,
/* Refresh the in-core inode from the MDS reply body. */
311 rc = ll_prep_inode(sbi->ll_osc_exp, &file->f_dentry->d_inode,
312 req, DLM_REPLY_REC_OFF, NULL);
314 ptlrpc_req_finished(itp->d.lustre.it_data);
317 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
318 ll_intent_drop_lock(itp);
/* Populate an obd_client_handle from the MDS open reply attached to the
 * intent: copy the server file handle, mark the handle valid with the
 * magic, record the I/O epoch on the inode, and register the request
 * for open replay. */
324 static void ll_och_fill(struct ll_inode_info *lli, struct lookup_intent *it,
325 struct obd_client_handle *och)
327 struct ptlrpc_request *req = it->d.lustre.it_data;
328 struct mds_body *body;
332 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
333 LASSERT(body != NULL); /* reply already checked out */
334 LASSERT_REPSWABBED(req, DLM_REPLY_REC_OFF); /* and swabbed in mdc_enqueue */
336 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
337 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
338 lli->lli_io_epoch = body->io_epoch;
340 mdc_set_open_replay_data(och, it->d.lustre.it_data);
/* Attach client-side open state to @file: fill @och from the intent
 * (when supplied), store @fd as the file's private data, initialize
 * readahead state and record the open mode.  Asserts the file had no
 * private data yet.  NOTE(review): capture elided — the conditional
 * around ll_och_fill() and the RETURN are not visible. */
343 int ll_local_open(struct file *file, struct lookup_intent *it,
344 struct ll_file_data *fd, struct obd_client_handle *och)
348 LASSERT(!LUSTRE_FPRIVATE(file));
353 ll_och_fill(ll_i2info(file->f_dentry->d_inode), it, och);
354 LUSTRE_FPRIVATE(file) = fd;
355 ll_readahead_init(file->f_dentry->d_inode, &fd->fd_ras);
356 fd->fd_omode = it->it_flags;
361 /* Open a file, and (for the very first open) create objects on the OSTs at
362 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
363 * creation or open until ll_lov_setstripe() ioctl is called. We grab
364 * lli_open_sem to ensure no other process will create objects, send the
365 * stripe MD to the MDS, or try to destroy the objects if that fails.
367 * If we already have the stripe MD locally then we don't request it in
368 * mdc_open(), by passing a lmm_size = 0.
370 * It is up to the application to ensure no other processes open this file
371 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
372 * used. We might be able to avoid races of that sort by getting lli_open_sem
373 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
374 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() hook.  Either reuses a cached per-mode MDS open handle
 * (lli_mds_{read,write,exec}_och, guarded by lli_och_sem) or performs a
 * new intent open via ll_intent_file_open(), then wires up local state
 * with ll_local_open().  Object creation on the OSTs may be delayed by
 * O_LOV_DELAY_CREATE (see comment above this function in the original
 * file).  NOTE(review): capture heavily elided — error labels
 * (out_och_free), several braces and RETURNs are not visible; code
 * kept byte-identical. */
376 int ll_file_open(struct inode *inode, struct file *file)
378 struct ll_inode_info *lli = ll_i2info(inode);
379 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
380 .it_flags = file->f_flags };
381 struct lov_stripe_md *lsm;
382 struct ptlrpc_request *req = NULL;
383 struct obd_client_handle **och_p;
385 struct ll_file_data *fd;
389 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
390 inode->i_generation, inode, file->f_flags);
/* Remember which process opened the directory (statahead bookkeeping). */
392 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_pid == 0)
393 lli->lli_opendir_pid = current->pid;
395 /* don't do anything for / */
396 if (inode->i_sb->s_root == file->f_dentry)
399 #ifdef HAVE_VFS_INTENT_PATCHES
402 it = file->private_data; /* XXX: compat macro */
403 file->private_data = NULL; /* prevent ll_local_open assertion */
406 fd = ll_file_data_get();
408 lli->lli_opendir_pid = 0;
/* No usable intent was passed down: synthesize one from f_flags. */
411 if (!it || !it->d.lustre.it_disposition) {
412 /* Convert f_flags into access mode. We cannot use file->f_mode,
413 * because everything but O_ACCMODE mask was stripped from it */
414 if ((oit.it_flags + 1) & O_ACCMODE)
416 if (file->f_flags & O_TRUNC)
417 oit.it_flags |= FMODE_WRITE;
419 /* kernel only call f_op->open in dentry_open. filp_open calls
420 * dentry_open after call to open_namei that checks permissions.
421 * Only nfsd_open call dentry_open directly without checking
422 * permissions and because of that this code below is safe. */
423 if (oit.it_flags & FMODE_WRITE)
424 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
426 /* We do not want O_EXCL here, presumably we opened the file
427 * already? XXX - NFS implications? */
428 oit.it_flags &= ~O_EXCL;
433 /* Let's see if we have file open on MDS already. */
434 if (it->it_flags & FMODE_WRITE) {
435 och_p = &lli->lli_mds_write_och;
436 och_usecount = &lli->lli_open_fd_write_count;
437 } else if (it->it_flags & FMODE_EXEC) {
438 och_p = &lli->lli_mds_exec_och;
439 och_usecount = &lli->lli_open_fd_exec_count;
441 och_p = &lli->lli_mds_read_och;
442 och_usecount = &lli->lli_open_fd_read_count;
445 LASSERTF(it->it_flags != 0, "it %p dist %d \n", it,
446 it->d.lustre.it_disposition);
448 down(&lli->lli_och_sem);
449 if (*och_p) { /* Open handle is present */
450 if (it_disposition(it, DISP_OPEN_OPEN)) {
451 /* Well, there's extra open request that we do not need,
452 let's close it somehow. This will decref request. */
453 rc = it_open_error(DISP_OPEN_OPEN, it);
455 ll_file_data_put(fd);
456 GOTO(out_och_free, rc);
458 ll_release_openhandle(file->f_dentry, it);
459 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* Reuse the cached handle: och == NULL tells ll_local_open not to
 * fill a new one. */
464 rc = ll_local_open(file, it, fd, NULL);
466 LASSERTF(rc == 0, "rc = %d\n", rc);
468 LASSERT(*och_usecount == 0);
469 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
471 ll_file_data_put(fd);
472 GOTO(out_och_free, rc = -ENOMEM);
475 if (!it->d.lustre.it_disposition) {
476 rc = ll_intent_file_open(file, NULL, 0, it);
478 ll_file_data_put(fd);
479 GOTO(out_och_free, rc);
482 /* Got some error? Release the request */
483 if (it->d.lustre.it_status < 0) {
484 req = it->d.lustre.it_data;
485 ptlrpc_req_finished(req);
487 mdc_set_lock_data(&it->d.lustre.it_lock_handle,
488 file->f_dentry->d_inode);
490 req = it->d.lustre.it_data;
492 /* mdc_intent_lock() didn't get a request ref if there was an
493 * open error, so don't do cleanup on the request here
495 /* XXX (green): Should not we bail out on any error here, not
496 * just open error? */
497 rc = it_open_error(DISP_OPEN_OPEN, it);
499 ll_file_data_put(fd);
500 GOTO(out_och_free, rc);
503 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
504 rc = ll_local_open(file, it, fd, *och_p);
505 LASSERTF(rc == 0, "rc = %d\n", rc);
507 up(&lli->lli_och_sem);
509 /* Must do this outside lli_och_sem lock to prevent deadlock where
510 different kind of OPEN lock for this same inode gets cancelled
511 by ldlm_cancel_lru */
512 if (!S_ISREG(inode->i_mode))
517 if (file->f_flags & O_LOV_DELAY_CREATE ||
518 !(file->f_mode & FMODE_WRITE)) {
519 CDEBUG(D_INODE, "object creation was delayed\n");
523 file->f_flags &= ~O_LOV_DELAY_CREATE;
526 ptlrpc_req_finished(req);
528 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
530 ll_open_complete(inode);
534 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
535 *och_p = NULL; /* OBD_FREE writes some magic there */
538 up(&lli->lli_och_sem);
539 lli->lli_opendir_pid = 0;
544 /* Fills the obdo with the attributes for the inode defined by lsm */
/* Fetch object attributes for all stripes of @lsm from the OSTs via an
 * async getattr gathered on a ptlrpc request set, then mask o_valid to
 * the size/blocks/time fields actually produced by the getattr.
 * NOTE(review): capture elided — the 'oa' parameter declaration and
 * oinfo setup lines are not visible. */
545 int ll_lsm_getattr(struct obd_export *exp, struct lov_stripe_md *lsm,
548 struct ptlrpc_request_set *set;
549 struct obd_info oinfo = { { { 0 } } };
553 LASSERT(lsm != NULL);
555 memset(oa, 0, sizeof *oa);
558 oa->o_id = lsm->lsm_object_id;
559 oa->o_mode = S_IFREG;
560 oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
561 OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
564 set = ptlrpc_prep_set();
568 rc = obd_getattr_async(exp, &oinfo, set);
570 rc = ptlrpc_set_wait(set);
571 ptlrpc_set_destroy(set);
/* Only advertise the fields a stripe getattr can legitimately return. */
576 oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
577 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
/* Strip setuid (and setgid when group-exec is set) from the inode mode
 * on write, unless the caller holds CAP_FSETID — mirrors the kernel's
 * remove_suid() behavior.  NOTE(review): capture elided — the 'mode'
 * declaration and the mark-dirty tail are not visible. */
581 static inline void ll_remove_suid(struct inode *inode)
585 /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
586 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
588 /* was any of the uid bits set? */
589 mode &= inode->i_mode;
590 if (mode && !capable(CAP_FSETID)) {
591 inode->i_mode &= ~mode;
592 // XXX careful here - we cannot change the size
/* Map a DLM extent lock back to the stripe index it covers within the
 * inode's LOV layout, via obd_get_info("lock_to_stripe").  Sanity-checks
 * that the lock's resource name matches the stripe's object id/group;
 * returns -ELDLM_NO_LOCK_DATA on mismatch.  NOTE(review): capture
 * elided — the key struct opening, 'check:' label and final RETURN are
 * not visible. */
596 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
598 struct ll_inode_info *lli = ll_i2info(inode);
599 struct lov_stripe_md *lsm = lli->lli_smd;
600 struct obd_export *exp = ll_i2obdexp(inode);
603 struct ldlm_lock *lock;
604 struct lov_stripe_md *lsm;
605 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
606 __u32 stripe, vallen = sizeof(stripe);
/* Single-striped file: the answer is trivially stripe 0. */
610 if (lsm->lsm_stripe_count == 1)
611 GOTO(check, stripe = 0);
613 /* get our offset in the lov */
614 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
616 CERROR("obd_get_info: rc = %d\n", rc);
619 LASSERT(stripe < lsm->lsm_stripe_count);
622 if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
623 lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[1]){
624 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
625 lsm->lsm_oinfo[stripe]->loi_id,
626 lsm->lsm_oinfo[stripe]->loi_gr);
627 RETURN(-ELDLM_NO_LOCK_DATA);
633 /* Flush the page cache for an extent as its canceled. When we're on an LOV,
634 * we get a lock cancellation for each stripe, so we have to map the obd's
635 * region back onto the stripes in the file that it held.
637 * No one can dirty the extent until we've finished our work and they can
638 * enqueue another lock. The DLM protects us from ll_file_read/write here,
639 * but other kernel actors could have pages locked.
641 * Called with the DLM lock held. */
/* Evict (write back or discard) the page-cache pages of @inode covered
 * by a cancelled extent lock on one stripe: maps the per-object extent
 * onto file page indices, tears down intersecting mmaps, then walks the
 * pages — writing dirty ones unless LDLM_FL_DISCARD_DATA — and truncates
 * each page not protected by another matching DLM lock (b=2765).
 * NOTE(review): capture elided — loop braces and continue/break paths
 * are not fully visible; code kept byte-identical. */
642 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
643 struct ldlm_lock *lock, __u32 stripe)
645 ldlm_policy_data_t tmpex;
646 unsigned long start, end, count, skip, i, j;
648 int rc, rc2, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
649 struct lustre_handle lockh;
652 memcpy(&tmpex, &lock->l_policy_data, sizeof(tmpex));
653 CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
654 inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
657 /* our locks are page granular thanks to osc_enqueue, we invalidate the
659 if ((tmpex.l_extent.start & ~CFS_PAGE_MASK) != 0 ||
660 ((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) != 0)
661 LDLM_ERROR(lock, "lock not aligned on CFS_PAGE_SIZE %lu", CFS_PAGE_SIZE);
662 LASSERT((tmpex.l_extent.start & ~CFS_PAGE_MASK) == 0);
663 LASSERT(((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) == 0);
/* Translate the stripe-local byte extent into file page indices,
 * accounting for the round-robin stripe layout when striped >1. */
667 start = tmpex.l_extent.start >> CFS_PAGE_SHIFT;
668 end = tmpex.l_extent.end >> CFS_PAGE_SHIFT;
669 if (lsm->lsm_stripe_count > 1) {
670 count = lsm->lsm_stripe_size >> CFS_PAGE_SHIFT;
671 skip = (lsm->lsm_stripe_count - 1) * count;
672 start += start/count * skip + stripe * count;
674 end += end/count * skip + stripe * count;
676 if (end < tmpex.l_extent.end >> CFS_PAGE_SHIFT)
679 i = i_size_read(inode) ? (i_size_read(inode) - 1) >> CFS_PAGE_SHIFT : 0;
683 CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
684 "count: %lu skip: %lu end: %lu%s\n", start, start % count,
685 count, skip, end, discard ? " (DISCARDING)" : "");
687 /* walk through the vmas on the inode and tear down mmaped pages that
688 * intersect with the lock. this stops immediately if there are no
689 * mmap()ed regions of the file. This is not efficient at all and
690 * should be short lived. We'll associate mmap()ed pages with the lock
691 * and will be able to find them directly */
692 for (i = start; i <= end; i += (j + skip)) {
693 j = min(count - (i % count), end - i + 1);
695 LASSERT(inode->i_mapping);
696 if (ll_teardown_mmaps(inode->i_mapping,
697 (__u64)i << CFS_PAGE_SHIFT,
698 ((__u64)(i+j) << CFS_PAGE_SHIFT) - 1) )
702 /* this is the simplistic implementation of page eviction at
703 * cancelation. It is careful to get races with other page
704 * lockers handled correctly. fixes from bug 20 will make it
705 * more efficient by associating locks with pages and with
706 * batching writeback under the lock explicitly. */
707 for (i = start, j = start % count; i <= end;
708 j++, i++, tmpex.l_extent.start += CFS_PAGE_SIZE) {
710 CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
716 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
717 LPU64" >= "LPU64" start %lu i %lu end %lu\n",
718 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
721 if (!mapping_has_pages(inode->i_mapping)) {
722 CDEBUG(D_INODE|D_PAGE, "nothing left\n");
728 page = find_get_page(inode->i_mapping, i);
731 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
732 i, tmpex.l_extent.start);
735 /* page->mapping to check with racing against teardown */
736 if (!discard && clear_page_dirty_for_io(page)) {
737 rc = ll_call_writepage(inode, page);
739 CERROR("writepage of page %p failed: %d\n",
741 /* either waiting for io to complete or reacquiring
742 * the lock that the failed writepage released */
746 tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1;
747 /* check to see if another DLM lock covers this page b=2765 */
748 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
749 LDLM_FL_BLOCK_GRANTED|LDLM_FL_CBPENDING |
751 &lock->l_resource->lr_name, LDLM_EXTENT,
752 &tmpex, LCK_PR | LCK_PW, &lockh);
753 if (rc2 == 0 && page->mapping != NULL) {
754 struct ll_async_page *llap = llap_cast_private(page);
755 // checking again to account for writeback's lock_page()
756 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
758 ll_ra_accounting(llap, inode->i_mapping);
759 ll_truncate_complete_page(page);
762 page_cache_release(page);
764 LASSERTF(tmpex.l_extent.start <=
765 (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
766 lock->l_policy_data.l_extent.end + 1),
767 "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
768 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
/* DLM blocking/cancel callback for client extent locks.  On BLOCKING it
 * cancels the lock; on CANCELING it flushes the covered page cache via
 * ll_pgcache_remove_extent() and shrinks the stripe's known-minimum
 * size (kms) under both the lsm lock and the DLM resource lock.
 * NOTE(review): capture elided — the switch() opening, 'flag' parameter
 * and break/iput paths are not visible. */
773 static int ll_extent_lock_callback(struct ldlm_lock *lock,
774 struct ldlm_lock_desc *new, void *data,
777 struct lustre_handle lockh = { 0 };
/* Guard against a stale/garbage cbdata pointer (small non-NULL value). */
781 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
782 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
787 case LDLM_CB_BLOCKING:
788 ldlm_lock2handle(lock, &lockh);
789 rc = ldlm_cli_cancel(&lockh);
791 CERROR("ldlm_cli_cancel failed: %d\n", rc);
793 case LDLM_CB_CANCELING: {
795 struct ll_inode_info *lli;
796 struct lov_stripe_md *lsm;
800 /* This lock wasn't granted, don't try to evict pages */
801 if (lock->l_req_mode != lock->l_granted_mode)
804 inode = ll_inode_from_lock(lock);
807 lli = ll_i2info(inode);
810 if (lli->lli_smd == NULL)
814 stripe = ll_lock_to_stripe_offset(inode, lock);
818 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
/* Recompute this stripe's kms with the cancelled lock excluded. */
820 lov_stripe_lock(lsm);
821 lock_res_and_lock(lock);
822 kms = ldlm_extent_shift_kms(lock,
823 lsm->lsm_oinfo[stripe]->loi_kms);
825 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
826 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
827 lsm->lsm_oinfo[stripe]->loi_kms, kms);
828 lsm->lsm_oinfo[stripe]->loi_kms = kms;
829 unlock_res_and_lock(lock);
830 lov_stripe_unlock(lsm);
831 ll_try_done_writing(inode);
/* Completion AST for client-side async extent enqueues.  Blocked states
 * are not expected (LBUG).  On grant, folds the lock-value block (LVB)
 * size from the server into the stripe's rss/kms under the resource and
 * inode-size locks, wakes waiters and drops the PR reference.
 * NOTE(review): capture elided; also note lsm_oinfo[stripe].loi_* uses
 * '.' here while other functions in this file use '->' — likely a
 * version skew in this capture, confirm against the real tree. */
844 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
846 /* XXX ALLOCATE - 160 bytes */
847 struct inode *inode = ll_inode_from_lock(lock);
848 struct ll_inode_info *lli = ll_i2info(inode);
849 struct lustre_handle lockh = { 0 };
854 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
855 LDLM_FL_BLOCK_CONV)) {
856 LBUG(); /* not expecting any blocked async locks yet */
857 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
859 ldlm_lock_dump(D_OTHER, lock, 0);
860 ldlm_reprocess_all(lock->l_resource);
864 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
866 stripe = ll_lock_to_stripe_offset(inode, lock);
870 if (lock->l_lvb_len) {
871 struct lov_stripe_md *lsm = lli->lli_smd;
873 lvb = lock->l_lvb_data;
874 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
876 lock_res_and_lock(lock);
877 ll_inode_size_lock(inode, 1);
878 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
879 kms = ldlm_extent_shift_kms(NULL, kms);
880 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
881 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
882 lsm->lsm_oinfo[stripe].loi_kms, kms);
883 lsm->lsm_oinfo[stripe].loi_kms = kms;
884 ll_inode_size_unlock(inode, 1);
885 unlock_res_and_lock(lock);
890 wake_up(&lock->l_waitq);
892 ldlm_lock2handle(lock, &lockh);
893 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: a peer wants this client's view of the file size.  Packs
 * an LVB (kms of the stripe the lock maps to, plus m/a/ctime) into the
 * reply.  -ELDLM_NO_LOCK_DATA races get an empty reply rather than a
 * console error.  NOTE(review): capture elided — the NULL checks that
 * precede the GOTO(iput,...) lines and the out/iput labels are not
 * visible. */
898 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
900 struct ptlrpc_request *req = reqp;
901 struct inode *inode = ll_inode_from_lock(lock);
902 struct ll_inode_info *lli;
903 struct lov_stripe_md *lsm;
906 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
910 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
911 lli = ll_i2info(inode);
913 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
916 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
918 /* First, find out which stripe index this lock corresponds to. */
919 stripe = ll_lock_to_stripe_offset(inode, lock);
921 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
923 rc = lustre_pack_reply(req, 2, size, NULL);
925 CERROR("lustre_pack_reply: %d\n", rc);
/* Report this stripe's known-minimum size and the inode times. */
929 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
930 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
931 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
932 lvb->lvb_atime = LTIME_S(inode->i_atime);
933 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
935 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
936 " atime "LPU64", mtime "LPU64", ctime "LPU64,
937 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
938 lvb->lvb_atime, lvb->lvb_ctime);
943 /* These errors are normal races, so we don't want to fill the console
944 * with messages by calling ptlrpc_error() */
945 if (rc == -ELDLM_NO_LOCK_DATA)
946 lustre_pack_reply(req, 1, NULL, NULL);
/* Glimpse-style stat for an arbitrary lsm (ioctl path): enqueue a PR
 * intent (LDLM_FL_HAS_INTENT) over [0, EOF] via the rqset interface and
 * fill the caller's stat-like structure from the merged LVB under the
 * stripe lock.  NOTE(review): capture elided — the 'st' parameter
 * declaration, lvb local and final RETURN are not visible. */
952 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
955 struct lustre_handle lockh = { 0 };
956 struct ldlm_enqueue_info einfo = { 0 };
957 struct obd_info oinfo = { { { 0 } } };
963 einfo.ei_type = LDLM_EXTENT;
964 einfo.ei_mode = LCK_PR;
965 einfo.ei_cb_bl = ll_extent_lock_callback;
966 einfo.ei_cb_cp = ldlm_completion_ast;
967 einfo.ei_cb_gl = ll_glimpse_callback;
968 einfo.ei_cbdata = NULL;
970 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
971 oinfo.oi_lockh = &lockh;
973 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
975 rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
979 CERROR("obd_enqueue returned rc %d, "
980 "returning -EIO\n", rc);
981 RETURN(rc > 0 ? -EIO : rc);
/* Merge per-stripe LVBs into one coherent size/blocks/time view. */
984 lov_stripe_lock(lsm);
985 memset(&lvb, 0, sizeof(lvb));
986 obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 0);
987 st->st_size = lvb.lvb_size;
988 st->st_blocks = lvb.lvb_blocks;
989 st->st_mtime = lvb.lvb_mtime;
990 st->st_atime = lvb.lvb_atime;
991 st->st_ctime = lvb.lvb_ctime;
992 lov_stripe_unlock(lsm);
997 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
998 * file (because it prefers KMS over RSS when larger) */
/* Refresh this inode's size/blocks/times by glimpsing the OSTs: a PR
 * enqueue over [0, EOF] with LDLM_FL_HAS_INTENT triggers
 * ll_glimpse_callback() on lock holders instead of revoking their
 * locks; the merged LVB is then written into the inode under the
 * inode-size lock.  Files with no objects are handled early.
 * NOTE(review): capture elided — RETURNs and some braces not visible. */
999 int ll_glimpse_size(struct inode *inode, int ast_flags)
1001 struct ll_inode_info *lli = ll_i2info(inode);
1002 struct ll_sb_info *sbi = ll_i2sbi(inode);
1003 struct lustre_handle lockh = { 0 };
1004 struct ldlm_enqueue_info einfo = { 0 };
1005 struct obd_info oinfo = { { { 0 } } };
1010 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1012 if (!lli->lli_smd) {
1013 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1017 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1018 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1019 * won't revoke any conflicting DLM locks held. Instead,
1020 * ll_glimpse_callback() will be called on each client
1021 * holding a DLM lock against this file, and resulting size
1022 * will be returned for each stripe. DLM lock on [0, EOF] is
1023 * acquired only if there were no conflicting locks. */
1024 einfo.ei_type = LDLM_EXTENT;
1025 einfo.ei_mode = LCK_PR;
1026 einfo.ei_cb_bl = ll_extent_lock_callback;
1027 einfo.ei_cb_cp = ldlm_completion_ast;
1028 einfo.ei_cb_gl = ll_glimpse_callback;
1029 einfo.ei_cbdata = inode;
1031 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1032 oinfo.oi_lockh = &lockh;
1033 oinfo.oi_md = lli->lli_smd;
1034 oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
1036 rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
1040 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1041 RETURN(rc > 0 ? -EIO : rc);
/* Publish the merged stripe view into the VFS inode atomically with
 * respect to other size updaters. */
1044 ll_inode_size_lock(inode, 1);
1045 inode_init_lvb(inode, &lvb);
1046 rc = obd_merge_lvb(sbi->ll_osc_exp, lli->lli_smd, &lvb, 0);
1047 i_size_write(inode, lvb.lvb_size);
1048 inode->i_blocks = lvb.lvb_blocks;
1049 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1050 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1051 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1052 ll_inode_size_unlock(inode, 1);
1054 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
1055 i_size_read(inode), (long long)inode->i_blocks);
/* Take a client extent lock of @mode over *policy on @lsm, then refresh
 * the inode from the merged LVB.  Skips DLM entirely for
 * LL_FILE_IGNORE_LOCK fds and NOLCK mounts; pins mmapped files out of
 * the lock LRU.  i_size is only overwritten for full-file [0, EOF]
 * locks — see the in-code comment about the truncate race.
 * NOTE(review): capture elided — ast_flags parameter, some RETURNs and
 * braces are not visible. */
1060 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1061 struct lov_stripe_md *lsm, int mode,
1062 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1065 struct ll_sb_info *sbi = ll_i2sbi(inode);
1067 struct ldlm_enqueue_info einfo = { 0 };
1068 struct obd_info oinfo = { { { 0 } } };
1072 LASSERT(!lustre_handle_is_used(lockh));
1073 LASSERT(lsm != NULL);
1075 /* don't drop the mmapped file to LRU */
1076 if (mapping_mapped(inode->i_mapping))
1077 ast_flags |= LDLM_FL_NO_LRU;
1079 /* XXX phil: can we do this? won't it screw the file size up? */
1080 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1081 (sbi->ll_flags & LL_SBI_NOLCK))
1084 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1085 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1087 einfo.ei_type = LDLM_EXTENT;
1088 einfo.ei_mode = mode;
1089 einfo.ei_cb_bl = ll_extent_lock_callback;
1090 einfo.ei_cb_cp = ldlm_completion_ast;
1091 einfo.ei_cb_gl = ll_glimpse_callback;
1092 einfo.ei_cbdata = inode;
1094 oinfo.oi_policy = *policy;
1095 oinfo.oi_lockh = lockh;
1097 oinfo.oi_flags = ast_flags;
1099 rc = obd_enqueue(sbi->ll_osc_exp, &oinfo, &einfo, NULL);
/* obd_enqueue may widen the extent; report the granted policy back. */
1100 *policy = oinfo.oi_policy;
1104 ll_inode_size_lock(inode, 1);
1105 inode_init_lvb(inode, &lvb);
1106 obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 1);
1108 if (policy->l_extent.start == 0 &&
1109 policy->l_extent.end == OBD_OBJECT_EOF) {
1110 /* vmtruncate()->ll_truncate() first sets the i_size and then
1111 * the kms under both a DLM lock and the
1112 * ll_inode_size_lock(). If we don't get the
1113 * ll_inode_size_lock() here we can match the DLM lock and
1114 * reset i_size from the kms before the truncating path has
1115 * updated the kms. generic_file_write can then trust the
1116 * stale i_size when doing appending writes and effectively
1117 * cancel the result of the truncate. Getting the
1118 * ll_inode_size_lock() after the enqueue maintains the DLM
1119 * -> ll_inode_size_lock() acquiring order. */
1120 i_size_write(inode, lvb.lvb_size);
1121 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1122 inode->i_ino, i_size_read(inode));
1126 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1127 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1128 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1130 ll_inode_size_unlock(inode, 1);
/* Release an extent lock previously taken by ll_extent_lock(), with the
 * same IGNORE_LOCK / NOLCK early-out as the lock path so lock and
 * unlock stay symmetric.  NOTE(review): capture elided — the early-out
 * body and final RETURN are not visible. */
1135 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1136 struct lov_stripe_md *lsm, int mode,
1137 struct lustre_handle *lockh)
1139 struct ll_sb_info *sbi = ll_i2sbi(inode);
1143 /* XXX phil: can we do this? won't it screw the file size up? */
1144 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1145 (sbi->ll_flags & LL_SBI_NOLCK))
1148 rc = obd_cancel(sbi->ll_osc_exp, lsm, mode, lockh);
/* Mark the inode as contended: record the current time and set the
 * LLI_F_CONTENDED flag, consumed by ll_is_file_contended() to switch
 * I/O to the lockless/server-lock path for a while. */
1153 static void ll_set_file_contended(struct inode *inode)
1155 struct ll_inode_info *lli = ll_i2info(inode);
1157 lli->lli_contention_time = cfs_time_current();
1158 set_bit(LLI_F_CONTENDED, &lli->lli_flags);
/* Clear the contention flag set by ll_set_file_contended(). */
1161 void ll_clear_file_contended(struct inode *inode)
1163 struct ll_inode_info *lli = ll_i2info(inode);
1165 clear_bit(LLI_F_CONTENDED, &lli->lli_flags);
/* Decide whether the file should be treated as contended: requires the
 * server to support OBD_CONNECT_SRVLOCK, is overridden by
 * LL_FILE_IGNORE_LOCK fds, and expires the contention flag once
 * sbi->ll_contention_time seconds have passed since it was set.
 * NOTE(review): capture elided — the actual return statements for the
 * contended/non-contended outcomes are not visible. */
1168 static int ll_is_file_contended(struct file *file)
1170 struct inode *inode = file->f_dentry->d_inode;
1171 struct ll_inode_info *lli = ll_i2info(inode);
1172 struct ll_sb_info *sbi = ll_i2sbi(inode);
1173 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1176 if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
1177 CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
1178 " osc connect flags = 0x"LPX64"\n",
1179 sbi->ll_lco.lco_flags);
1182 if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
1184 if (test_bit(LLI_F_CONTENDED, &lli->lli_flags)) {
1185 cfs_time_t cur_time = cfs_time_current();
1186 cfs_time_t retry_time;
/* Contention marker times out after ll_contention_time seconds. */
1188 retry_time = cfs_time_add(
1189 lli->lli_contention_time,
1190 cfs_time_seconds(sbi->ll_contention_time));
1191 if (cfs_time_after(cur_time, retry_time)) {
1192 ll_clear_file_contended(inode);
/* Acquire the client lock tree covering [start, end] for a read or
 * write, unless the file is contended (in which case the caller is
 * expected to fall back to lockless I/O).  O_APPEND writes always take
 * the lock and do not set LDLM_FL_DENY_ON_CONTENTION; O_NONBLOCK adds
 * LDLM_FL_BLOCK_NOWAIT.  -EUSERS from ll_tree_lock() marks the inode
 * contended.  Returns non-zero when the tree lock is held, 0 when the
 * caller should go lockless, negative errno on failure (presumably --
 * the statements setting tree_locked are elided in this excerpt). */
1200 static int ll_file_get_tree_lock(struct ll_lock_tree *tree, struct file *file,
1201 const char *buf, size_t count,
1202 loff_t start, loff_t end, int rw)
1205 int tree_locked = 0;
1207 struct inode * inode = file->f_dentry->d_inode;
1209 append = (rw == WRITE) && (file->f_flags & O_APPEND);
1211 if (append || !ll_is_file_contended(file)) {
1212 struct ll_lock_tree_node *node;
1215 ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
1216 if (file->f_flags & O_NONBLOCK)
1217 ast_flags |= LDLM_FL_BLOCK_NOWAIT;
1218 node = ll_node_from_inode(inode, start, end,
1219 (rw == WRITE) ? LCK_PW : LCK_PR);
1224 tree->lt_fd = LUSTRE_FPRIVATE(file);
1225 rc = ll_tree_lock(tree, node, buf, count, ast_flags);
1228 else if (rc == -EUSERS)
1229 ll_set_file_contended(inode);
1233 RETURN(tree_locked);
/* VFS ->read entry point.  Files with no OST objects are served by
 * zero-filling the user buffer up to i_size (mknod+truncate / NFS
 * pattern, bug 6243).  Otherwise the region is read chunk-by-chunk
 * (chunks bounded by a stripe boundary and sbi->ll_max_rw_chunk when
 * that is set): take the extent tree lock, merge the lock value blocks
 * to learn the known-minimum size (kms), glimpse when the read may
 * extend past kms, bump i_size up to kms so generic_file_read() serves
 * the whole region, then delegate to generic_file_read() -- or to
 * ll_file_lockless_io() when the tree lock was skipped for contention.
 * NOTE(review): embedded original line numbers skip, so the chunk loop
 * structure, several declarations and branches are elided here. */
1238 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1241 struct inode *inode = file->f_dentry->d_inode;
1242 struct ll_inode_info *lli = ll_i2info(inode);
1243 struct lov_stripe_md *lsm = lli->lli_smd;
1244 struct ll_sb_info *sbi = ll_i2sbi(inode);
1245 struct ll_lock_tree tree;
1247 struct ll_ra_read bead;
1250 ssize_t retval, chunk, sum = 0;
1255 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1256 inode->i_ino, inode->i_generation, inode, count, *ppos);
1257 /* "If nbyte is 0, read() will return 0 and have no other results."
1258 * -- Single Unix Spec */
1262 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1265 /* Read on file with no objects should return zero-filled
1266 * buffers up to file size (we can get non-zero sizes with
1267 * mknod + truncate, then opening file for read. This is a
1268 * common pattern in NFS case, it seems). Bug 6243 */
1270 /* Since there are no objects on OSTs, we have nothing to get
1271 * lock on and so we are forced to access inode->i_size
1274 /* Read beyond end of file */
1275 if (*ppos >= i_size_read(inode))
1278 if (count > i_size_read(inode) - *ppos)
1279 count = i_size_read(inode) - *ppos;
1280 /* Make sure to correctly adjust the file pos pointer for
1282 notzeroed = clear_user(buf, count);
1290 if (sbi->ll_max_rw_chunk != 0) {
1291 /* first, let's know the end of the current stripe */
1293 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,
1296 /* correct, the end is beyond the request */
1297 if (end > *ppos + count - 1)
1298 end = *ppos + count - 1;
1300 /* and chunk shouldn't be too large even if striping is wide */
1301 if (end - *ppos > sbi->ll_max_rw_chunk)
1302 end = *ppos + sbi->ll_max_rw_chunk - 1;
1304 end = *ppos + count - 1;
1307 tree_locked = ll_file_get_tree_lock(&tree, file, buf,
1308 count, *ppos, end, READ);
1309 if (tree_locked < 0)
1310 GOTO(out, retval = tree_locked);
1312 ll_inode_size_lock(inode, 1);
1314 * Consistency guarantees: following possibilities exist for the
1315 * relation between region being read and real file size at this
1318 * (A): the region is completely inside of the file;
1320 * (B-x): x bytes of region are inside of the file, the rest is
1323 * (C): the region is completely outside of the file.
1325 * This classification is stable under DLM lock acquired by
1326 * ll_tree_lock() above, because to change class, other client has to
1327 * take DLM lock conflicting with our lock. Also, any updates to
1328 * ->i_size by other threads on this client are serialized by
1329 * ll_inode_size_lock(). This guarantees that short reads are handled
1330 * correctly in the face of concurrent writes and truncates.
1332 inode_init_lvb(inode, &lvb);
1333 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
1335 if (*ppos + count - 1 > kms) {
1336 /* A glimpse is necessary to determine whether we return a
1337 * short read (B) or some zeroes at the end of the buffer (C) */
1338 ll_inode_size_unlock(inode, 1);
1339 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1341 ll_tree_unlock(&tree);
1345 /* region is within kms and, hence, within real file size (A).
1346 * We need to increase i_size to cover the read region so that
1347 * generic_file_read() will do its job, but that doesn't mean
1348 * the kms size is _correct_, it is only the _minimum_ size.
1349 * If someone does a stat they will get the correct size which
1350 * will always be >= the kms value here. b=11081 */
1351 if (i_size_read(inode) < kms)
1352 i_size_write(inode, kms);
1353 ll_inode_size_unlock(inode, 1);
1356 chunk = end - *ppos + 1;
1357 CDEBUG(D_INODE,"Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1358 inode->i_ino, chunk, *ppos, i_size_read(inode));
1360 /* turn off the kernel's read-ahead */
1362 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1365 file->f_ra.ra_pages = 0;
1367 /* initialize read-ahead window once per syscall */
1370 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1371 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1372 ll_ra_read_in(file, &bead);
1376 file_accessed(file);
1377 retval = generic_file_read(file, buf, chunk, ppos);
1378 ll_tree_unlock(&tree);
1380 retval = ll_file_lockless_io(file, buf, chunk, ppos, READ);
1382 ll_rw_stats_tally(sbi, current->pid, file, count, 0);
1387 if (retval == chunk && count > 0)
1393 ll_ra_read_ex(file, &bead);
1394 retval = (sum > 0) ? sum : retval;
/* VFS ->write entry point (through the page cache).  Serializes writers
 * on lli_write_sem, then writes chunk-by-chunk: O_APPEND locks [?, EOF]
 * (lock_start assignment elided), otherwise chunks are bounded by the
 * stripe end and sbi->ll_max_rw_chunk.  After taking the tree lock the
 * append position and maxbytes limits are enforced (SIGXFSZ/-EFBIG),
 * then the data goes through generic_file_write() -- or
 * ll_file_lockless_io() when the tree lock was skipped for contention.
 * NOTE(review): the stray '*' line below lost its comment opener in this
 * excerpt; embedded line numbers skip, so statements are elided. */
1399 * Write to a file (through the page cache).
1401 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1404 struct inode *inode = file->f_dentry->d_inode;
1405 struct ll_sb_info *sbi = ll_i2sbi(inode);
1406 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1407 struct ll_lock_tree tree;
1408 loff_t maxbytes = ll_file_maxbytes(inode);
1409 loff_t lock_start, lock_end, end;
1410 ssize_t retval, chunk, sum = 0;
1414 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1415 inode->i_ino, inode->i_generation, inode, count, *ppos);
1417 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1419 /* POSIX, but surprised the VFS doesn't check this already */
1423 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1424 * called on the file, don't fail the below assertion (bug 2388). */
1425 if (file->f_flags & O_LOV_DELAY_CREATE &&
1426 ll_i2info(inode)->lli_smd == NULL)
1429 LASSERT(ll_i2info(inode)->lli_smd != NULL);
1431 down(&ll_i2info(inode)->lli_write_sem);
1434 chunk = 0; /* just to fix gcc's warning */
1435 end = *ppos + count - 1;
1437 if (file->f_flags & O_APPEND) {
1439 lock_end = OBD_OBJECT_EOF;
1440 } else if (sbi->ll_max_rw_chunk != 0) {
1441 /* first, let's know the end of the current stripe */
1443 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,
1446 /* correct, the end is beyond the request */
1447 if (end > *ppos + count - 1)
1448 end = *ppos + count - 1;
1450 /* and chunk shouldn't be too large even if striping is wide */
1451 if (end - *ppos > sbi->ll_max_rw_chunk)
1452 end = *ppos + sbi->ll_max_rw_chunk - 1;
1457 lock_end = *ppos + count - 1;
1460 tree_locked = ll_file_get_tree_lock(&tree, file, buf, count,
1461 lock_start, lock_end, WRITE);
1462 if (tree_locked < 0)
1463 GOTO(out, retval = tree_locked);
1465 /* This is ok, g_f_w will overwrite this under i_sem if it races
1466 * with a local truncate, it just makes our maxbyte checking easier.
1467 * The i_size value gets updated in ll_extent_lock() as a consequence
1468 * of the [0,EOF] extent lock we requested above. */
1469 if (file->f_flags & O_APPEND) {
1470 *ppos = i_size_read(inode);
1471 end = *ppos + count - 1;
1474 if (*ppos >= maxbytes) {
1475 send_sig(SIGXFSZ, current, 0);
1476 GOTO(out_unlock, retval = -EFBIG);
1478 if (end > maxbytes - 1)
1481 /* generic_file_write handles O_APPEND after getting i_mutex */
1482 chunk = end - *ppos + 1;
1483 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1484 inode->i_ino, chunk, *ppos);
1486 retval = generic_file_write(file, buf, chunk, ppos);
1488 retval = ll_file_lockless_io(file, (char*)buf, chunk,
1490 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
1494 ll_tree_unlock(&tree);
1501 if (retval == chunk && count > 0)
1505 up(&ll_i2info(inode)->lli_write_sem);
1507 retval = (sum > 0) ? sum : retval;
1508 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1509 retval > 0 ? retval : 0);
/* VFS ->sendfile (2.6 kernels only, see the #if below): same kms/glimpse
 * consistency dance as ll_file_read(), but over a single PR extent lock
 * for the whole region, then delegates to generic_file_sendfile().
 * Files without OST objects bypass locking entirely.
 * NOTE(review): the stray '*' line below lost its comment opener in this
 * excerpt; embedded line numbers skip, so statements are elided. */
1514 * Send file content (through pagecache) somewhere with helper
1516 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1517 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1518 read_actor_t actor, void *target)
1520 struct inode *inode = in_file->f_dentry->d_inode;
1521 struct ll_inode_info *lli = ll_i2info(inode);
1522 struct lov_stripe_md *lsm = lli->lli_smd;
1523 struct ll_lock_tree tree;
1524 struct ll_lock_tree_node *node;
1526 struct ll_ra_read bead;
1531 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1532 inode->i_ino, inode->i_generation, inode, count, *ppos);
1534 /* "If nbyte is 0, read() will return 0 and have no other results."
1535 * -- Single Unix Spec */
1539 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1540 /* turn off the kernel's read-ahead */
1541 in_file->f_ra.ra_pages = 0;
1543 /* File with no objects, nothing to lock */
1545 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1547 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1549 RETURN(PTR_ERR(node));
1551 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1552 rc = ll_tree_lock(&tree, node, NULL, count,
1553 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1557 ll_clear_file_contended(inode);
1558 ll_inode_size_lock(inode, 1);
1560 * Consistency guarantees: following possibilities exist for the
1561 * relation between region being read and real file size at this
1564 * (A): the region is completely inside of the file;
1566 * (B-x): x bytes of region are inside of the file, the rest is
1569 * (C): the region is completely outside of the file.
1571 * This classification is stable under DLM lock acquired by
1572 * ll_tree_lock() above, because to change class, other client has to
1573 * take DLM lock conflicting with our lock. Also, any updates to
1574 * ->i_size by other threads on this client are serialized by
1575 * ll_inode_size_lock(). This guarantees that short reads are handled
1576 * correctly in the face of concurrent writes and truncates.
1578 inode_init_lvb(inode, &lvb);
1579 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
1581 if (*ppos + count - 1 > kms) {
1582 /* A glimpse is necessary to determine whether we return a
1583 * short read (B) or some zeroes at the end of the buffer (C) */
1584 ll_inode_size_unlock(inode, 1);
1585 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1589 /* region is within kms and, hence, within real file size (A) */
1590 i_size_write(inode, kms);
1591 ll_inode_size_unlock(inode, 1);
1594 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1595 inode->i_ino, count, *ppos, i_size_read(inode));
1597 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1598 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1599 ll_ra_read_in(in_file, &bead);
1601 file_accessed(in_file);
1602 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1603 ll_ra_read_ex(in_file, &bead);
1606 ll_tree_unlock(&tree);
/* LL_IOC_RECREATE_OBJ handler: re-create a (presumably lost) OST object
 * for this file.  Admin-only (CAP_SYS_ADMIN).  Copies a ll_recreate_obj
 * request from userspace, clones the stripe md under lli_size_sem, and
 * calls obd_create() with OBD_FL_RECREATE_OBJS, passing the target
 * object id and OST index via oa->o_id / oa->o_nlink.
 * NOTE(review): the allocation of 'oa' and several checks/returns are
 * elided in this excerpt (original line numbers skip). */
1611 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1614 struct ll_inode_info *lli = ll_i2info(inode);
1615 struct obd_export *exp = ll_i2obdexp(inode);
1616 struct ll_recreate_obj ucreatp;
1617 struct obd_trans_info oti = { 0 };
1618 struct obdo *oa = NULL;
1621 struct lov_stripe_md *lsm, *lsm2;
1624 if (!capable (CAP_SYS_ADMIN))
1627 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1628 sizeof(struct ll_recreate_obj));
1636 down(&lli->lli_size_sem);
1639 GOTO(out, rc = -ENOENT);
1640 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1641 (lsm->lsm_stripe_count));
1643 OBD_ALLOC(lsm2, lsm_size);
1645 GOTO(out, rc = -ENOMEM);
1647 oa->o_id = ucreatp.lrc_id;
1648 oa->o_nlink = ucreatp.lrc_ost_idx;
1649 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1650 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
1651 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1652 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1654 oti.oti_objid = NULL;
1655 memcpy(lsm2, lsm, lsm_size);
1656 rc = obd_create(exp, oa, &lsm2, &oti);
1658 OBD_FREE(lsm2, lsm_size);
1661 up(&lli->lli_size_sem);
/* Apply a striping EA (lov_user_md) to a file by re-opening it with an
 * IT_OPEN intent that carries the EA.  Bails out early, under
 * lli_size_sem, if a stripe md already exists for the inode.  On success
 * the open handle obtained by the intent is released again via
 * ll_release_openhandle(); the intent request is freed on the error
 * path (out_req_free).  Returns 0 or negative errno.
 * NOTE(review): several lines are elided in this excerpt. */
1666 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1667 int flags, struct lov_user_md *lum,
1670 struct ll_inode_info *lli = ll_i2info(inode);
1671 struct lov_stripe_md *lsm;
1672 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1676 down(&lli->lli_size_sem);
1679 up(&lli->lli_size_sem);
1680 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1685 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1688 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1689 GOTO(out_req_free, rc = -ENOENT);
1690 rc = oit.d.lustre.it_status;
1692 GOTO(out_req_free, rc);
1694 ll_release_openhandle(file->f_dentry, &oit);
1697 up(&lli->lli_size_sem);
1698 ll_intent_release(&oit);
1701 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* Fetch the LOV striping EA of 'filename' from the MDS via
 * mdc_getattr_name(OBD_MD_FLEASIZE|OBD_MD_FLDIREA), byte-swap it to host
 * endian when the magic arrives swabbed, and for LOV_MAGIC_JOIN files
 * expand the md into a lov_user_md_join carrying per-stripe extent and
 * object information.  On success *lmmp/*lmm_size describe the
 * (possibly converted) EA; *request holds the MDS reply, which the
 * caller must release (the lmm points into it or into OBD_ALLOC'd
 * memory for the join case).
 * NOTE(review): several branches/returns are elided in this excerpt. */
1705 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1706 struct lov_mds_md **lmmp, int *lmm_size,
1707 struct ptlrpc_request **request)
1709 struct ll_sb_info *sbi = ll_i2sbi(inode);
1711 struct mds_body *body;
1712 struct lov_mds_md *lmm = NULL;
1713 struct ptlrpc_request *req = NULL;
1716 ll_inode2fid(&fid, inode);
1718 rc = ll_get_max_mdsize(sbi, &lmmsize);
1722 rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid,
1723 filename, strlen(filename) + 1,
1724 OBD_MD_FLEASIZE | OBD_MD_FLDIREA,
1727 CDEBUG(D_INFO, "mdc_getattr_name failed "
1728 "on %s: rc %d\n", filename, rc);
1732 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
1734 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1735 /* swabbed by mdc_getattr_name */
1736 LASSERT_REPSWABBED(req, REPLY_REC_OFF);
1738 lmmsize = body->eadatasize;
1740 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1742 GOTO(out, rc = -ENODATA);
1745 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1,
1747 LASSERT(lmm != NULL);
1748 LASSERT_REPSWABBED(req, REPLY_REC_OFF + 1);
1751 * This is coming from the MDS, so is probably in
1752 * little endian. We convert it to host endian before
1753 * passing it to userspace.
1755 if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
1756 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
1757 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
1758 } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
1759 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1762 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1763 struct lov_stripe_md *lsm;
1764 struct lov_user_md_join *lmj;
1765 int lmj_size, i, aindex = 0;
1767 rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
1769 GOTO(out, rc = -ENOMEM);
1770 rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm);
1772 GOTO(out_free_memmd, rc);
1774 lmj_size = sizeof(struct lov_user_md_join) +
1775 lsm->lsm_stripe_count *
1776 sizeof(struct lov_user_ost_data_join);
1777 OBD_ALLOC(lmj, lmj_size);
1779 GOTO(out_free_memmd, rc = -ENOMEM);
1781 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1782 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1783 struct lov_extent *lex =
1784 &lsm->lsm_array->lai_ext_array[aindex];
1786 if (lex->le_loi_idx + lex->le_stripe_count <= i)
1788 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1789 LPU64" len %d\n", aindex, i,
1790 lex->le_start, (int)lex->le_len);
1791 lmj->lmm_objects[i].l_extent_start =
1794 if ((int)lex->le_len == -1)
1795 lmj->lmm_objects[i].l_extent_end = -1;
1797 lmj->lmm_objects[i].l_extent_end =
1798 lex->le_start + lex->le_len;
1799 lmj->lmm_objects[i].l_object_id =
1800 lsm->lsm_oinfo[i]->loi_id;
1801 lmj->lmm_objects[i].l_object_gr =
1802 lsm->lsm_oinfo[i]->loi_gr;
1803 lmj->lmm_objects[i].l_ost_gen =
1804 lsm->lsm_oinfo[i]->loi_ost_gen;
1805 lmj->lmm_objects[i].l_ost_idx =
1806 lsm->lsm_oinfo[i]->loi_ost_idx;
1808 lmm = (struct lov_mds_md *)lmj;
1811 obd_free_memmd(sbi->ll_osc_exp, &lsm);
1815 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: admin-only (CAP_SYS_ADMIN) path that copies
 * a raw lov_user_md plus one lov_user_ost_data entry from userspace and
 * applies it through ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS
 * (objects already exist).  The temporary buffer is freed on both the
 * error and success paths. */
1819 static int ll_lov_setea(struct inode *inode, struct file *file,
1822 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1823 struct lov_user_md *lump;
1824 int lum_size = sizeof(struct lov_user_md) +
1825 sizeof(struct lov_user_ost_data);
1829 if (!capable (CAP_SYS_ADMIN))
1832 OBD_ALLOC(lump, lum_size);
1836 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
1838 OBD_FREE(lump, lum_size);
1842 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1844 OBD_FREE(lump, lum_size);
/* LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md, create the
 * striping via ll_lov_setstripe_ea_info(), then echo the resulting
 * stripe layout back to userspace through
 * obd_iocontrol(LL_IOC_LOV_GETSTRIPE) after zeroing lmm_stripe_count in
 * the user buffer.  Bug 1152: relies on the kernel and user structs
 * having identical layout (asserted below). */
1848 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1851 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1853 int flags = FMODE_WRITE;
1856 /* Bug 1152: copy properly when this is no longer true */
1857 LASSERT(sizeof(lum) == sizeof(*lump));
1858 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1859 rc = copy_from_user(&lum, lump, sizeof(lum));
1863 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
1865 put_user(0, &lump->lmm_stripe_count);
1866 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode),
1867 0, ll_i2info(inode)->lli_smd, lump);
/* LL_IOC_LOV_GETSTRIPE handler: return the file's stripe layout to
 * userspace via obd_iocontrol().  The check between the declaration and
 * the call (presumably returning -ENODATA when lsm is NULL) is elided
 * in this excerpt -- TODO confirm against the full source. */
1872 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1874 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1879 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode), 0, lsm,
/* LL_IOC_GROUP_LOCK handler: take a whole-file ([0, OBD_OBJECT_EOF])
 * LCK_GROUP extent lock with group id 'arg' and stash its handle in the
 * per-fd data (fd_cwlockh).  While held, the fd is flagged
 * LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK so regular extent locking is
 * bypassed.  O_NONBLOCK maps to LDLM_FL_BLOCK_NOWAIT.
 * NOTE(review): error returns/assignment of fd_gid are elided here. */
1883 static int ll_get_grouplock(struct inode *inode, struct file *file,
1886 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1887 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1888 .end = OBD_OBJECT_EOF}};
1889 struct lustre_handle lockh = { 0 };
1890 struct ll_inode_info *lli = ll_i2info(inode);
1891 struct lov_stripe_md *lsm = lli->lli_smd;
1895 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1899 policy.l_extent.gid = arg;
1900 if (file->f_flags & O_NONBLOCK)
1901 flags = LDLM_FL_BLOCK_NOWAIT;
1903 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1907 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
1909 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/* LL_IOC_GROUP_UNLOCK handler: verify a group lock with the same gid is
 * held on this fd, clear the group-lock flags, cancel the saved lock
 * handle via ll_extent_unlock(), and wipe the stored handle. */
1914 static int ll_put_grouplock(struct inode *inode, struct file *file,
1917 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1918 struct ll_inode_info *lli = ll_i2info(inode);
1919 struct lov_stripe_md *lsm = lli->lli_smd;
1923 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1924 /* Ugh, it's already unlocked. */
1928 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
1931 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
1933 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
1938 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/* Validate a file-join request before any locking is done: the server
 * must advertise LL_SBI_JOIN, both inodes must be regular files, a file
 * cannot be joined to itself, and the head's size must be a multiple of
 * JOIN_FILE_ALIGN (64K).  Error returns are elided in this excerpt. */
1943 static int join_sanity_check(struct inode *head, struct inode *tail)
1946 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1947 CERROR("server do not support join \n");
1950 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1951 CERROR("tail ino %lu and ino head %lu must be regular\n",
1952 head->i_ino, tail->i_ino);
1955 if (head->i_ino == tail->i_ino) {
1956 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1959 if (i_size_read(head) % JOIN_FILE_ALIGN) {
1960 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
/* Perform the MDS side of a file join: enqueue an O_JOIN_FILE open
 * intent (IBITS, LCK_PW) for the head inode, passing the head's current
 * size as intent data so the server knows where the tail attaches.  Any
 * lock granted with the open is dropped immediately, and the open
 * handle is released via ll_release_openhandle(). */
1966 static int join_file(struct inode *head_inode, struct file *head_filp,
1967 struct file *tail_filp)
1969 struct dentry *tail_dentry = tail_filp->f_dentry;
1970 struct lookup_intent oit = {.it_op = IT_OPEN,
1971 .it_flags = head_filp->f_flags|O_JOIN_FILE};
1972 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_PW,
1973 ll_mdc_blocking_ast, ldlm_completion_ast, NULL, NULL };
1975 struct lustre_handle lockh;
1976 struct mdc_op_data *op_data;
1981 tail_dentry = tail_filp->f_dentry;
1983 OBD_ALLOC_PTR(op_data);
1984 if (op_data == NULL) {
1988 data = i_size_read(head_inode);
1989 ll_prepare_mdc_op_data(op_data, head_inode,
1990 tail_dentry->d_parent->d_inode,
1991 tail_dentry->d_name.name,
1992 tail_dentry->d_name.len, 0, &data);
1993 rc = mdc_enqueue(ll_i2mdcexp(head_inode), &einfo, &oit,
1994 op_data, &lockh, NULL, 0, 0);
1999 rc = oit.d.lustre.it_status;
2001 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2002 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2003 ptlrpc_req_finished((struct ptlrpc_request *)
2004 oit.d.lustre.it_data);
2008 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2010 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2011 oit.d.lustre.it_lock_mode = 0;
2013 ll_release_openhandle(head_filp->f_dentry, &oit);
2016 OBD_FREE_PTR(op_data);
2017 ll_intent_release(&oit);
/* LL_IOC_JOIN handler: join the file named 'filename_tail' onto 'head'.
 * Opens the tail file, orders head/tail by inode number so their
 * whole-file LCK_EX tree locks are always taken in the same order
 * (deadlock avoidance), runs join_sanity_check(), performs the join on
 * the MDS, and unwinds resources via the cleanup_phase switch (which
 * relies on case fall-through).  On success the head's cached stripe md
 * is freed so the joined layout is refetched on next use. */
2021 static int ll_file_join(struct inode *head, struct file *filp,
2022 char *filename_tail)
2024 struct inode *tail = NULL, *first = NULL, *second = NULL;
2025 struct dentry *tail_dentry;
2026 struct file *tail_filp, *first_filp, *second_filp;
2027 struct ll_lock_tree first_tree, second_tree;
2028 struct ll_lock_tree_node *first_node, *second_node;
2029 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2030 int rc = 0, cleanup_phase = 0;
2033 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2034 head->i_ino, head->i_generation, head, filename_tail);
2036 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2037 if (IS_ERR(tail_filp)) {
2038 CERROR("Can not open tail file %s", filename_tail);
2039 rc = PTR_ERR(tail_filp);
2042 tail = igrab(tail_filp->f_dentry->d_inode);
2044 tlli = ll_i2info(tail);
2045 tail_dentry = tail_filp->f_dentry;
2046 LASSERT(tail_dentry);
2049 /*reorder the inode for lock sequence*/
2050 first = head->i_ino > tail->i_ino ? head : tail;
2051 second = head->i_ino > tail->i_ino ? tail : head;
2052 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2053 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2055 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2056 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2057 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2058 if (IS_ERR(first_node)){
2059 rc = PTR_ERR(first_node);
2062 first_tree.lt_fd = first_filp->private_data;
2063 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2068 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2069 if (IS_ERR(second_node)){
2070 rc = PTR_ERR(second_node);
2073 second_tree.lt_fd = second_filp->private_data;
2074 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2079 rc = join_sanity_check(head, tail);
2083 rc = join_file(head, filp, tail_filp);
2087 switch (cleanup_phase) {
2089 ll_tree_unlock(&second_tree);
2090 obd_cancel_unused(ll_i2obdexp(second),
2091 ll_i2info(second)->lli_smd, 0, NULL);
2093 ll_tree_unlock(&first_tree);
2094 obd_cancel_unused(ll_i2obdexp(first),
2095 ll_i2info(first)->lli_smd, 0, NULL);
2097 filp_close(tail_filp, 0);
2100 if (head && rc == 0) {
2101 obd_free_memmd(ll_i2sbi(head)->ll_osc_exp,
2103 hlli->lli_smd = NULL;
2108 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/* Release the MDS open handle acquired by an open intent that will not
 * be consumed by ll_file_open() (e.g. intent opens resolved without a
 * struct file).  No-op for the filesystem root or when the intent
 * performed no open.  Frees the intent's saved request, which would
 * otherwise have been handed to ll_file_open(). */
2114 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2116 struct inode *inode = dentry->d_inode;
2117 struct obd_client_handle *och;
2123 /* Root ? Do nothing. */
2124 if (dentry->d_inode->i_sb->s_root == dentry)
2127 /* No open handle to close? Move away */
2128 if (!it_disposition(it, DISP_OPEN_OPEN))
2131 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2133 OBD_ALLOC(och, sizeof(*och));
2135 GOTO(out, rc = -ENOMEM);
2137 ll_och_fill(ll_i2info(inode), it, och);
2139 rc = ll_close_inode_openhandle(inode, och);
2141 OBD_FREE(och, sizeof(*och));
2143 /* this one is in place of ll_file_open */
2144 ptlrpc_req_finished(it->d.lustre.it_data);
2145 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Main ioctl dispatcher for regular files: per-fd flag get/set/clear,
 * LOV stripe ioctls, OST object recreation, ext2/ext3-compat flag and
 * version ioctls, file join, group locks, statfs and device-name
 * queries.  tty ioctls ('T'/'t' type) are rejected up front.  Anything
 * unrecognized is first offered to dynamically registered handlers via
 * ll_iocontrol_call() and finally forwarded to the OBD layer with
 * obd_iocontrol().  NOTE(review): the switch statement opener, several
 * case labels and returns are elided in this excerpt. */
2149 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2152 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2156 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2157 inode->i_generation, inode, cmd);
2158 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2160 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2161 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2165 case LL_IOC_GETFLAGS:
2166 /* Get the current value of the file flags */
2167 return put_user(fd->fd_flags, (int *)arg);
2168 case LL_IOC_SETFLAGS:
2169 case LL_IOC_CLRFLAGS:
2170 /* Set or clear specific file flags */
2171 /* XXX This probably needs checks to ensure the flags are
2172 * not abused, and to handle any flag side effects.
2174 if (get_user(flags, (int *) arg))
2177 if (cmd == LL_IOC_SETFLAGS) {
2178 if ((flags & LL_FILE_IGNORE_LOCK) &&
2179 !(file->f_flags & O_DIRECT)) {
2180 CERROR("%s: unable to disable locking on "
2181 "non-O_DIRECT file\n", current->comm);
2185 fd->fd_flags |= flags;
2187 fd->fd_flags &= ~flags;
2190 case LL_IOC_LOV_SETSTRIPE:
2191 RETURN(ll_lov_setstripe(inode, file, arg));
2192 case LL_IOC_LOV_SETEA:
2193 RETURN(ll_lov_setea(inode, file, arg));
2194 case LL_IOC_LOV_GETSTRIPE:
2195 RETURN(ll_lov_getstripe(inode, arg));
2196 case LL_IOC_RECREATE_OBJ:
2197 RETURN(ll_lov_recreate_obj(inode, file, arg));
2198 case EXT3_IOC_GETFLAGS:
2199 case EXT3_IOC_SETFLAGS:
2200 RETURN(ll_iocontrol(inode, file, cmd, arg));
2201 case EXT3_IOC_GETVERSION_OLD:
2202 case EXT3_IOC_GETVERSION:
2203 RETURN(put_user(inode->i_generation, (int *)arg));
2208 ftail = getname((const char *)arg);
2210 RETURN(PTR_ERR(ftail));
2211 rc = ll_file_join(inode, file, ftail);
2215 case LL_IOC_GROUP_LOCK:
2216 RETURN(ll_get_grouplock(inode, file, arg));
2217 case LL_IOC_GROUP_UNLOCK:
2218 RETURN(ll_put_grouplock(inode, file, arg));
2219 case IOC_OBD_STATFS:
2220 RETURN(ll_obd_statfs(inode, (void *)arg));
2221 case OBD_IOC_GETNAME_OLD:
2222 case OBD_IOC_GETNAME: {
2223 struct obd_device *obd =
2224 class_exp2obd(ll_i2sbi(inode)->ll_osc_exp);
2227 if (copy_to_user((void *)arg, obd->obd_name,
2228 strlen(obd->obd_name) + 1))
2233 /* We need to special case any other ioctls we want to handle,
2234 * to send them to the MDS/OST as appropriate and to properly
2235 * network encode the arg field.
2236 case EXT3_IOC_SETVERSION_OLD:
2237 case EXT3_IOC_SETVERSION:
2243 ll_iocontrol_call(inode, file, cmd, arg, &err))
2246 RETURN(obd_iocontrol(cmd, ll_i2obdexp(inode), 0, NULL,
/* VFS ->llseek.  For SEEK_END a glimpse refreshes the size from the
 * OSTs (O_NONBLOCK maps to LDLM_FL_BLOCK_NOWAIT) before i_size is read
 * under the inode size lock; SEEK_CUR is relative to f_pos.  The final
 * offset must fall within [0, ll_file_maxbytes(inode)].
 * NOTE(review): error returns and the final RETURN are elided here. */
2252 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2254 struct inode *inode = file->f_dentry->d_inode;
2255 struct ll_inode_info *lli = ll_i2info(inode);
2256 struct lov_stripe_md *lsm = lli->lli_smd;
2259 retval = offset + ((origin == 2) ? i_size_read(inode) :
2260 (origin == 1) ? file->f_pos : 0);
2261 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2262 inode->i_ino, inode->i_generation, inode, retval, retval,
2263 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2264 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2266 if (origin == 2) { /* SEEK_END */
2267 int nonblock = 0, rc;
2269 if (file->f_flags & O_NONBLOCK)
2270 nonblock = LDLM_FL_BLOCK_NOWAIT;
2273 rc = ll_glimpse_size(inode, nonblock);
2278 ll_inode_size_lock(inode, 0);
2279 offset += i_size_read(inode);
2280 ll_inode_size_unlock(inode, 0);
2281 } else if (origin == 1) { /* SEEK_CUR */
2282 offset += file->f_pos;
2286 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2287 if (offset != file->f_pos) {
2288 file->f_pos = offset;
2289 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2291 file->f_version = ++event;
/* VFS ->fsync.  Waits for in-flight pagecache writeback, surfaces any
 * asynchronous write errors previously recorded against the inode
 * (lli_async_rc) and against the stripes (lov_test_and_clear_async_rc),
 * syncs the MDS inode via mdc_sync(), and finally syncs the OST objects
 * with obd_sync() when data sync is requested.
 * NOTE(review): the allocation of 'oa', the 'data' test and several
 * error-propagation lines are elided in this excerpt. */
2300 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2302 struct inode *inode = dentry->d_inode;
2303 struct ll_inode_info *lli = ll_i2info(inode);
2304 struct lov_stripe_md *lsm = lli->lli_smd;
2306 struct ptlrpc_request *req;
2309 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2310 inode->i_generation, inode);
2311 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2313 /* fsync's caller has already called _fdata{sync,write}, we want
2314 * that IO to finish before calling the osc and mdc sync methods */
2315 rc = filemap_fdatawait(inode->i_mapping);
2317 /* catch async errors that were recorded back when async writeback
2318 * failed for pages in this mapping. */
2319 err = lli->lli_async_rc;
2320 lli->lli_async_rc = 0;
2324 err = lov_test_and_clear_async_rc(lsm);
2329 ll_inode2fid(&fid, inode);
2330 err = mdc_sync(ll_i2sbi(inode)->ll_mdc_exp, &fid, &req);
2334 ptlrpc_req_finished(req);
2341 RETURN(rc ? rc : -ENOMEM);
2343 oa->o_id = lsm->lsm_object_id;
2344 oa->o_valid = OBD_MD_FLID;
2345 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2346 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
2348 err = obd_sync(ll_i2sbi(inode)->ll_osc_exp, oa, lsm,
/* VFS ->lock / ->flock handler.  Translates the kernel file_lock into
 * an LDLM_FLOCK enqueue against the MDS: F_RDLCK -> LCK_PR, F_WRLCK ->
 * LCK_PW, and F_UNLCK is encoded as LCK_NL (see the long comment below
 * for why unlock is an enqueue rather than a cancel).  F_GETLK* map to
 * LDLM_FL_TEST_LOCK, non-blocking F_SETLK maps to LDLM_FL_BLOCK_NOWAIT.
 * On success the result is mirrored into the local flock/POSIX lock
 * tables.  NOTE(review): several case labels, returns and declarations
 * are elided in this excerpt. */
2358 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2360 struct inode *inode = file->f_dentry->d_inode;
2361 struct ll_sb_info *sbi = ll_i2sbi(inode);
2362 struct ldlm_res_id res_id =
2363 { .name = {inode->i_ino, inode->i_generation, LDLM_FLOCK} };
2364 struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
2365 ldlm_flock_completion_ast, NULL, file_lock };
2366 struct lustre_handle lockh = {0};
2367 ldlm_policy_data_t flock;
2372 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2373 inode->i_ino, file_lock);
2374 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2376 if (file_lock->fl_flags & FL_FLOCK) {
2377 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2378 /* set missing params for flock() calls */
2379 file_lock->fl_end = OFFSET_MAX;
2380 file_lock->fl_pid = current->tgid;
2382 flock.l_flock.pid = file_lock->fl_pid;
2383 flock.l_flock.start = file_lock->fl_start;
2384 flock.l_flock.end = file_lock->fl_end;
2386 switch (file_lock->fl_type) {
2388 einfo.ei_mode = LCK_PR;
2391 /* An unlock request may or may not have any relation to
2392 * existing locks so we may not be able to pass a lock handle
2393 * via a normal ldlm_lock_cancel() request. The request may even
2394 * unlock a byte range in the middle of an existing lock. In
2395 * order to process an unlock request we need all of the same
2396 * information that is given with a normal read or write record
2397 * lock request. To avoid creating another ldlm unlock (cancel)
2398 * message we'll treat a LCK_NL flock request as an unlock. */
2399 einfo.ei_mode = LCK_NL;
2402 einfo.ei_mode = LCK_PW;
2405 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2420 flags = LDLM_FL_BLOCK_NOWAIT;
2426 flags = LDLM_FL_TEST_LOCK;
2427 /* Save the old mode so that if the mode in the lock changes we
2428 * can decrement the appropriate reader or writer refcount. */
2429 file_lock->fl_type = einfo.ei_mode;
2432 CERROR("unknown fcntl lock command: %d\n", cmd);
2436 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2437 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2438 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2440 rc = ldlm_cli_enqueue(sbi->ll_mdc_exp, NULL, &einfo, res_id,
2441 &flock, &flags, NULL, 0, NULL, &lockh, 0);
2442 if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2443 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2444 #ifdef HAVE_F_OP_FLOCK
2445 if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
2446 !(flags & LDLM_FL_TEST_LOCK))
2447 posix_lock_file_wait(file, file_lock);
/* Lock-operation stub paired with ll_file_flock(); presumably returns
 * an error for mounts where file locking is disabled -- the body is
 * entirely elided in this excerpt, TODO confirm against full source. */
2453 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/* Test, without taking a reference (LDLM_FL_TEST_LOCK), whether this
 * client already holds an MDS inodebits lock covering 'bits' for the
 * inode, matching CR/CW/PR modes on the {ino, generation} resource.
 * Return statements are elided in this excerpt. */
2460 int ll_have_md_lock(struct inode *inode, __u64 bits)
2462 struct lustre_handle lockh;
2463 struct ldlm_res_id res_id = { .name = {0} };
2464 struct obd_device *obddev;
2465 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2472 obddev = ll_i2mdcexp(inode)->exp_obd;
2473 res_id.name[0] = inode->i_ino;
2474 res_id.name[1] = inode->i_generation;
2476 CDEBUG(D_INFO, "trying to match res "LPU64"\n", res_id.name[0]);
2478 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2479 if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS,
2480 &policy, LCK_CR|LCK_CW|LCK_PR, &lockh)) {
/* Common completion for revalidate RPCs: turn -ENOENT on an already
 * unlinked inode into success (for non-regular, non-directory inodes),
 * and log any other failure.  NOTE(review): several lines are elided in
 * this listing (numbering jumps 2489->2491, 2495->2500). */
2487 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2488 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2489 * and return success */
2491 /* This path cannot be hit for regular files unless in
2492 * case of obscure races, so no need to validate
2494 if (!S_ISREG(inode->i_mode) &&
2495 !S_ISDIR(inode->i_mode))
2500 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/* Revalidate a dentry's inode attributes against the MDS.  Two paths:
 * (1) if the server supports getattr-by-fid (OBD_CONNECT_ATTRFID), issue
 *     an IT_GETATTR intent lock without a name; (2) otherwise, if we do
 *     not already hold an UPDATE|LOOKUP inodebits lock, do a plain
 *     mdc_getattr and refresh the inode from the reply.  Finishes with a
 *     glimpse to validate file size.
 * NOTE(review): this listing elides many lines (numbering jumps), so
 * error paths, braces and some arguments are missing below. */
2508 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2510 struct inode *inode = dentry->d_inode;
2511 struct ptlrpc_request *req = NULL;
2512 struct obd_export *exp;
/* Guard for a NULL inode — should not happen; loudly ask for a report. */
2517 CERROR("REPORT THIS LINE TO PETER\n");
2520 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2521 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2522 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
2523 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REVALIDATE, 1);
2526 exp = ll_i2mdcexp(inode);
/* Path 1: server can do getattr by fid — no name lookup needed. */
2528 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2529 struct lookup_intent oit = { .it_op = IT_GETATTR };
2530 struct mdc_op_data op_data;
2532 /* Call getattr by fid, so do not provide name at all. */
2533 ll_prepare_mdc_op_data(&op_data, dentry->d_parent->d_inode,
2534 dentry->d_inode, NULL, 0, 0, NULL);
2535 rc = mdc_intent_lock(exp, &op_data, NULL, 0,
2536 /* we are not interested in name
2539 ll_mdc_blocking_ast, 0);
2541 rc = ll_inode_revalidate_fini(inode, rc);
2545 rc = revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2547 ll_intent_release(&oit);
2551 /* Unlinked? Unhash dentry, so it is not picked up later by
2552 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2553 here to preserve get_cwd functionality on 2.6.
2555 if (!dentry->d_inode->i_nlink) {
2556 spin_lock(&dcache_lock);
2557 ll_drop_dentry(dentry);
2558 spin_unlock(&dcache_lock);
2561 ll_lookup_finish_locks(&oit, dentry);
/* Path 2: no fid-getattr support — refetch attrs only if we do not
 * already hold an UPDATE|LOOKUP MDS lock (see ll_have_md_lock()). */
2562 } else if (!ll_have_md_lock(dentry->d_inode,
2563 MDS_INODELOCK_UPDATE|MDS_INODELOCK_LOOKUP)) {
2564 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2566 obd_valid valid = OBD_MD_FLGETATTR;
/* Regular files also need striping EA; size the reply buffer for it. */
2569 if (S_ISREG(inode->i_mode)) {
2570 rc = ll_get_max_mdsize(sbi, &ealen);
2573 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2575 ll_inode2fid(&fid, inode);
2576 rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req);
2578 rc = ll_inode_revalidate_fini(inode, rc);
2582 rc = ll_prep_inode(sbi->ll_osc_exp, &inode, req, REPLY_REC_OFF,
2588 /* if object not yet allocated, don't validate size */
2589 if (ll_i2info(inode)->lli_smd == NULL)
2592 /* ll_glimpse_size will prefer locally cached writes if they extend
2594 rc = ll_glimpse_size(inode, 0);
2597 ptlrpc_req_finished(req);
2601 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
/* getattr with an explicit lookup intent: revalidate the inode via
 * ll_inode_revalidate_it(), then copy inode attributes into @stat.
 * Size/blocks are read under the lli size lock for consistency.
 * NOTE(review): some lines (braces, error return after revalidate) are
 * elided in this listing. */
2602 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2603 struct lookup_intent *it, struct kstat *stat)
2605 struct inode *inode = de->d_inode;
2608 res = ll_inode_revalidate_it(de, it);
2609 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2614 stat->dev = inode->i_sb->s_dev;
2615 stat->ino = inode->i_ino;
2616 stat->mode = inode->i_mode;
2617 stat->nlink = inode->i_nlink;
2618 stat->uid = inode->i_uid;
2619 stat->gid = inode->i_gid;
2620 stat->rdev = kdev_t_to_nr(inode->i_rdev);
2621 stat->atime = inode->i_atime;
2622 stat->mtime = inode->i_mtime;
2623 stat->ctime = inode->i_ctime;
/* Kernel-version compatibility: i_blksize field vs. i_blkbits shift. */
2624 #ifdef HAVE_INODE_BLKSIZE
2625 stat->blksize = inode->i_blksize;
2627 stat->blksize = 1<<inode->i_blkbits;
/* size/blocks must be read atomically w.r.t. concurrent size updates. */
2630 ll_inode_size_lock(inode, 0);
2631 stat->size = i_size_read(inode);
2632 stat->blocks = inode->i_blocks;
2633 ll_inode_size_unlock(inode, 0);
/* Plain ->getattr entry point: wraps ll_getattr_it() with a fresh
 * IT_GETATTR intent.  (Braces elided in this listing.) */
2637 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2639 struct lookup_intent it = { .it_op = IT_GETATTR };
2641 return ll_getattr_it(mnt, de, &it, stat);
/* POSIX ACL permission check callback (used by generic_permission()).
 * Takes a reference on the cached ACL under lli_lock, checks @mask
 * against it, and releases the reference.  Compiled out when
 * CONFIG_FS_POSIX_ACL is unset.  NOTE(review): lines elided in this
 * listing include the !CONFIG_FS_POSIX_ACL branch and return paths. */
2646 int lustre_check_acl(struct inode *inode, int mask)
2648 #ifdef CONFIG_FS_POSIX_ACL
2649 struct ll_inode_info *lli = ll_i2info(inode);
2650 struct posix_acl *acl;
/* Duplicate under the spinlock so the ACL can't be swapped under us. */
2654 spin_lock(&lli->lli_lock);
2655 acl = posix_acl_dup(lli->lli_posix_acl);
2656 spin_unlock(&lli->lli_lock);
2661 rc = posix_acl_permission(inode, acl, mask);
2662 posix_acl_release(acl);
/* ->permission for kernels >= 2.6.10: delegate entirely to
 * generic_permission() with lustre_check_acl as the ACL callback. */
2670 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2671 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2673 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2674 inode->i_ino, inode->i_generation, inode, mask);
2676 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2677 return generic_permission(inode, mask, lustre_check_acl);
/* ->permission for pre-2.6.10 kernels: open-coded owner/group/other mode
 * check with ACL fallback, then capability overrides (CAP_DAC_OVERRIDE,
 * CAP_DAC_READ_SEARCH) — mirroring the in-kernel generic logic of that
 * era.  NOTE(review): this listing elides many lines (returns, braces,
 * labels such as check_capabilities:), so the flow below is incomplete. */
2680 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
2681 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2683 int ll_inode_permission(struct inode *inode, int mask)
2686 int mode = inode->i_mode;
2689 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2690 inode->i_ino, inode->i_generation, inode, mask);
2691 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* Writes denied on read-only mounts for regular/dir/symlink inodes. */
2693 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2694 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2696 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
/* Owner check: shift mode into the "other" position for comparison. */
2698 if (current->fsuid == inode->i_uid) {
2701 if (((mode >> 3) & mask & S_IRWXO) != mask)
2703 rc = lustre_check_acl(inode, mask);
2707 goto check_capabilities;
2711 if (in_group_p(inode->i_gid))
2714 if ((mode & mask & S_IRWXO) == mask)
/* Capability overrides, as in the generic VFS permission check. */
2718 if (!(mask & MAY_EXEC) ||
2719 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2720 if (capable(CAP_DAC_OVERRIDE))
2723 if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2724 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
/* Default file_operations table (no .flock/.lock members): flock locks
 * are only locally consistent — cluster-wide coherency requires the
 * _flock variant below. */
2731 /* -o localflock - only provides locally consistent flock locks */
2732 struct file_operations ll_file_operations = {
2733 .read = ll_file_read,
2734 .write = ll_file_write,
2735 .ioctl = ll_file_ioctl,
2736 .open = ll_file_open,
2737 .release = ll_file_release,
2738 .mmap = ll_file_mmap,
2739 .llseek = ll_file_seek,
2740 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2741 .sendfile = ll_file_sendfile,
/* file_operations for "-o flock" mounts: same as the default table plus
 * cluster-coherent .flock/.lock handlers routed to ll_file_flock().
 * Which member exists depends on HAVE_F_OP_FLOCK (kernel version). */
2746 struct file_operations ll_file_operations_flock = {
2747 .read = ll_file_read,
2748 .write = ll_file_write,
2749 .ioctl = ll_file_ioctl,
2750 .open = ll_file_open,
2751 .release = ll_file_release,
2752 .mmap = ll_file_mmap,
2753 .llseek = ll_file_seek,
2754 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2755 .sendfile = ll_file_sendfile,
2758 #ifdef HAVE_F_OP_FLOCK
2759 .flock = ll_file_flock,
2761 .lock = ll_file_flock
/* file_operations for "-o noflock" mounts: lock entry points are wired
 * to ll_file_noflock so userspace lock requests fail explicitly. */
2764 /* These are for -o noflock - to return ENOSYS on flock calls */
2765 struct file_operations ll_file_operations_noflock = {
2766 .read = ll_file_read,
2767 .write = ll_file_write,
2768 .ioctl = ll_file_ioctl,
2769 .open = ll_file_open,
2770 .release = ll_file_release,
2771 .mmap = ll_file_mmap,
2772 .llseek = ll_file_seek,
2773 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2774 .sendfile = ll_file_sendfile,
2777 #ifdef HAVE_F_OP_FLOCK
2778 .flock = ll_file_noflock,
2780 .lock = ll_file_noflock
/* inode_operations for regular files: setattr/truncate, version-dependent
 * getattr vs. revalidate_it, permission, and the xattr quartet. */
2783 struct inode_operations ll_file_inode_operations = {
2784 #ifdef HAVE_VFS_INTENT_PATCHES
2785 .setattr_raw = ll_setattr_raw,
2787 .setattr = ll_setattr,
2788 .truncate = ll_truncate,
2789 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
2790 .getattr = ll_getattr,
2792 .revalidate_it = ll_inode_revalidate_it,
2794 .permission = ll_inode_permission,
2795 .setxattr = ll_setxattr,
2796 .getxattr = ll_getxattr,
2797 .listxattr = ll_listxattr,
2798 .removexattr = ll_removexattr,
/* Registry for dynamically registered ioctl handlers: a global list of
 * llioc_data entries protected by an rw_semaphore.  NOTE(review): the
 * listing elides the closing of llioc_ctl_data and the head of struct
 * llioc_data (numbering jumps 2807->2812). */
2801 /* dynamic ioctl number support routines */
2802 static struct llioc_ctl_data {
2803 struct rw_semaphore ioc_sem;
2804 struct list_head ioc_head;
2806 __RWSEM_INITIALIZER(llioc.ioc_sem),
2807 CFS_LIST_HEAD_INIT(llioc.ioc_head)
2812 struct list_head iocd_list;
/* iocd_size: total allocation size, needed for OBD_FREE on unregister. */
2813 unsigned int iocd_size;
2814 llioc_callback_t iocd_cb;
2815 unsigned int iocd_count;
/* Trailing variable-length command array (old-style [0] flexible member). */
2816 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler @cb for @count commands in @cmd.
 * Allocates an llioc_data with a trailing copy of the command array and
 * appends it to the global registry under the write semaphore.  The
 * returned pointer (elided in this listing) serves as the "magic" cookie
 * for ll_iocontrol_unregister() — presumably in_data itself; confirm
 * against the full source. */
2819 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2822 struct llioc_data *in_data = NULL;
/* Reject NULL callback/commands and out-of-range counts. */
2825 if (cb == NULL || cmd == NULL ||
2826 count > LLIOC_MAX_CMD || count < 0)
2829 size = sizeof(*in_data) + count * sizeof(unsigned int);
2830 OBD_ALLOC(in_data, size);
2831 if (in_data == NULL)
2834 memset(in_data, 0, sizeof(*in_data));
2835 in_data->iocd_size = size;
2836 in_data->iocd_cb = cb;
2837 in_data->iocd_count = count;
2838 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2840 down_write(&llioc.ioc_sem);
2841 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2842 up_write(&llioc.ioc_sem);
/* Unregister a dynamic ioctl block by its @magic cookie: find it in the
 * registry, unlink and free it.  The semaphore is dropped before
 * OBD_FREE; a warning is logged when the cookie is not found.
 * NOTE(review): the match condition (comparing tmp against magic) is
 * elided in this listing (numbering jumps 2855->2857). */
2847 void ll_iocontrol_unregister(void *magic)
2849 struct llioc_data *tmp;
2854 down_write(&llioc.ioc_sem);
2855 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Remember allocation size before unlinking, for OBD_FREE below. */
2857 unsigned int size = tmp->iocd_size;
2859 list_del(&tmp->iocd_list);
2860 up_write(&llioc.ioc_sem);
2862 OBD_FREE(tmp, size);
2866 up_write(&llioc.ioc_sem);
2868 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2871 EXPORT_SYMBOL(ll_iocontrol_register);
2872 EXPORT_SYMBOL(ll_iocontrol_unregister);
2874 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2875 unsigned int cmd, unsigned long arg, int *rcp)
2877 enum llioc_iter ret = LLIOC_CONT;
2878 struct llioc_data *data;
2879 int rc = -EINVAL, i;
2881 down_read(&llioc.ioc_sem);
2882 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2883 for (i = 0; i < data->iocd_count; i++) {
2884 if (cmd != data->iocd_cmd[i])
2887 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2891 if (ret == LLIOC_STOP)
2894 up_read(&llioc.ioc_sem);