diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c
index c955661..3a0c077 100644
--- a/lustre/llite/llite_mmap.c
+++ b/lustre/llite/llite_mmap.c
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
  * GPL HEADER START
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -28,24 +26,21 @@
 /*
  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Whamcloud, Inc.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
  * Lustre is a trademark of Sun Microsystems, Inc.
  */
-#ifndef AUTOCONF_INCLUDED
-#include <linux/config.h>
-#endif
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/smp_lock.h>
 #include <linux/unistd.h>
 #include <linux/version.h>
-#include <asm/system.h>
 #include <asm/uaccess.h>
 
 #include <linux/fs.h>
@@ -53,21 +48,15 @@
 #include <asm/uaccess.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 
 #define DEBUG_SUBSYSTEM S_LLITE
 
-//#include <lustre_mdc.h>
 #include <lustre_lite.h>
 #include "llite_internal.h"
 #include <linux/lustre_compat25.h>
 
-#define VMA_DEBUG(vma, fmt, arg...)                                           \
-        CDEBUG(D_MMAP, "vma(%p) start(%ld) end(%ld) pgoff(%ld) inode(%p) "    \
-               "ino(%lu) iname(%s): " fmt, vma, vma->vm_start, vma->vm_end,   \
-               vma->vm_pgoff, vma->vm_file->f_dentry->d_inode,                \
-               vma->vm_file->f_dentry->d_inode->i_ino,                        \
-               vma->vm_file->f_dentry->d_iname, ## arg);                      \
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+                       int *type);
 
 static struct vm_operations_struct ll_file_vm_ops;
 
@@ -81,17 +70,15 @@ void policy_from_vma(ldlm_policy_data_t *policy,
                      ~CFS_PAGE_MASK;
 }
 
-struct vm_area_struct * our_vma(unsigned long addr, size_t count)
+struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
+                               size_t count)
 {
-        struct mm_struct *mm = current->mm;
         struct vm_area_struct *vma, *ret = NULL;
         ENTRY;
 
-        /* No MM (e.g. NFS)? No vmas too. */
-        if (!mm)
-                RETURN(NULL);
+        /* mmap_sem must be held by the caller. */
+        LASSERT(!down_write_trylock(&mm->mmap_sem));
 
-        spin_lock(&mm->page_table_lock);
         for(vma = find_vma(mm, addr);
             vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) {
                 if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops &&
@@ -100,7 +87,6 @@ struct vm_area_struct * our_vma(unsigned long addr, size_t count)
                         break;
                 }
         }
-        spin_unlock(&mm->page_table_lock);
        RETURN(ret);
 }
 
@@ -116,19 +102,21 @@ struct vm_area_struct * our_vma(unsigned long addr, size_t count)
 * \retval EINVAL if the env can't be allocated
 * \return other error codes from cl_io_init.
 */
-int ll_fault_io_init(struct vm_area_struct *vma, struct lu_env **env_ret,
-                     struct cl_env_nest *nest, pgoff_t index, unsigned long *ra_flags)
+struct cl_io *ll_fault_io_init(struct vm_area_struct *vma,
+                               struct lu_env **env_ret,
+                               struct cl_env_nest *nest,
+                               pgoff_t index, unsigned long *ra_flags)
 {
        struct file *file = vma->vm_file;
        struct inode *inode = file->f_dentry->d_inode;
-        const unsigned long writable = VM_SHARED|VM_WRITE;
        struct cl_io *io;
        struct cl_fault_io *fio;
        struct lu_env *env;
        ENTRY;
 
+        *env_ret = NULL;
        if (ll_file_nolock(file))
-                RETURN(-EOPNOTSUPP);
+                RETURN(ERR_PTR(-EOPNOTSUPP));
 
        /*
         * page fault can be called when lustre IO is
@@ -138,20 +126,17 @@ int ll_fault_io_init(struct vm_area_struct *vma, struct lu_env **env_ret,
         * one.
        */
        env = cl_env_nested_get(nest);
-        if (IS_ERR(env)) {
-                *env_ret = NULL;
-                RETURN(-EINVAL);
-        }
+        if (IS_ERR(env))
+                RETURN(ERR_PTR(-EINVAL));
 
        *env_ret = env;
 
-        io = &ccc_env_info(env)->cti_io;
+        io = ccc_env_thread_io(env);
        io->ci_obj = ll_i2info(inode)->lli_clob;
        LASSERT(io->ci_obj != NULL);
 
        fio = &io->u.ci_fault;
        fio->ft_index = index;
-        fio->ft_writable = (vma->vm_flags&writable) == writable;
        fio->ft_executable = vma->vm_flags&VM_EXEC;
 
        /*
@@ -159,12 +144,13 @@ int ll_fault_io_init(struct vm_area_struct *vma, struct lu_env **env_ret,
         * the kernel will not read other pages not covered by ldlm in
         * filemap_nopage. We do our readahead in ll_readpage.
         */
-        *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
+        if (ra_flags != NULL)
+                *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
        vma->vm_flags &= ~VM_SEQ_READ;
        vma->vm_flags |= VM_RAND_READ;
 
-        CDEBUG(D_INFO, "vm_flags: %lx (%lu %d %d)\n", vma->vm_flags,
-               fio->ft_index, fio->ft_writable, fio->ft_executable);
+        CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
+               fio->ft_index, fio->ft_executable);
 
        if (cl_io_init(env, io, CIT_FAULT, io->ci_obj) == 0) {
                struct ccc_io *cio = ccc_env_io(env);
@@ -179,9 +165,116 @@ int ll_fault_io_init(struct vm_area_struct *vma, struct lu_env **env_ret,
                cio->cui_fd = fd;
        }
 
-        return io->ci_result;
+        return io;
+}
+
+/* Code shared by the page_mkwrite() implementations for RHEL5 and RHEL6 */
+static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
+                            bool *retry)
+{
+        struct lu_env *env;
+        struct cl_io *io;
+        struct vvp_io *vio;
+        struct cl_env_nest nest;
+        int result;
+        cfs_sigset_t set;
+        struct inode *inode;
+        struct ll_inode_info *lli;
+        ENTRY;
+
+        LASSERT(vmpage != NULL);
+
+        io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL);
+        if (IS_ERR(io))
+                GOTO(out, result = PTR_ERR(io));
+
+        result = io->ci_result;
+        if (result < 0)
+                GOTO(out, result);
+
+        /* Don't enqueue new locks for page_mkwrite().
+         * If the lock has been cancelled then the page must have been
+         * truncated; in that case the kernel will handle it.
+         */
+        io->ci_lockreq = CILR_PEEK;
+        io->u.ci_fault.ft_mkwrite = 1;
+        io->u.ci_fault.ft_writable = 1;
+
+        vio = vvp_env_io(env);
+        vio->u.fault.ft_vma = vma;
+        vio->u.fault.ft_vmpage = vmpage;
+
+        set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
+
+        /* We grab lli_trunc_sem to exclude a concurrent truncate;
+         * otherwise we could add dirty pages into the osc cache
+         * while truncate is ongoing. */
+        inode = ccc_object_inode(io->ci_obj);
+        lli = ll_i2info(inode);
+        cfs_down_read(&lli->lli_trunc_sem);
+
+        result = cl_io_loop(env, io);
+
+        cfs_up_read(&lli->lli_trunc_sem);
+
+        cfs_restore_sigs(set);
+
+        if (result == -ENODATA) /* peek failed, no lock caching. */
+                CDEBUG(D_MMAP, "race on page_mkwrite: %lx (%lu %p)\n",
+                       vma->vm_flags, io->u.ci_fault.ft_index, vmpage);
+
+        if (result == 0 || result == -ENODATA) {
+                lock_page(vmpage);
+                if (vmpage->mapping == NULL) {
+                        unlock_page(vmpage);
+
+                        /* page was truncated and lock was cancelled, return
+                         * ENODATA so that VM_FAULT_NOPAGE will be returned
+                         * to handle_mm_fault(). */
+                        if (result == 0)
+                                result = -ENODATA;
+                } else if (result == -ENODATA) {
+                        /* Invalidate it if the cl_lock is being revoked.
+                         * This piece of code is definitely needed for RHEL5;
+                         * otherwise SIGBUS would wrongly be returned to
+                         * applications.
+                         */
+                        write_one_page(vmpage, 1);
+                        lock_page(vmpage);
+                        if (vmpage->mapping != NULL) {
+                                ll_invalidate_page(vmpage);
+                                LASSERT(vmpage->mapping == NULL);
+                        }
+                        unlock_page(vmpage);
+                } else if (!PageDirty(vmpage)) {
+                        /* Race: the page was cleaned by ptlrpcd after it was
+                         * unlocked, so it has to be added into the dirty
+                         * cache again; otherwise this soon-to-be-dirty page
+                         * won't consume any grant and, even worse, if it is
+                         * being transferred the redirty will break the RPC
+                         * checksum.
+                         */
+                        unlock_page(vmpage);
+
+                        CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has "
+                               "been written out, retry.\n",
+                               vmpage, vmpage->index);
+
+                        *retry = true;
+                        result = -EAGAIN;
+                }
+        }
+        EXIT;
+
+out:
+        cl_io_fini(env, io);
+        cl_env_nested_put(&nest, env);
+
+        CDEBUG(D_MMAP, "%s mkwrite with %d\n", cfs_current()->comm, result);
+
+        LASSERT(ergo(result == 0, PageLocked(vmpage)));
+        return(result);
 }
+
 #ifndef HAVE_VM_OP_FAULT
 /**
  * Lustre implementation of a vm_operations_struct::nopage() method, called by
@@ -203,43 +296,45 @@ struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
        struct lu_env *env;
        struct cl_env_nest nest;
        struct cl_io *io;
-        struct page *page;
-        struct vvp_io *vio;
+        struct page *page = NOPAGE_SIGBUS;
+        struct vvp_io *vio = NULL;
        unsigned long ra_flags;
        pgoff_t pg_offset;
        int result;
+        const unsigned long writable = VM_SHARED|VM_WRITE;
+        cfs_sigset_t set;
        ENTRY;
 
        pg_offset = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-        result = ll_fault_io_init(vma, &env, &nest, pg_offset, &ra_flags);
-        if (env == NULL)
+        io = ll_fault_io_init(vma, &env, &nest, pg_offset, &ra_flags);
+        if (IS_ERR(io))
                return NOPAGE_SIGBUS;
 
-        io = &ccc_env_info(env)->cti_io;
+        result = io->ci_result;
        if (result < 0)
                goto out_err;
 
+        io->u.ci_fault.ft_writable = (vma->vm_flags&writable) == writable;
+
        vio = vvp_env_io(env);
        vio->u.fault.ft_vma = vma;
-        vio->u.fault.ft_vmpage = NULL;
        vio->u.fault.nopage.ft_address = address;
        vio->u.fault.nopage.ft_type = type;
+        vio->u.fault.ft_vmpage = NULL;
 
-        result = cl_io_loop(env, io);
+        set = cfs_block_sigsinv(sigmask(SIGKILL)|sigmask(SIGTERM));
+        result = cl_io_loop(env, io);
+        cfs_restore_sigs(set);
 
-        page = vio->u.fault.ft_vmpage;
-        if (page != NULL) {
-                LASSERT(PageLocked(page));
-                unlock_page(page);
+        page = vio->u.fault.ft_vmpage;
+        if (result != 0 && page != NULL) {
+                page_cache_release(page);
+                page = NOPAGE_SIGBUS;
+        }
 
-                if (result != 0)
-                        page_cache_release(page);
-        }
-
-        LASSERT(ergo(result == 0, io->u.ci_fault.ft_page != NULL));
 out_err:
-        if (result != 0)
-                page = result == -ENOMEM ? NOPAGE_OOM : NOPAGE_SIGBUS;
+        if (result == -ENOMEM)
+                page = NOPAGE_OOM;
 
        vma->vm_flags &= ~VM_RAND_READ;
        vma->vm_flags |= ra_flags;
@@ -249,7 +344,28 @@ out_err:
 
        RETURN(page);
 }
+
 #else
+
+static inline int to_fault_error(int result)
+{
+        switch(result) {
+        case 0:
+                result = VM_FAULT_LOCKED;
+                break;
+        case -EFAULT:
+                result = VM_FAULT_NOPAGE;
+                break;
+        case -ENOMEM:
+                result = VM_FAULT_OOM;
+                break;
+        default:
+                result = VM_FAULT_SIGBUS;
+                break;
+        }
+        return result;
+}
+
 /**
  * Lustre implementation of a vm_operations_struct::fault() method, called by
  * the VM to serve page faults (both in kernel and user space).
@@ -261,54 +377,164 @@ out_err:
 * \retval VM_FAULT_ERROR on general error
 * \retval NOPAGE_OOM when there is no memory to allocate a new page
 */
-int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct lu_env *env;
        struct cl_io *io;
-        struct vvp_io *vio;
+        struct vvp_io *vio = NULL;
+        struct page *vmpage;
        unsigned long ra_flags;
        struct cl_env_nest nest;
        int result;
        int fault_ret = 0;
        ENTRY;
 
-        result = ll_fault_io_init(vma, &env, &nest, vmf->pgoff, &ra_flags);
-        if (env == NULL)
-                RETURN(VM_FAULT_ERROR);
+        io = ll_fault_io_init(vma, &env, &nest, vmf->pgoff, &ra_flags);
+        if (IS_ERR(io))
+                RETURN(to_fault_error(PTR_ERR(io)));
+
+        result = io->ci_result;
+        if (result == 0) {
+                vio = vvp_env_io(env);
+                vio->u.fault.ft_vma = vma;
+                vio->u.fault.ft_vmpage = NULL;
+                vio->u.fault.fault.ft_vmf = vmf;
+
+                result = cl_io_loop(env, io);
+
+                fault_ret = vio->u.fault.fault.ft_flags;
+                vmpage = vio->u.fault.ft_vmpage;
+                if (result != 0 && vmpage != NULL) {
+                        page_cache_release(vmpage);
+                        vmf->page = NULL;
+                }
+        }
+        cl_io_fini(env, io);
+        cl_env_nested_put(&nest, env);
 
-        io = &ccc_env_info(env)->cti_io;
-        if (result < 0)
-                goto out_err;
+        vma->vm_flags |= ra_flags;
+        if (result != 0 && !(fault_ret & VM_FAULT_RETRY))
+                fault_ret |= to_fault_error(result);
 
-        vio = vvp_env_io(env);
-        vio->u.fault.ft_vma = vma;
-        vio->u.fault.ft_vmpage = NULL;
-        vio->u.fault.fault.ft_vmf = vmf;
+        CDEBUG(D_MMAP, "%s fault %d/%d\n",
+               cfs_current()->comm, fault_ret, result);
+        RETURN(fault_ret);
+}
 
-        result = cl_io_loop(env, io);
-        if (unlikely(result != 0 && vio->u.fault.ft_vmpage != NULL)) {
-                struct page *vmpage = vio->u.fault.ft_vmpage;
+static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+        int count = 0;
+        bool printed = false;
+        int result;
+        cfs_sigset_t set;
+
+        /* Only SIGKILL and SIGTERM are allowed for fault/nopage/mkwrite,
+         * so that the process can be killed by an admin but a segfault
+         * is not triggered by other signals.
+         */
+        set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
+
+restart:
+        result = ll_fault0(vma, vmf);
+        LASSERT(!(result & VM_FAULT_LOCKED));
+        if (result == 0) {
+                struct page *vmpage = vmf->page;
+
+                /* check if this page has been truncated */
+                lock_page(vmpage);
+                if (unlikely(vmpage->mapping == NULL)) { /* unlucky */
+                        unlock_page(vmpage);
+                        page_cache_release(vmpage);
+                        vmf->page = NULL;
+
+                        if (!printed && ++count > 16) {
+                                CWARN("the page is under heavy contention, "
+                                      "maybe your app(%s) needs revising :-)\n",
+                                      current->comm);
+                                printed = true;
+                        }
+
+                        goto restart;
+                }
 
-        LASSERT((vio->u.fault.fault.ft_flags & VM_FAULT_LOCKED) &&
-                PageLocked(vmpage));
-        unlock_page(vmpage);
-        page_cache_release(vmpage);
-        vmf->page = NULL;
+                result |= VM_FAULT_LOCKED;
        }
+        cfs_restore_sigs(set);
+        return result;
+}
+#endif
 
-        fault_ret = vio->u.fault.fault.ft_flags;
-out_err:
-        if (result != 0)
-                fault_ret |= VM_FAULT_ERROR;
-
-        vma->vm_flags |= ra_flags;
+#ifndef HAVE_PGMKWRITE_USE_VMFAULT
+static int ll_page_mkwrite(struct vm_area_struct *vma, struct page *vmpage)
+{
+        int count = 0;
+        bool printed = false;
+        bool retry;
+        int result;
+
+        do {
+                retry = false;
+                result = ll_page_mkwrite0(vma, vmpage, &retry);
+
+                if (!printed && ++count > 16) {
+                        CWARN("app(%s): the page %lu of file %lu is under "
+                              "heavy contention.\n",
+                              current->comm, page_index(vmpage),
+                              vma->vm_file->f_dentry->d_inode->i_ino);
+                        printed = true;
+                }
+        } while (retry);
 
-        cl_io_fini(env, io);
-        cl_env_nested_put(&nest, env);
+        if (result == 0)
+                unlock_page(vmpage);
+        else if (result == -ENODATA)
+                result = 0; /* the kernel will know truncate has happened
+                             * and retry */
 
-        RETURN(fault_ret);
+        return result;
 }
+#else
+static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+        int count = 0;
+        bool printed = false;
+        bool retry;
+        int result;
+
+        do {
+                retry = false;
+                result = ll_page_mkwrite0(vma, vmf->page, &retry);
+
+                if (!printed && ++count > 16) {
+                        CWARN("app(%s): the page %lu of file %lu is under "
+                              "heavy contention.\n",
+                              current->comm, vmf->pgoff,
+                              vma->vm_file->f_dentry->d_inode->i_ino);
+                        printed = true;
+                }
+        } while (retry);
+
+        switch(result) {
+        case 0:
+                LASSERT(PageLocked(vmf->page));
+                result = VM_FAULT_LOCKED;
+                break;
+        case -ENODATA:
+        case -EFAULT:
+                result = VM_FAULT_NOPAGE;
+                break;
+        case -ENOMEM:
+                result = VM_FAULT_OOM;
+                break;
+        case -EAGAIN:
+                result = VM_FAULT_RETRY;
+                break;
+        default:
+                result = VM_FAULT_SIGBUS;
+                break;
+        }
+        return result;
+}
 #endif
 
 /**
@@ -385,14 +611,18 @@ int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
 
 static struct vm_operations_struct ll_file_vm_ops = {
 #ifndef HAVE_VM_OP_FAULT
-        .nopage = ll_nopage,
-        .populate = ll_populate,
-
+        .nopage         = ll_nopage,
+        .populate       = ll_populate,
+#else
+        .fault          = ll_fault,
+#endif
+#ifndef HAVE_PGMKWRITE_COMPACT
+        .page_mkwrite   = ll_page_mkwrite,
 #else
-        .fault = ll_fault,
+        ._pmkw.page_mkwrite = ll_page_mkwrite,
 #endif
-        .open = ll_vm_open,
-        .close = ll_vm_close,
+        .open           = ll_vm_open,
+        .close          = ll_vm_close,
 };
 
 int ll_file_mmap(struct file *file, struct vm_area_struct * vma)
@@ -414,7 +644,7 @@ int ll_file_mmap(struct file *file, struct vm_area_struct * vma)
                vma->vm_ops = &ll_file_vm_ops;
                vma->vm_ops->open(vma);
                /* update the inode's size and mtime */
-                rc = cl_glimpse_size(inode);
+                rc = ll_glimpse_size(inode);
        }
 
        RETURN(rc);
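
Context on the new ->page_mkwrite path added by this patch: the hook runs on the first store to each clean page of a shared writable mapping, which is what lets ll_page_mkwrite0() reserve grant and register the page as dirty in the osc cache before writeback. The sketch below is a minimal userspace reproducer of that sequence (read fault, then mkwrite, then writeback). It is illustrative only and not part of the patch; the /mnt/lustre path and file name are assumptions, and any file on a Lustre client mount will do.

/* mkwrite_demo.c: hypothetical reproducer, not part of this patch.
 * Build: cc -o mkwrite_demo mkwrite_demo.c */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/mnt/lustre/mkwrite_demo"; /* assumed mount */
        size_t len = 4096;
        int fd = open(path, O_RDWR | O_CREAT, 0644);

        if (fd < 0 || ftruncate(fd, (off_t)len) < 0) {
                perror("open/ftruncate");
                return EXIT_FAILURE;
        }

        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return EXIT_FAILURE;
        }

        volatile char c = p[0];  /* read fault: serviced by ll_fault()
                                  * (or ll_nopage() on older kernels) */
        (void)c;
        p[0] = 'x';              /* first store to the clean page drives
                                  * ->page_mkwrite, i.e. ll_page_mkwrite0() */

        msync(p, len, MS_SYNC);  /* force writeback of the dirtied page */
        munmap(p, len);
        close(fd);
        return EXIT_SUCCESS;
}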