X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fllite%2Fllite_close.c;h=14c76a0f26876f940bd0c635201f714d8ad3cb65;hb=7c56e3c31e230be251a9aaffce4fc0d2d31aa679;hp=c2137812a098a1708a149db75ee27105eada5e37;hpb=7ce2000eb0f4e7b7ea1f362c17099881098cfef7;p=fs%2Flustre-release.git diff --git a/lustre/llite/llite_close.c b/lustre/llite/llite_close.c index c213781..14c76a0 100644 --- a/lustre/llite/llite_close.c +++ b/lustre/llite/llite_close.c @@ -1,177 +1,301 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Lustre Lite routines to issue a secondary close after writeback + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf * - * This file is part of Lustre, http://www.lustre.org. + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * lustre/llite/llite_close.c * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * Lustre Lite routines to issue a secondary close after writeback */ #include #define DEBUG_SUBSYSTEM S_LLITE -#include -#include +//#include +#include #include "llite_internal.h" -/* record that a write is in flight */ -void llap_write_pending(struct inode *inode, struct ll_async_page *llap) +/** records that a write is in flight */ +void vvp_write_pending(struct ccc_object *club, struct ccc_page *page) { - struct ll_inode_info *lli = ll_i2info(inode); - spin_lock(&lli->lli_lock); - list_add(&llap->llap_pending_write, &lli->lli_pending_write_llaps); - spin_unlock(&lli->lli_lock); -} + struct ll_inode_info *lli = ll_i2info(club->cob_inode); -/* record that a write has completed */ -void llap_write_complete(struct inode *inode, struct ll_async_page *llap) -{ - struct ll_inode_info *lli = ll_i2info(inode); - spin_lock(&lli->lli_lock); - if (!list_empty(&llap->llap_pending_write)) - list_del_init(&llap->llap_pending_write); - spin_unlock(&lli->lli_lock); -} - -void ll_open_complete(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); + ENTRY; spin_lock(&lli->lli_lock); - lli->lli_send_done_writing = 0; + lli->lli_flags |= LLIF_SOM_DIRTY; + if (page != NULL && list_empty(&page->cpg_pending_linkage)) + list_add(&page->cpg_pending_linkage, &club->cob_pending_list); spin_unlock(&lli->lli_lock); + EXIT; } -/* if we close with writes in flight then we want the completion or cancelation - * of those writes to send a DONE_WRITING rpc to the MDS */ -int ll_is_inode_dirty(struct inode *inode) +/** records that a write has completed */ +void vvp_write_complete(struct ccc_object *club, struct ccc_page *page) { - struct ll_inode_info *lli = ll_i2info(inode); + struct ll_inode_info *lli = ll_i2info(club->cob_inode); int rc = 0; - ENTRY; + ENTRY; spin_lock(&lli->lli_lock); - if (!list_empty(&lli->lli_pending_write_llaps)) + if (page != NULL && !list_empty(&page->cpg_pending_linkage)) { + list_del_init(&page->cpg_pending_linkage); rc = 1; + } spin_unlock(&lli->lli_lock); - RETURN(rc); + if (rc) + ll_queue_done_writing(club->cob_inode, 0); + EXIT; } -void ll_try_done_writing(struct inode *inode) +/** Queues DONE_WRITING if + * - done writing is allowed; + * - inode has no no dirty pages; */ +void ll_queue_done_writing(struct inode *inode, unsigned long flags) { struct ll_inode_info *lli = ll_i2info(inode); - struct ll_close_queue *lcq = ll_i2sbi(inode)->ll_lcq; spin_lock(&lli->lli_lock); + lli->lli_flags |= flags; - if (lli->lli_send_done_writing && + if ((lli->lli_flags & LLIF_DONE_WRITING) && list_empty(&lli->lli_pending_write_llaps)) { - + struct ll_close_queue *lcq = ll_i2sbi(inode)->ll_lcq; + + if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) + CWARN("ino %lu/%u(flags %lu) som valid it just after " + "recovery\n", + inode->i_ino, inode->i_generation, + lli->lli_flags); + /* DONE_WRITING is allowed and inode has no dirty page. */ spin_lock(&lcq->lcq_lock); - if (list_empty(&lli->lli_close_item)) { - CDEBUG(D_INODE, "adding inode %lu/%u to close list\n", - inode->i_ino, inode->i_generation); - LASSERT(igrab(inode) == inode); - list_add_tail(&lli->lli_close_item, &lcq->lcq_list); - wake_up(&lcq->lcq_waitq); - } + + LASSERT(list_empty(&lli->lli_close_list)); + CDEBUG(D_INODE, "adding inode %lu/%u to close list\n", + inode->i_ino, inode->i_generation); + list_add_tail(&lli->lli_close_list, &lcq->lcq_head); + + /* Avoid a concurrent insertion into the close thread queue: + * an inode is already in the close thread, open(), write(), + * close() happen, epoch is closed as the inode is marked as + * LLIF_EPOCH_PENDING. When pages are written inode should not + * be inserted into the queue again, clear this flag to avoid + * it. */ + lli->lli_flags &= ~LLIF_DONE_WRITING; + + wake_up(&lcq->lcq_waitq); spin_unlock(&lcq->lcq_lock); } - spin_unlock(&lli->lli_lock); } -/* The MDS needs us to get the real file attributes, then send a DONE_WRITING */ -void ll_queue_done_writing(struct inode *inode) +/** Closes epoch and sends Size-on-MDS attribute update if possible. Call + * this under ll_inode_info::lli_lock spinlock. */ +void ll_epoch_close(struct inode *inode, struct md_op_data *op_data, + struct obd_client_handle **och, unsigned long flags) { struct ll_inode_info *lli = ll_i2info(inode); ENTRY; spin_lock(&lli->lli_lock); - lli->lli_send_done_writing = 1; - spin_unlock(&lli->lli_lock); + if (!(list_empty(&lli->lli_pending_write_llaps))) { + if (!(lli->lli_flags & LLIF_EPOCH_PENDING)) { + LASSERT(*och != NULL); + LASSERT(lli->lli_pending_och == NULL); + /* Inode is dirty and there is no pending write done + * request yet, DONE_WRITE is to be sent later. */ + lli->lli_flags |= LLIF_EPOCH_PENDING; + lli->lli_pending_och = *och; + spin_unlock(&lli->lli_lock); + + inode = igrab(inode); + LASSERT(inode); + GOTO(out, 0); + } + if (flags & LLIF_DONE_WRITING) { + /* Some pages are still dirty, it is early to send + * DONE_WRITE. Wait untill all pages will be flushed + * and try DONE_WRITE again later. */ + LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING)); + lli->lli_flags |= LLIF_DONE_WRITING; + spin_unlock(&lli->lli_lock); + + inode = igrab(inode); + LASSERT(inode); + GOTO(out, 0); + } + } + CDEBUG(D_INODE, "Epoch "LPU64" closed on "DFID"\n", + ll_i2info(inode)->lli_ioepoch, PFID(&lli->lli_fid)); + op_data->op_flags |= MF_EPOCH_CLOSE; + + if (flags & LLIF_DONE_WRITING) { + LASSERT(lli->lli_flags & LLIF_SOM_DIRTY); + LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING)); + *och = lli->lli_pending_och; + lli->lli_pending_och = NULL; + lli->lli_flags &= ~LLIF_EPOCH_PENDING; + } else { + /* Pack Size-on-MDS inode attributes only if they has changed */ + if (!(lli->lli_flags & LLIF_SOM_DIRTY)) { + spin_unlock(&lli->lli_lock); + GOTO(out, 0); + } - ll_try_done_writing(inode); + /* There is a pending DONE_WRITE -- close epoch with no + * attribute change. */ + if (lli->lli_flags & LLIF_EPOCH_PENDING) { + spin_unlock(&lli->lli_lock); + GOTO(out, 0); + } + } + + LASSERT(list_empty(&lli->lli_pending_write_llaps)); + lli->lli_flags &= ~LLIF_SOM_DIRTY; + spin_unlock(&lli->lli_lock); + op_data->op_flags |= MF_SOM_CHANGE; + + /* Check if Size-on-MDS attributes are valid. */ + if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) + CWARN("ino %lu/%u(flags %lu) som valid it just after " + "recovery\n", + inode->i_ino, inode->i_generation, lli->lli_flags); + + if (!cl_local_size(inode)) { + /* Send Size-on-MDS Attributes if valid. Atime is sent along + * with all the attributes. */ + op_data->op_attr.ia_valid |= ATTR_MTIME_SET | ATTR_CTIME_SET | + ATTR_ATIME_SET | ATTR_SIZE | ATTR_BLOCKS; + } EXIT; +out: + return; } -#if 0 -/* If we know the file size and have the cookies: - * - send a DONE_WRITING rpc - * - * Otherwise: - * - get a whole-file lock - * - get the authoritative size and all cookies with GETATTRs - * - send a DONE_WRITING rpc - */ -static void ll_close_done_writing(struct inode *inode) +int ll_sizeonmds_update(struct inode *inode, struct lustre_handle *fh, + __u64 ioepoch) { struct ll_inode_info *lli = ll_i2info(inode); - ldlm_policy_data_t policy = { .l_extent = {0, OBD_OBJECT_EOF } }; - struct lustre_handle lockh = { 0 }; - struct obdo obdo; - obd_flag valid; - int rc, ast_flags = 0; + struct md_op_data *op_data; + struct obdo *oa; + int rc; ENTRY; - memset(&obdo, 0, sizeof(obdo)); - if (test_bit(LLI_F_HAVE_OST_SIZE_LOCK, &lli->lli_flags)) - goto rpc; - - rc = ll_extent_lock(NULL, inode, lli->lli_smd, LCK_PW, &policy, &lockh, - ast_flags); - if (rc != 0) { - CERROR("lock acquisition failed (%d): unable to send " - "DONE_WRITING for inode %lu/%u\n", rc, inode->i_ino, - inode->i_generation); - GOTO(out, rc); + /* LASSERT(!(lli->lli_flags & LLIF_MDS_SIZE_LOCK)); */ + /* After recovery that can be valid. */ + if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) + CWARN("ino %lu/%u(flags %lu) som valid it just after " + "recovery\n", inode->i_ino, inode->i_generation, + lli->lli_flags); + + OBDO_ALLOC(oa); + OBD_ALLOC_PTR(op_data); + if (!oa || !op_data) { + CERROR("can't allocate memory for Size-on-MDS update.\n"); + RETURN(-ENOMEM); } - - rc = ll_lsm_getattr(ll_i2obdexp(inode), lli->lli_smd, &obdo); - if (rc) { - CERROR("inode_getattr failed (%d): unable to send DONE_WRITING " - "for inode %lu/%u\n", rc, inode->i_ino, - inode->i_generation); - ll_extent_unlock(NULL, inode, lli->lli_smd, LCK_PW, &lockh); + rc = ll_inode_getattr(inode, oa); + if (rc == -ENOENT) { + oa->o_valid = 0; + CDEBUG(D_INODE, "objid "LPX64" is already destroyed\n", + lli->lli_smd->lsm_object_id); + } else if (rc) { + CERROR("inode_getattr failed (%d): unable to send a " + "Size-on-MDS attribute update for inode %lu/%u\n", + rc, inode->i_ino, inode->i_generation); GOTO(out, rc); } + CDEBUG(D_INODE, "Size-on-MDS update on "DFID"\n", PFID(&lli->lli_fid)); - obdo_refresh_inode(inode, &obdo, valid); + md_from_obdo(op_data, oa, oa->o_valid); + memcpy(&op_data->op_handle, fh, sizeof(*fh)); - CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n", - lli->lli_smd->lsm_object_id, inode->i_size, inode->i_blocks, - inode->i_blksize); + op_data->op_ioepoch = ioepoch; + op_data->op_flags |= MF_SOM_CHANGE; - set_bit(LLI_F_HAVE_OST_SIZE_LOCK, &lli->lli_flags); + rc = ll_md_setattr(inode, op_data, NULL); + EXIT; +out: + if (oa) + OBDO_FREE(oa); + if (op_data) + ll_finish_md_op_data(op_data); + return rc; +} + +/** Sends a DONE_WRITING rpc, packs Size-on-MDS attributes into it, if + * possible */ +static void ll_done_writing(struct inode *inode) +{ + struct obd_client_handle *och = NULL; + struct md_op_data *op_data; + int rc; + ENTRY; - rc = ll_extent_unlock(NULL, inode, lli->lli_smd, LCK_PW, &lockh); - if (rc != ELDLM_OK) - CERROR("unlock failed (%d)? proceeding anyways...\n", rc); + LASSERT(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM); - rpc: - obdo.o_id = inode->i_ino; - obdo.o_size = inode->i_size; - obdo.o_blocks = inode->i_blocks; - obdo.o_valid = OBD_MD_FLID | OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) { + CERROR("can't allocate op_data\n"); + EXIT; + return; + } - rc = mdc_done_writing(ll_i2sbi(inode)->ll_mdc_exp, &obdo); - out: + ll_epoch_close(inode, op_data, &och, LLIF_DONE_WRITING); + /* If there is no @och, we do not do D_W yet. */ + if (och == NULL) + GOTO(out, 0); + + ll_pack_inode2opdata(inode, op_data, &och->och_fh); + + rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, NULL); + if (rc == -EAGAIN) { + /* MDS has instructed us to obtain Size-on-MDS attribute from + * OSTs and send setattr to back to MDS. */ + rc = ll_sizeonmds_update(inode, &och->och_fh, + op_data->op_ioepoch); + } else if (rc) { + CERROR("inode %lu mdc done_writing failed: rc = %d\n", + inode->i_ino, rc); + } +out: + ll_finish_md_op_data(op_data); + if (och) { + md_clear_open_replay_data(ll_i2sbi(inode)->ll_md_exp, och); + OBD_FREE_PTR(och); + } + EXIT; } -#endif static struct ll_inode_info *ll_close_next_lli(struct ll_close_queue *lcq) { @@ -179,13 +303,12 @@ static struct ll_inode_info *ll_close_next_lli(struct ll_close_queue *lcq) spin_lock(&lcq->lcq_lock); - if (lcq->lcq_list.next == NULL) - lli = ERR_PTR(-1); - else if (!list_empty(&lcq->lcq_list)) { - lli = list_entry(lcq->lcq_list.next, struct ll_inode_info, - lli_close_item); - list_del(&lli->lli_close_item); - } + if (!list_empty(&lcq->lcq_head)) { + lli = list_entry(lcq->lcq_head.next, struct ll_inode_info, + lli_close_list); + list_del_init(&lli->lli_close_list); + } else if (atomic_read(&lcq->lcq_stop)) + lli = ERR_PTR(-EALREADY); spin_unlock(&lcq->lcq_lock); return lli; @@ -196,16 +319,10 @@ static int ll_close_thread(void *arg) struct ll_close_queue *lcq = arg; ENTRY; - /* XXX boiler-plate */ { - char name[sizeof(current->comm)]; - unsigned long flags; + char name[CFS_CURPROC_COMM_MAX]; snprintf(name, sizeof(name) - 1, "ll_close"); - kportal_daemonize(name); - SIGNAL_MASK_LOCK(current, flags); - sigfillset(¤t->blocked); - RECALC_SIGPENDING; - SIGNAL_MASK_UNLOCK(current, flags); + cfs_daemonize(name); } complete(&lcq->lcq_comp); @@ -213,7 +330,7 @@ static int ll_close_thread(void *arg) while (1) { struct l_wait_info lwi = { 0 }; struct ll_inode_info *lli; - //struct inode *inode; + struct inode *inode; l_wait_event_exclusive(lcq->lcq_waitq, (lli = ll_close_next_lli(lcq)) != NULL, @@ -221,11 +338,14 @@ static int ll_close_thread(void *arg) if (IS_ERR(lli)) break; - //inode = ll_info2i(lli); - //ll_close_done_writing(inode); - //iput(inode); + inode = ll_info2i(lli); + CDEBUG(D_INFO, "done_writting for inode %lu/%u\n", + inode->i_ino, inode->i_generation); + ll_done_writing(inode); + iput(inode); } + CDEBUG(D_INFO, "ll_close exiting\n"); complete(&lcq->lcq_comp); RETURN(0); } @@ -235,12 +355,15 @@ int ll_close_thread_start(struct ll_close_queue **lcq_ret) struct ll_close_queue *lcq; pid_t pid; + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CLOSE_THREAD)) + return -EINTR; + OBD_ALLOC(lcq, sizeof(*lcq)); if (lcq == NULL) return -ENOMEM; spin_lock_init(&lcq->lcq_lock); - INIT_LIST_HEAD(&lcq->lcq_list); + INIT_LIST_HEAD(&lcq->lcq_head); init_waitqueue_head(&lcq->lcq_waitq); init_completion(&lcq->lcq_comp); @@ -258,7 +381,7 @@ int ll_close_thread_start(struct ll_close_queue **lcq_ret) void ll_close_thread_shutdown(struct ll_close_queue *lcq) { init_completion(&lcq->lcq_comp); - lcq->lcq_list.next = NULL; + atomic_inc(&lcq->lcq_stop); wake_up(&lcq->lcq_waitq); wait_for_completion(&lcq->lcq_comp); OBD_FREE(lcq, sizeof(*lcq));