Whamcloud - gitweb
b=15908
[fs/lustre-release.git] / lustre / llite / llite_lib.c
index 43651c6..0c32ba0 100644 (file)
@@ -35,6 +35,7 @@
 #include <lustre_disk.h>
 #include <lustre_param.h>
 #include <lustre_log.h>
+#include <obd_cksum.h>
 #include "llite_internal.h"
 
 cfs_mem_cache_t *ll_file_data_slab;
@@ -75,7 +76,8 @@ static struct ll_sb_info *ll_init_sbi(void)
                                            SBI_DEFAULT_READAHEAD_MAX);
         sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
                                            SBI_DEFAULT_READAHEAD_WHOLE_MAX;
-
+        sbi->ll_contention_time = SBI_DEFAULT_CONTENTION_SECONDS;
+        sbi->ll_lockless_truncate_enable = SBI_DEFAULT_LOCKLESS_TRUNCATE_ENABLE;
         INIT_LIST_HEAD(&sbi->ll_conn_chain);
         INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list);
 
@@ -104,6 +106,9 @@ static struct ll_sb_info *ll_init_sbi(void)
                 spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i].pp_w_hist.oh_lock);
         }
 
+        /* metadata statahead is enabled by default */
+        sbi->ll_sa_max = LL_SA_RPC_DEF;
+
         RETURN(sbi);
 }
 
@@ -192,7 +197,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
                 err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb,
                                                   dt, md);
                 if (err < 0)
-                        CERROR("could not register mount in /proc/lustre");
+                        CERROR("could not register mount in /proc/fs/lustre\n");
         }
 
         /* indicate the features supported by this client */
@@ -239,7 +244,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
                 data->ocd_connect_flags |= OBD_CONNECT_LCL_CLIENT;
         }
 
-        err = obd_connect(NULL, &md_conn, obd, &sbi->ll_sb_uuid, data);
+        err = obd_connect(NULL, &md_conn, obd, &sbi->ll_sb_uuid, data, NULL);
         if (err == -EBUSY) {
                 LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing "
                                    "recovery, of which this client is not a "
@@ -364,7 +369,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
 
         data->ocd_connect_flags = OBD_CONNECT_GRANT     | OBD_CONNECT_VERSION  |
                                   OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE |
-                                  OBD_CONNECT_CANCELSET | OBD_CONNECT_FID;
+                                  OBD_CONNECT_CANCELSET | OBD_CONNECT_FID      |
+                                  OBD_CONNECT_SRVLOCK   | OBD_CONNECT_TRUNCLOCK;
         if (sbi->ll_flags & LL_SBI_OSS_CAPA)
                 data->ocd_connect_flags |= OBD_CONNECT_OSS_CAPA;
 
@@ -393,7 +399,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
         obd->obd_upcall.onu_upcall = ll_ocd_update;
         data->ocd_brw_size = PTLRPC_MAX_BRW_PAGES << CFS_PAGE_SHIFT;
 
-        err = obd_connect(NULL, &dt_conn, obd, &sbi->ll_sb_uuid, data);
+        err = obd_connect(NULL, &dt_conn, obd, &sbi->ll_sb_uuid, data, NULL);
         if (err == -EBUSY) {
                 LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing "
                                    "recovery, of which this client is not a "
@@ -1102,6 +1108,13 @@ void ll_clear_inode(struct inode *inode)
         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
                inode->i_generation, inode);
 
+        if (S_ISDIR(inode->i_mode)) {
+                /* these should have been cleared in ll_file_release */
+                LASSERT(lli->lli_sai == NULL);
+                LASSERT(lli->lli_opendir_key == NULL);
+                LASSERT(lli->lli_opendir_pid == 0);
+        }
+
         ll_i2info(inode)->lli_flags &= ~LLIF_MDS_SIZE_LOCK;
         md_change_cbdata(sbi->ll_md_exp, ll_inode2fid(inode),
                          null_if_equal, inode);
@@ -1243,6 +1256,92 @@ static int ll_setattr_done_writing(struct inode *inode,
         RETURN(rc);
 }
 
+static int ll_setattr_do_truncate(struct inode *inode, loff_t new_size)
+{
+        struct ll_sb_info *sbi = ll_i2sbi(inode);
+        struct ll_inode_info *lli = ll_i2info(inode);
+        struct lov_stripe_md *lsm = lli->lli_smd;
+        int rc;
+        ldlm_policy_data_t policy = { .l_extent = {new_size,
+                                                   OBD_OBJECT_EOF } };
+        struct lustre_handle lockh = { 0 };
+        int local_lock = 0; /* 0 - no local lock;
+                             * 1 - lock taken by lock_extent;
+                             * 2 - by obd_match*/
+        int ast_flags;
+        int err;
+        ENTRY;
+
+        UNLOCK_INODE_MUTEX(inode);
+        UP_WRITE_I_ALLOC_SEM(inode);
+
+        if (sbi->ll_lockless_truncate_enable &&
+            (sbi->ll_lco.lco_flags & OBD_CONNECT_TRUNCLOCK)) {
+                ast_flags = LDLM_FL_BLOCK_GRANTED;
+                rc = obd_match(sbi->ll_dt_exp, lsm, LDLM_EXTENT,
+                               &policy, LCK_PW, &ast_flags, inode, &lockh);
+                if (rc > 0) {
+                        local_lock = 2;
+                        rc = 0;
+                } else if (rc == 0) {
+                        rc = ll_file_punch(inode, new_size, 1);
+                }
+        } else {
+                /* XXX when we fix the AST intents to pass the discard-range
+                 * XXX extent, make ast_flags always LDLM_AST_DISCARD_DATA
+                 * XXX here. */
+                ast_flags = (new_size == 0) ? LDLM_AST_DISCARD_DATA : 0;
+                rc = ll_extent_lock(NULL, inode, lsm, LCK_PW, &policy,
+                                    &lockh, ast_flags);
+                if (likely(rc == 0))
+                        local_lock = 1;
+        }
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        DOWN_WRITE_I_ALLOC_SEM(inode);
+        LOCK_INODE_MUTEX(inode);
+#else
+        LOCK_INODE_MUTEX(inode);
+        DOWN_WRITE_I_ALLOC_SEM(inode);
+#endif
+        if (likely(rc == 0)) {
+                /* Only ll_inode_size_lock is taken at this level.
+                 * lov_stripe_lock() is grabbed by ll_truncate() only over
+                 * call to obd_adjust_kms().  If vmtruncate returns 0, then
+                 * ll_truncate dropped ll_inode_size_lock() */
+                ll_inode_size_lock(inode, 0);
+                if (!local_lock) {
+                        spin_lock(&lli->lli_lock);
+                        lli->lli_flags |= LLIF_SRVLOCK;
+                        spin_unlock(&lli->lli_lock);
+                }
+                rc = vmtruncate(inode, new_size);
+                if (!local_lock) {
+                        spin_lock(&lli->lli_lock);
+                        lli->lli_flags &= ~LLIF_SRVLOCK;
+                        spin_unlock(&lli->lli_lock);
+                }
+                if (rc != 0) {
+                        LASSERT(atomic_read(&lli->lli_size_sem.count) <= 0);
+                        ll_inode_size_unlock(inode, 0);
+                }
+        }
+
+        if (local_lock) {
+                if (local_lock == 2)
+                        err = obd_cancel(sbi->ll_dt_exp, lsm, LCK_PW, &lockh);
+                else
+                        err = ll_extent_unlock(NULL, inode, lsm, LCK_PW, &lockh);
+                if (unlikely(err != 0)){
+                        CERROR("extent unlock failed: err=%d,"
+                               " unlock method =%d\n", err, local_lock);
+                        if (rc == 0)
+                                rc = err;
+                }
+        }
+        RETURN(rc);
+}
+
 /* If this inode has objects allocated to it (lsm != NULL), then the OST
  * object(s) determine the file size and mtime.  Otherwise, the MDS will
  * keep these values until such a time that objects are allocated for it.
@@ -1314,7 +1413,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
         if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME))
                 CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %lu\n",
                        LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime),
-                       CURRENT_SECONDS);
+                       cfs_time_current_sec());
 
         /* NB: ATTR_SIZE will only be set after this point if the size
          * resides on the MDS, ie, this file has no objects. */
@@ -1355,43 +1454,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
          * last one is especially bad for racing o_append users on other
          * nodes. */
         if (ia_valid & ATTR_SIZE) {
-                ldlm_policy_data_t policy = { .l_extent = {attr->ia_size,
-                                                           OBD_OBJECT_EOF } };
-                struct lustre_handle lockh = { 0 };
-                int err, ast_flags = 0;
-                /* XXX when we fix the AST intents to pass the discard-range
-                 * XXX extent, make ast_flags always LDLM_AST_DISCARD_DATA
-                 * XXX here. */
-                if (attr->ia_size == 0)
-                        ast_flags = LDLM_AST_DISCARD_DATA;
-
-                UNLOCK_INODE_MUTEX(inode);
-                UP_WRITE_I_ALLOC_SEM(inode);
-                rc = ll_extent_lock(NULL, inode, lsm, LCK_PW, &policy, &lockh,
-                                    ast_flags);
-                LOCK_INODE_MUTEX(inode);
-                DOWN_WRITE_I_ALLOC_SEM(inode);
-
-                if (rc != 0)
-                        GOTO(out, rc);
-
-                /* Only ll_inode_size_lock is taken at this level.
-                 * lov_stripe_lock() is grabbed by ll_truncate() only over
-                 * call to obd_adjust_kms().  If vmtruncate returns 0, then
-                 * ll_truncate dropped ll_inode_size_lock() */
-                ll_inode_size_lock(inode, 0);
-                rc = vmtruncate(inode, attr->ia_size);
-                if (rc != 0) {
-                        LASSERT(atomic_read(&lli->lli_size_sem.count) <= 0);
-                        ll_inode_size_unlock(inode, 0);
-                }
-
-                err = ll_extent_unlock(NULL, inode, lsm, LCK_PW, &lockh);
-                if (err) {
-                        CERROR("ll_extent_unlock failed: %d\n", err);
-                        if (!rc)
-                                rc = err;
-                }
+                rc = ll_setattr_do_truncate(inode, attr->ia_size);
         } else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) {
                 obd_flag flags;
                 struct obd_info oinfo = { { { 0 } } };
@@ -1644,6 +1707,8 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md)
                 spin_unlock(&lli->lli_lock);
         }
 #endif
+        inode->i_ino = ll_fid_build_ino(sbi, &body->fid1);
+
         if (body->valid & OBD_MD_FLATIME &&
             body->atime > LTIME_S(inode->i_atime))
                 LTIME_S(inode->i_atime) = body->atime;
@@ -1818,6 +1883,7 @@ void ll_delete_inode(struct inode *inode)
         if (rc) {
                 CERROR("fid_delete() failed, rc %d\n", rc);
         }
+        truncate_inode_pages(&inode->i_data, 0);
         clear_inode(inode);
 
         EXIT;
@@ -2071,6 +2137,7 @@ char *llap_origins[] = {
         [LLAP_ORIGIN_READAHEAD] = "ra",
         [LLAP_ORIGIN_COMMIT_WRITE] = "cw",
         [LLAP_ORIGIN_WRITEPAGE] = "wp",
+        [LLAP_ORIGIN_LOCKLESS_IO] = "ls"
 };
 
 struct ll_async_page *llite_pglist_next_llap(struct ll_sb_info *sbi,
@@ -2182,12 +2249,13 @@ struct md_op_data * ll_prep_md_op_data(struct md_op_data *op_data,
                 op_data->op_capa2 = ll_mdscapa_get(i2);
         } else {
                 fid_zero(&op_data->op_fid2);
+                op_data->op_capa2 = NULL;
         }
 
         op_data->op_name = name;
         op_data->op_namelen = namelen;
         op_data->op_mode = mode;
-        op_data->op_mod_time = CURRENT_SECONDS;
+        op_data->op_mod_time = cfs_time_current_sec();
         op_data->op_fsuid = current->fsuid;
         op_data->op_fsgid = current->fsgid;
         op_data->op_cap = current->cap_effective;