Whamcloud - gitweb
LU-3270 statahead: use dcache-like interface for sa entry
[fs/lustre-release.git] / lustre / llite / llite_internal.h
index ab1a43d..204c42e 100644 (file)
 #include <lustre_ver.h>
 #include <lustre_disk.h>  /* for s2sbi */
 #include <lustre_eacl.h>
+#include <lustre_linkea.h>
 
 /* for struct cl_lock_descr and struct cl_io */
 #include <cl_object.h>
 #include <lclient.h>
 #include <lustre_lmv.h>
 #include <lustre_mdc.h>
-#include <linux/lustre_intent.h>
+#include <lustre_intent.h>
 #include <linux/compat.h>
 
+#include "range_lock.h"
+
 #ifndef FMODE_EXEC
 #define FMODE_EXEC 0
 #endif
@@ -67,6 +70,9 @@
 #define LL_DIR_END_OFF          0x7fffffffffffffffULL
 #define LL_DIR_END_OFF_32BIT    0x7fffffffUL
 
+/* Bit count for 4 MiB: 1UL << 22 == 4UL * 1024 * 1024 */
+#define LL_MAX_BLKSIZE_BITS 22
+
 #define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0")
 #define LUSTRE_FPRIVATE(file) ((file)->private_data)
 
@@ -135,34 +141,33 @@ struct ll_inode_info {
        struct hlist_head               *lli_remote_perms;
        struct mutex                    lli_rmtperm_mutex;
 
-        /* identifying fields for both metadata and data stacks. */
-        struct lu_fid                   lli_fid;
-        /* Parent fid for accessing default stripe data on parent directory
-         * for allocating OST objects after a mknod() and later open-by-FID. */
-        struct lu_fid                   lli_pfid;
+       /* identifying fields for both metadata and data stacks. */
+       struct lu_fid                   lli_fid;
+       /* master inode fid for stripe directory */
+       struct lu_fid                   lli_pfid;
 
        struct list_head                lli_close_list;
        struct list_head                lli_oss_capas;
        /* open count currently used by capability only, indicate whether
         * capability needs renewal */
-       atomic_t                    lli_open_count;
-       struct obd_capa                *lli_mds_capa;
-       cfs_time_t                      lli_rmtperm_time;
-
-        /* handle is to be sent to MDS later on done_writing and setattr.
-         * Open handle data are needed for the recovery to reconstruct
-         * the inode state on the MDS. XXX: recovery is not ready yet. */
-        struct obd_client_handle       *lli_pending_och;
-
-        /* We need all three because every inode may be opened in different
-         * modes */
-        struct obd_client_handle       *lli_mds_read_och;
-        struct obd_client_handle       *lli_mds_write_och;
-        struct obd_client_handle       *lli_mds_exec_och;
-        __u64                           lli_open_fd_read_count;
-        __u64                           lli_open_fd_write_count;
-        __u64                           lli_open_fd_exec_count;
-        /* Protects access to och pointers and their usage counters */
+       atomic_t                        lli_open_count;
+       struct obd_capa                *lli_mds_capa;
+       cfs_time_t                      lli_rmtperm_time;
+
+       /* handle is to be sent to MDS later on done_writing and setattr.
+        * Open handle data are needed for the recovery to reconstruct
+        * the inode state on the MDS. XXX: recovery is not ready yet. */
+       struct obd_client_handle       *lli_pending_och;
+
+       /* We need all three because every inode may be opened in different
+        * modes */
+       struct obd_client_handle       *lli_mds_read_och;
+       struct obd_client_handle       *lli_mds_write_och;
+       struct obd_client_handle       *lli_mds_exec_och;
+       __u64                           lli_open_fd_read_count;
+       __u64                           lli_open_fd_write_count;
+       __u64                           lli_open_fd_exec_count;
+       /* Protects access to och pointers and their usage counters */
        struct mutex                    lli_och_mutex;
 
        struct inode                    lli_vfs_inode;
@@ -179,18 +184,24 @@ struct ll_inode_info {
                        /* serialize normal readdir and statahead-readdir. */
                        struct mutex                    d_readdir_mutex;
 
-                        /* metadata statahead */
-                        /* since parent-child threads can share the same @file
-                         * struct, "opendir_key" is the token when dir close for
-                         * case of parent exit before child -- it is me should
-                         * cleanup the dir readahead. */
-                        void                           *d_opendir_key;
-                        struct ll_statahead_info       *d_sai;
-                        /* protect statahead stuff. */
+                       /* metadata statahead */
+                       /* since parent-child threads can share the same @file
+                        * struct, "opendir_key" is the token when dir close for
+                        * case of parent exit before child -- it is me should
+                        * cleanup the dir readahead. */
+                       void                           *d_opendir_key;
+                       struct ll_statahead_info       *d_sai;
+                       /* protect statahead stuff. */
                        spinlock_t                      d_sa_lock;
                        /* "opendir_pid" is the token when lookup/revalid
                         * -- I am the owner of dir statahead. */
-                       pid_t                           d_opendir_pid;
+                       pid_t                           d_opendir_pid;
+                       /* stat will try to access statahead entries or start
+                        * statahead if this flag is set. The flag is set upon
+                        * dir open, and cleared when the dir is closed, the
+                        * statahead hit ratio is too low, or starting the
+                        * statahead thread failed. */
+                       unsigned int                    d_sa_enabled:1;
                        /* directory stripe information */
                        struct lmv_stripe_md            *d_lsm_md;
                        /* striped directory size */
@@ -203,6 +214,7 @@ struct ll_inode_info {
 #define lli_opendir_key         u.d.d_opendir_key
 #define lli_sai                 u.d.d_sai
 #define lli_sa_lock             u.d.d_sa_lock
+#define lli_sa_enabled         u.d.d_sa_enabled
 #define lli_opendir_pid         u.d.d_opendir_pid
 #define lli_lsm_md             u.d.d_lsm_md
 #define lli_stripe_dir_size    u.d.d_stripe_size
@@ -221,7 +233,7 @@ struct ll_inode_info {
                         * }
                         */
                        struct rw_semaphore             f_trunc_sem;
-                       struct mutex                    f_write_mutex;
+                       struct range_lock_tree          f_write_tree;
 
                        struct rw_semaphore             f_glimpse_sem;
                        cfs_time_t                      f_glimpse_time;
@@ -239,14 +251,14 @@ struct ll_inode_info {
                         * so the read/write statistics for jobid will not be
                         * accurate if the file is shared by different jobs.
                         */
-                       char                     f_jobid[JOBSTATS_JOBID_SIZE];
+                       char                     f_jobid[LUSTRE_JOBID_SIZE];
                } f;
 
 #define lli_size_mutex          u.f.f_size_mutex
 #define lli_symlink_name        u.f.f_symlink_name
 #define lli_maxbytes            u.f.f_maxbytes
 #define lli_trunc_sem           u.f.f_trunc_sem
-#define lli_write_mutex         u.f.f_write_mutex
+#define lli_write_tree          u.f.f_write_tree
 #define lli_glimpse_sem        u.f.f_glimpse_sem
 #define lli_glimpse_time       u.f.f_glimpse_time
 #define lli_agl_list           u.f.f_agl_list
@@ -483,6 +495,20 @@ struct eacl_table {
        struct list_head        et_entries[EE_HASHES];
 };
 
+
+/* This is embedded into llite super-blocks to keep track of the connect
+ * flags (capabilities) supported by all imports that a given mount is
+ * connected to. */
+struct lustre_client_ocd {
+       /* This is the conjunction of connect_flags across all imports
+        * (LOVs) this mount is connected to. This field is updated by
+        * cl_ocd_update() under ->lco_lock. */
+       __u64                    lco_flags;
+       struct mutex             lco_lock;      /* held to update lco_flags */
+       struct obd_export       *lco_md_exp;
+       struct obd_export       *lco_dt_exp;
+};
+
 struct ll_sb_info {
        struct list_head                  ll_list;
        /* this protects pglist and ra_info.  It isn't safe to
@@ -540,20 +566,22 @@ struct ll_sb_info {
         int                       ll_rw_stats_on;
 
        /* metadata stat-ahead */
-       unsigned int              ll_sa_max;     /* max statahead RPCs */
-       atomic_t                  ll_sa_total;   /* statahead thread started
+       unsigned int              ll_sa_max;     /* max statahead RPCs */
+       atomic_t                  ll_sa_total;   /* statahead thread started
                                                  * count */
-       atomic_t                  ll_sa_wrong;   /* statahead thread stopped for
+       atomic_t                  ll_sa_wrong;   /* statahead thread stopped for
                                                  * low hit ratio */
-       atomic_t                  ll_agl_total;  /* AGL thread started count */
+       atomic_t                  ll_sa_running; /* running statahead thread
+                                                 * count */
+       atomic_t                  ll_agl_total;  /* AGL thread started count */
 
-       dev_t                     ll_sdev_orig; /* save s_dev before assign for
+       dev_t                     ll_sdev_orig; /* save s_dev before assign for
                                                 * clustred nfs */
-       struct rmtacl_ctl_table   ll_rct;
-       struct eacl_table         ll_et;
+       struct rmtacl_ctl_table   ll_rct;
+       struct eacl_table         ll_et;
 
        /* root squash */
-       struct root_squash_info   ll_squash;
+       struct root_squash_info   ll_squash;
 };
 
 #define LL_DEFAULT_MAX_RW_CHUNK      (32 * 1024 * 1024)
@@ -715,8 +743,57 @@ static inline void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) {}
 static void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) {}
 #endif
 
+enum {
+       LPROC_LL_DIRTY_HITS,
+       LPROC_LL_DIRTY_MISSES,
+       LPROC_LL_READ_BYTES,
+       LPROC_LL_WRITE_BYTES,
+       LPROC_LL_BRW_READ,
+       LPROC_LL_BRW_WRITE,
+       LPROC_LL_OSC_READ,
+       LPROC_LL_OSC_WRITE,
+       LPROC_LL_IOCTL,
+       LPROC_LL_OPEN,
+       LPROC_LL_RELEASE,
+       LPROC_LL_MAP,
+       LPROC_LL_LLSEEK,
+       LPROC_LL_FSYNC,
+       LPROC_LL_READDIR,
+       LPROC_LL_SETATTR,
+       LPROC_LL_TRUNC,
+       LPROC_LL_FLOCK,
+       LPROC_LL_GETATTR,
+       LPROC_LL_CREATE,
+       LPROC_LL_LINK,
+       LPROC_LL_UNLINK,
+       LPROC_LL_SYMLINK,
+       LPROC_LL_MKDIR,
+       LPROC_LL_RMDIR,
+       LPROC_LL_MKNOD,
+       LPROC_LL_RENAME,
+       LPROC_LL_STAFS,         /* NOTE(review): STATFS misspelt? confirm */
+       LPROC_LL_ALLOC_INODE,
+       LPROC_LL_SETXATTR,
+       LPROC_LL_GETXATTR,
+       LPROC_LL_GETXATTR_HITS,
+       LPROC_LL_LISTXATTR,
+       LPROC_LL_REMOVEXATTR,
+       LPROC_LL_INODE_PERM,
+       LPROC_LL_FILE_OPCODES   /* last entry: equals the number of ids above */
+};
 
 /* llite/dir.c */
+struct ll_dir_chain {
+};     /* no members; a memberless struct is a GNU C extension */
+
+static inline void ll_dir_chain_init(struct ll_dir_chain *chain)
+{
+}      /* empty body: @chain is not touched */
+
+static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
+{
+}      /* empty body: @chain is not touched */
+
 extern const struct file_operations ll_dir_operations;
 extern const struct inode_operations ll_dir_inode_operations;
 #ifdef HAVE_DIR_CONTEXT
@@ -829,7 +906,7 @@ int ll_fsync(struct file *file, int data);
 int ll_fsync(struct file *file, struct dentry *dentry, int data);
 #endif
 int ll_merge_lvb(const struct lu_env *env, struct inode *inode);
-int ll_fid2path(struct inode *inode, void *arg);
+int ll_fid2path(struct inode *inode, void __user *arg);
 int ll_data_version(struct inode *inode, __u64 *data_version, int flags);
 int ll_hsm_release(struct inode *inode);
 
@@ -875,7 +952,7 @@ void ll_dirty_page_discard_warn(struct page *page, int ioret);
 int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
                  struct super_block *, struct lookup_intent *);
 void lustre_dump_dentry(struct dentry *, int recur);
-int ll_obd_statfs(struct inode *inode, void *arg);
+int ll_obd_statfs(struct inode *inode, void __user *arg);
 int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize);
 int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize);
 int ll_get_max_cookiesize(struct ll_sb_info *sbi, int *max_cookiesize);
@@ -889,6 +966,27 @@ void ll_finish_md_op_data(struct md_op_data *op_data);
 int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg);
 char *ll_get_fsname(struct super_block *sb, char *buf, int buflen);
 void ll_compute_rootsquash_state(struct ll_sb_info *sbi);
+ssize_t ll_copy_user_md(const struct lov_user_md __user *md,
+                       struct lov_user_md **kbuf);
+
+/* Expected size of a lov_user_md passed in from user space, by lmm_magic;
+ * -EINVAL for an unhandled magic or a stripe count above the maximum. */
+static inline ssize_t ll_lov_user_md_size(const struct lov_user_md *lum)
+{
+       if (lum->lmm_magic == LOV_USER_MAGIC_V1)
+               return sizeof(struct lov_user_md_v1);
+
+       if (lum->lmm_magic == LOV_USER_MAGIC_V3)
+               return sizeof(struct lov_user_md_v3);
+
+       /* any magic other than the three handled here is rejected */
+       if (lum->lmm_magic != LOV_USER_MAGIC_SPECIFIC ||
+           lum->lmm_stripe_count > LOV_MAX_STRIPE_COUNT)
+               return -EINVAL;
+
+       return lov_user_md_size(lum->lmm_stripe_count,
+                               LOV_USER_MAGIC_SPECIFIC);
+}
 
 /* llite/llite_nfs.c */
 extern struct export_operations lustre_export_operations;
@@ -953,6 +1051,10 @@ struct vvp_io {
                                  * fault API used bitflags for return code.
                                  */
                                 unsigned int    ft_flags;
+                               /**
+                                * check that flags are from filemap_fault
+                                */
+                               bool            ft_flags_valid;
                         } fault;
                 } fault;
         } u;
@@ -1239,10 +1341,11 @@ struct ll_statahead_info {
        wait_queue_head_t       sai_waitq;      /* stat-ahead wait queue */
        struct ptlrpc_thread    sai_thread;     /* stat-ahead thread */
        struct ptlrpc_thread    sai_agl_thread; /* AGL thread */
-       struct list_head        sai_entries;    /* entry list */
-       struct list_head        sai_entries_received;   /* entries returned */
-       struct list_head        sai_entries_stated;     /* entries stated */
-       struct list_head        sai_entries_agl;  /* AGL entries to be sent */
+       struct list_head        sai_interim_entries; /* entries which got async
+                                                     * stat reply, but not
+                                                     * instantiated */
+       struct list_head        sai_entries;    /* completed entries */
+       struct list_head        sai_agls;       /* AGLs to be sent */
        struct list_head        sai_cache[LL_SA_CACHE_SIZE];
        spinlock_t              sai_cache_lock[LL_SA_CACHE_SIZE];
        atomic_t                sai_cache_count; /* entry count in cache */
@@ -1250,7 +1353,8 @@ struct ll_statahead_info {
 
 int do_statahead_enter(struct inode *dir, struct dentry **dentry,
                        int only_unplug);
-void ll_stop_statahead(struct inode *dir, void *key);
+void ll_authorize_statahead(struct inode *dir, void *key);
+void ll_deauthorize_statahead(struct inode *dir, void *key);
 
 static inline int ll_glimpse_size(struct inode *inode)
 {
@@ -1280,25 +1384,29 @@ ll_statahead_mark(struct inode *dir, struct dentry *dentry)
                ldd->lld_sa_generation = sai->sai_generation;
 }
 
-static inline int
-d_need_statahead(struct inode *dir, struct dentry *dentryp)
+static inline bool
+dentry_need_statahead(struct inode *dir, struct dentry *dentry)
 {
        struct ll_inode_info  *lli;
        struct ll_dentry_data *ldd;
 
        if (ll_i2sbi(dir)->ll_sa_max == 0)
-               return -EAGAIN;
+               return false;
 
        lli = ll_i2info(dir);
+
+       /* statahead is not allowed for this dir; there are three possible causes:
+        * 1. the dir is not opened.
+        * 2. the statahead hit ratio is too low.
+        * 3. a previous stat failed to start the statahead thread. */
+       if (!lli->lli_sa_enabled)
+               return false;
+
        /* not the same process, don't statahead */
        if (lli->lli_opendir_pid != current_pid())
-               return -EAGAIN;
+               return false;
 
-       /* statahead has been stopped */
-       if (lli->lli_opendir_key == NULL)
-               return -EAGAIN;
-
-       ldd = ll_d2d(dentryp);
+       ldd = ll_d2d(dentry);
        /*
         * When stats a dentry, the system trigger more than once "revalidate"
         * or "lookup", for "getattr", for "getxattr", and maybe for others.
@@ -1316,25 +1424,21 @@ d_need_statahead(struct inode *dir, struct dentry *dentryp)
         */
        if (ldd && lli->lli_sai &&
            ldd->lld_sa_generation == lli->lli_sai->sai_generation)
-               return -EAGAIN;
+               return false;
 
-       return 1;
+       return true;
 }
 
 static inline int
 ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int only_unplug)
 {
-       int ret;
-
-       ret = d_need_statahead(dir, *dentryp);
-       if (ret <= 0)
-               return ret;
+       if (!dentry_need_statahead(dir, *dentryp))
+               return -EAGAIN; /* precheck failed: no statahead */
 
        return do_statahead_enter(dir, dentryp, only_unplug);
 }
 
 /* llite ioctl register support rountine */
-#ifdef __KERNEL__
 enum llioc_iter {
         LLIOC_CONT = 0,
         LLIOC_STOP
@@ -1375,7 +1479,6 @@ typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode,
 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd);
 void ll_iocontrol_unregister(void *magic);
 
-#endif
 
 /* lclient compat stuff */
 #define cl_inode_info ll_inode_info
@@ -1548,12 +1651,12 @@ static inline void d_lustre_invalidate(struct dentry *dentry, int nested)
 {
        CDEBUG(D_DENTRY, "invalidate dentry %.*s (%p) parent %p inode %p "
               "refc %d\n", dentry->d_name.len, dentry->d_name.name, dentry,
-              dentry->d_parent, dentry->d_inode, d_count(dentry));
+              dentry->d_parent, dentry->d_inode, ll_d_count(dentry));
 
        spin_lock_nested(&dentry->d_lock,
                         nested ? DENTRY_D_LOCK_NESTED : DENTRY_D_LOCK_NORMAL);
        __d_lustre_invalidate(dentry);
-       if (d_count(dentry) == 0)
+       if (ll_d_count(dentry) == 0)
                __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
 }
@@ -1566,23 +1669,21 @@ static inline void d_lustre_revalidate(struct dentry *dentry)
        spin_unlock(&dentry->d_lock);
 }
 
-#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 53, 0)
 /* Compatibility for old (1.8) compiled userspace quota code */
 struct if_quotactl_18 {
-        __u32                   qc_cmd;
-        __u32                   qc_type;
-        __u32                   qc_id;
-        __u32                   qc_stat;
-        struct obd_dqinfo       qc_dqinfo;
-        struct obd_dqblk        qc_dqblk;
-        char                    obd_type[16];
-        struct obd_uuid         obd_uuid;
+       __u32                   qc_cmd;
+       __u32                   qc_type;
+       __u32                   qc_id;
+       __u32                   qc_stat;
+       struct obd_dqinfo       qc_dqinfo;
+       struct obd_dqblk        qc_dqblk;
+       char                    obd_type[16];
+       struct obd_uuid         obd_uuid;
 };
 #define LL_IOC_QUOTACTL_18              _IOWR('f', 162, struct if_quotactl_18 *)
 /* End compatibility for old (1.8) compiled userspace quota code */
-#else
-#warning "remove old LL_IOC_QUOTACTL_18 compatibility code"
-#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) */
+#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 53, 0) */
 
 enum {
        LL_LAYOUT_GEN_NONE  = ((__u32)-2),      /* layout lock was cancelled */
@@ -1599,4 +1700,6 @@ void ll_xattr_fini(void);
 int ll_page_sync_io(const struct lu_env *env, struct cl_io *io,
                    struct cl_page *page, enum cl_req_type crt);
 
+int ll_getparent(struct file *file, struct getparent __user *arg);
+
 #endif /* LLITE_INTERNAL_H */