Whamcloud - gitweb
LU-9091 obdclass: allow bare KMGTPE param suffix
[fs/lustre-release.git] / lustre / obdclass / class_obd.c
index 3470ba6..8543769 100644 (file)
  *
  * You should have received a copy of the GNU General Public License
  * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
+ * http://www.gnu.org/licenses/gpl-2.0.html
  *
  * GPL HEADER END
  */
@@ -27,7 +23,7 @@
  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2011, 2012, Intel Corporation.
+ * Copyright (c) 2011, 2017, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
  */
 
 #define DEBUG_SUBSYSTEM S_CLASS
-#ifndef __KERNEL__
-# include <liblustre.h>
-#else
-# include <asm/atomic.h>
-#endif
+
+#include <linux/miscdevice.h>
+#include <linux/user_namespace.h>
+#include <linux/uidgid.h>
+#include <linux/atomic.h>
+#include <linux/list.h>
 
 #include <obd_support.h>
 #include <obd_class.h>
-#include <lnet/lnetctl.h>
+#include <uapi/linux/lnet/lnetctl.h>
 #include <lustre_debug.h>
+#include <lustre_kernelcomm.h>
 #include <lprocfs_status.h>
-#include <lustre/lustre_build_version.h>
-#include <libcfs/list.h>
+#include <cl_object.h>
+#ifdef HAVE_SERVER_SUPPORT
+# include <dt_object.h>
+# include <md_object.h>
+#endif /* HAVE_SERVER_SUPPORT */
+#include <uapi/linux/lustre/lustre_ioctl.h>
 #include "llog_internal.h"
 
-#ifndef __KERNEL__
-/* liblustre workaround */
-cfs_atomic_t libcfs_kmemory = {0};
+#ifdef CONFIG_PROC_FS
+static __u64 obd_max_alloc;
+#else
+__u64 obd_max_alloc;
 #endif
 
-struct obd_device *obd_devs[MAX_OBD_DEVICES];
-EXPORT_SYMBOL(obd_devs);
-cfs_list_t obd_types;
-DEFINE_RWLOCK(obd_dev_lock);
-
-__u64 obd_max_pages = 0;
-__u64 obd_max_alloc = 0;
-#ifndef __KERNEL__
-__u64 obd_alloc;
-__u64 obd_pages;
-#endif
-DEFINE_SPINLOCK(obd_updatemax_lock);
+static DEFINE_SPINLOCK(obd_updatemax_lock);
 
 /* The following are visible and mutable through /proc/sys/lustre/. */
-unsigned int obd_alloc_fail_rate = 0;
-EXPORT_SYMBOL(obd_alloc_fail_rate);
 unsigned int obd_debug_peer_on_timeout;
 EXPORT_SYMBOL(obd_debug_peer_on_timeout);
 unsigned int obd_dump_on_timeout;
 EXPORT_SYMBOL(obd_dump_on_timeout);
 unsigned int obd_dump_on_eviction;
 EXPORT_SYMBOL(obd_dump_on_eviction);
-unsigned int obd_max_dirty_pages = 256;
+unsigned int obd_lbug_on_eviction;
+EXPORT_SYMBOL(obd_lbug_on_eviction);
+unsigned long obd_max_dirty_pages;
 EXPORT_SYMBOL(obd_max_dirty_pages);
-cfs_atomic_t obd_dirty_pages;
+atomic_long_t obd_dirty_pages;
 EXPORT_SYMBOL(obd_dirty_pages);
 unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT;   /* seconds */
 EXPORT_SYMBOL(obd_timeout);
@@ -89,6 +81,9 @@ unsigned int obd_timeout_set;
 EXPORT_SYMBOL(obd_timeout_set);
 unsigned int ldlm_timeout_set;
 EXPORT_SYMBOL(ldlm_timeout_set);
+/* bulk transfer timeout, give up after 100s by default */
+unsigned int bulk_timeout = 100; /* seconds */
+EXPORT_SYMBOL(bulk_timeout);
 /* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */
 unsigned int at_min = 0;
 EXPORT_SYMBOL(at_min);
@@ -101,105 +96,12 @@ EXPORT_SYMBOL(at_early_margin);
 int at_extra = 30;
 EXPORT_SYMBOL(at_extra);
 
-cfs_atomic_t obd_dirty_transit_pages;
-EXPORT_SYMBOL(obd_dirty_transit_pages);
-
-char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
-EXPORT_SYMBOL(obd_jobid_var);
-
-/* Get jobid of current process by reading the environment variable
- * stored in between the "env_start" & "env_end" of task struct.
- *
- * TODO:
- * It's better to cache the jobid for later use if there is any
- * efficient way, the cl_env code probably could be reused for this
- * purpose.
- *
- * If some job scheduler doesn't store jobid in the "env_start/end",
- * then an upcall could be issued here to get the jobid by utilizing
- * the userspace tools/api. Then, the jobid must be cached.
- */
-int lustre_get_jobid(char *jobid)
-{
-       int jobid_len = JOBSTATS_JOBID_SIZE;
-       int rc = 0;
-       ENTRY;
-
-       memset(jobid, 0, JOBSTATS_JOBID_SIZE);
-       /* Jobstats isn't enabled */
-       if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0)
-               RETURN(0);
-
-       /* Use process name + fsuid as jobid */
-       if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) {
-               snprintf(jobid, JOBSTATS_JOBID_SIZE, "%s.%u",
-                        cfs_curproc_comm(), cfs_curproc_fsuid());
-               RETURN(0);
-       }
-
-       rc = cfs_get_environ(obd_jobid_var, jobid, &jobid_len);
-       if (rc) {
-               if (rc == -EOVERFLOW) {
-                       /* For the PBS_JOBID and LOADL_STEP_ID keys (which are
-                        * variable length strings instead of just numbers), it
-                        * might make sense to keep the unique parts for JobID,
-                        * instead of just returning an error.  That means a
-                        * larger temp buffer for cfs_get_environ(), then
-                        * truncating the string at some separator to fit into
-                        * the specified jobid_len.  Fix later if needed. */
-                       static bool printed;
-                       if (unlikely(!printed)) {
-                               LCONSOLE_ERROR_MSG(0x16b, "%s value too large "
-                                                  "for JobID buffer (%d)\n",
-                                                  obd_jobid_var, jobid_len);
-                               printed = true;
-                       }
-               } else {
-                       CDEBUG((rc == -ENOENT || rc == -EINVAL ||
-                               rc == -EDEADLK) ? D_INFO : D_ERROR,
-                              "Get jobid for (%s) failed: rc = %d\n",
-                              obd_jobid_var, rc);
-               }
-       }
-       RETURN(rc);
-}
-EXPORT_SYMBOL(lustre_get_jobid);
-
-int obd_alloc_fail(const void *ptr, const char *name, const char *type,
-                  size_t size, const char *file, int line)
-{
-       if (ptr == NULL ||
-           (cfs_rand() & OBD_ALLOC_FAIL_MASK) < obd_alloc_fail_rate) {
-               CERROR("%s%salloc of %s ("LPU64" bytes) failed at %s:%d\n",
-                      ptr ? "force " :"", type, name, (__u64)size, file,
-                      line);
-               CERROR(LPU64" total bytes and "LPU64" total pages "
-                      "("LPU64" bytes) allocated by Lustre, "
-                      "%d total bytes by LNET\n",
-                      obd_memory_sum(),
-                      obd_pages_sum() << CFS_PAGE_SHIFT,
-                      obd_pages_sum(),
-                       cfs_atomic_read(&libcfs_kmemory));
-               return 1;
-       }
-       return 0;
-}
-EXPORT_SYMBOL(obd_alloc_fail);
-
-static inline void obd_data2conn(struct lustre_handle *conn,
-                                 struct obd_ioctl_data *data)
-{
-        memset(conn, 0, sizeof *conn);
-        conn->cookie = data->ioc_cookie;
-}
-
-static inline void obd_conn2data(struct obd_ioctl_data *data,
-                                 struct lustre_handle *conn)
-{
-        data->ioc_cookie = conn->cookie;
-}
+#ifdef CONFIG_PROC_FS
+struct lprocfs_stats *obd_memory = NULL;
+EXPORT_SYMBOL(obd_memory);
+#endif
 
-int class_resolve_dev_name(__u32 len, const char *name)
+static int class_resolve_dev_name(__u32 len, const char *name)
 {
         int rc;
         int dev;
@@ -228,29 +130,173 @@ out:
         RETURN(rc);
 }
 
-int class_handle_ioctl(unsigned int cmd, unsigned long arg)
+#define OBD_MAX_IOCTL_BUFFER   8192
+
+/*
+ * Sanity-check the length and pointer fields of a copied-in ioctl request.
+ *
+ * A request is rejected when ioc_len or any ioc_inllenN exceeds 1 << 30,
+ * when an ioc_inlbufN or ioc_pbufN pointer is set but its length is zero,
+ * when ioc_plenN is non-zero but ioc_pbufN is NULL, or when
+ * obd_ioctl_packlen() reports more bytes than ioc_len.  Every rejection is
+ * logged with CERROR().
+ *
+ * Returns 1 if the request is malformed, 0 if all checks pass.
+ */
+static int obd_ioctl_is_invalid(struct obd_ioctl_data *data)
 {
-        char *buf = NULL;
-        struct obd_ioctl_data *data;
-        struct libcfs_debug_ioctl_data *debug_data;
-        struct obd_device *obd = NULL;
-        int err = 0, len = 0;
-        ENTRY;
+       /* no single length field may exceed 1 << 30 */
+       if (data->ioc_len > BIT(30)) {
+               CERROR("OBD ioctl: ioc_len larger than 1<<30\n");
+               return 1;
+       }
 
-        /* only for debugging */
-        if (cmd == LIBCFS_IOC_DEBUG_MASK) {
-                debug_data = (struct libcfs_debug_ioctl_data*)arg;
-                libcfs_subsystem_debug = debug_data->subs;
-                libcfs_debug = debug_data->debug;
-                return 0;
-        }
+       if (data->ioc_inllen1 > BIT(30)) {
+               CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n");
+               return 1;
+       }
 
-        CDEBUG(D_IOCTL, "cmd = %x\n", cmd);
-        if (obd_ioctl_getdata(&buf, &len, (void *)arg)) {
-                CERROR("OBD ioctl: data error\n");
-                RETURN(-EINVAL);
-        }
-        data = (struct obd_ioctl_data *)buf;
+       if (data->ioc_inllen2 > BIT(30)) {
+               CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n");
+               return 1;
+       }
+
+       if (data->ioc_inllen3 > BIT(30)) {
+               CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n");
+               return 1;
+       }
+
+       if (data->ioc_inllen4 > BIT(30)) {
+               CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n");
+               return 1;
+       }
+
+       /* a buffer pointer that comes with a zero length is malformed */
+       if (data->ioc_inlbuf1 && data->ioc_inllen1 == 0) {
+               CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n");
+               return 1;
+       }
+
+       if (data->ioc_inlbuf2 && data->ioc_inllen2 == 0) {
+               CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n");
+               return 1;
+       }
+
+       if (data->ioc_inlbuf3 && data->ioc_inllen3 == 0) {
+               CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n");
+               return 1;
+       }
+
+       if (data->ioc_inlbuf4 && data->ioc_inllen4 == 0) {
+               CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n");
+               return 1;
+       }
+
+       if (data->ioc_pbuf1 && data->ioc_plen1 == 0) {
+               CERROR("OBD ioctl: pbuf1 pointer but 0 length\n");
+               return 1;
+       }
+
+       if (data->ioc_pbuf2 && data->ioc_plen2 == 0) {
+               CERROR("OBD ioctl: pbuf2 pointer but 0 length\n");
+               return 1;
+       }
+
+       /* the reverse is checked for the pbufs only: a length needs a pointer */
+       if (!data->ioc_pbuf1 && data->ioc_plen1 != 0) {
+               CERROR("OBD ioctl: plen1 set but NULL pointer\n");
+               return 1;
+       }
+
+       if (!data->ioc_pbuf2 && data->ioc_plen2 != 0) {
+               CERROR("OBD ioctl: plen2 set but NULL pointer\n");
+               return 1;
+       }
+
+       /* the size computed from the individual fields must fit in ioc_len */
+       if (obd_ioctl_packlen(data) > data->ioc_len) {
+               CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n",
+                      obd_ioctl_packlen(data), data->ioc_len);
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Copy an OBD ioctl request from userspace into a newly allocated buffer.
+ *
+ * The fixed-size header is read first to check the version and learn the
+ * total length; the whole request is then copied in and validated with
+ * obd_ioctl_is_invalid().  On success the ioc_inlbufN pointers of every
+ * non-empty inline buffer are pointed at its packed data inside ioc_bulk.
+ *
+ * buf: set to the allocated buffer; on success the caller owns it and
+ *      releases it with OBD_FREE_LARGE(*buf, *len).  On failure the buffer
+ *      has already been released here and *buf must not be used.
+ * len: set to the size in bytes of the allocated buffer.
+ * arg: userspace address of the request; it MUST be at least the size of
+ *      struct obd_ioctl_hdr.
+ *
+ * Returns 0 on success, -EFAULT if the request cannot be read, -EINVAL if
+ * its version, length or layout is not acceptable, and -ENOMEM if the
+ * buffer cannot be allocated.
+ */
+int obd_ioctl_getdata(char **buf, int *len, void __user *arg)
+{
+       struct obd_ioctl_hdr hdr;
+       struct obd_ioctl_data *data;
+       int offset = 0;
+
+       ENTRY;
+       if (copy_from_user(&hdr, arg, sizeof(hdr)))
+               RETURN(-EFAULT);
+
+       if (hdr.ioc_version != OBD_IOCTL_VERSION) {
+               CERROR("Version mismatch kernel (%x) vs application (%x)\n",
+                      OBD_IOCTL_VERSION, hdr.ioc_version);
+               RETURN(-EINVAL);
+       }
+
+       if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) {
+               CERROR("User buffer len %d exceeds %d max buffer\n",
+                      hdr.ioc_len, OBD_MAX_IOCTL_BUFFER);
+               RETURN(-EINVAL);
+       }
+
+       if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) {
+               CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len);
+               RETURN(-EINVAL);
+       }
+
+       /* When there are lots of processes calling vmalloc on multi-core
+        * system, the high lock contention will hurt performance badly,
+        * obdfilter-survey is an example, which relies on ioctl. So we'd
+        * better avoid vmalloc on ioctl path. LU-66
+        */
+       OBD_ALLOC_LARGE(*buf, hdr.ioc_len);
+       if (!*buf) {
+               CERROR("Cannot allocate control buffer of len %d\n",
+                      hdr.ioc_len);
+               /* out of memory is not a malformed request */
+               RETURN(-ENOMEM);
+       }
+       *len = hdr.ioc_len;
+       data = (struct obd_ioctl_data *)*buf;
+
+       if (copy_from_user(*buf, arg, hdr.ioc_len)) {
+               OBD_FREE_LARGE(*buf, hdr.ioc_len);
+               RETURN(-EFAULT);
+       }
+
+       /* The header was fetched separately above and userspace may have
+        * rewritten it since.  The buffer was sized from the first read while
+        * the checks below trust the second, so a changed length would let
+        * the inline buffers claim space beyond the allocation.
+        */
+       if (data->ioc_len != hdr.ioc_len) {
+               CERROR("OBD ioctl: ioc_len changed during copy (%d != %d)\n",
+                      data->ioc_len, hdr.ioc_len);
+               OBD_FREE_LARGE(*buf, hdr.ioc_len);
+               RETURN(-EINVAL);
+       }
+
+       if (obd_ioctl_is_invalid(data)) {
+               CERROR("ioctl not correctly formatted\n");
+               OBD_FREE_LARGE(*buf, hdr.ioc_len);
+               RETURN(-EINVAL);
+       }
+
+       /* The inline buffers are packed back to back in ioc_bulk, each one
+        * starting at the cfs_size_round()ed end of the previous one.
+        */
+       if (data->ioc_inllen1) {
+               data->ioc_inlbuf1 = &data->ioc_bulk[0];
+               offset += cfs_size_round(data->ioc_inllen1);
+       }
+
+       if (data->ioc_inllen2) {
+               data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset;
+               offset += cfs_size_round(data->ioc_inllen2);
+       }
+
+       if (data->ioc_inllen3) {
+               data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset;
+               offset += cfs_size_round(data->ioc_inllen3);
+       }
+
+       if (data->ioc_inllen4)
+               data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset;
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(obd_ioctl_getdata);
+
+int class_handle_ioctl(unsigned int cmd, unsigned long arg)
+{
+       char *buf = NULL;
+       struct obd_ioctl_data *data;
+       struct obd_device *obd = NULL;
+       int err = 0, len = 0;
+
+       ENTRY;
+       CDEBUG(D_IOCTL, "cmd = %x\n", cmd);
+       if (obd_ioctl_getdata(&buf, &len, (void __user *)arg)) {
+               CERROR("OBD ioctl: data error\n");
+               RETURN(-EINVAL);
+       }
+       data = (struct obd_ioctl_data *)buf;
 
         switch (cmd) {
         case OBD_IOC_PROCESS_CFG: {
@@ -263,7 +309,7 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg)
                 OBD_ALLOC(lcfg, data->ioc_plen1);
                 if (lcfg == NULL)
                         GOTO(out, err = -ENOMEM);
-                err = cfs_copy_from_user(lcfg, data->ioc_pbuf1,
+               err = copy_from_user(lcfg, data->ioc_pbuf1,
                                          data->ioc_plen1);
                 if (!err)
                         err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1);
@@ -274,25 +320,34 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg)
                 GOTO(out, err);
         }
 
-        case OBD_GET_VERSION:
-                if (!data->ioc_inlbuf1) {
-                        CERROR("No buffer passed in ioctl\n");
-                        GOTO(out, err = -EINVAL);
-                }
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0)
+       case OBD_GET_VERSION: {
+               static bool warned;
 
-                if (strlen(BUILD_VERSION) + 1 > data->ioc_inllen1) {
-                        CERROR("ioctl buffer too small to hold version\n");
-                        GOTO(out, err = -EINVAL);
-                }
+               if (!data->ioc_inlbuf1) {
+                       CERROR("No buffer passed in ioctl\n");
+                       GOTO(out, err = -EINVAL);
+               }
 
-                memcpy(data->ioc_bulk, BUILD_VERSION,
-                       strlen(BUILD_VERSION) + 1);
+               if (strlen(LUSTRE_VERSION_STRING) + 1 > data->ioc_inllen1) {
+                       CERROR("ioctl buffer too small to hold version\n");
+                       GOTO(out, err = -EINVAL);
+               }
 
-                err = obd_ioctl_popdata((void *)arg, data, len);
-                if (err)
-                        err = -EFAULT;
-                GOTO(out, err);
+               if (!warned) {
+                       warned = true;
+                       CWARN("%s: ioctl(OBD_GET_VERSION) is deprecated, "
+                             "use llapi_get_version_string() and/or relink\n",
+                             current->comm);
+               }
+               memcpy(data->ioc_bulk, LUSTRE_VERSION_STRING,
+                      strlen(LUSTRE_VERSION_STRING) + 1);
 
+               if (copy_to_user((void __user *)arg, data, len))
+                       err = -EFAULT;
+               GOTO(out, err);
+       }
+#endif
         case OBD_IOC_NAME2DEV: {
                 /* Resolve a device name.  This does not change the
                  * currently selected device.
@@ -305,8 +360,7 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg)
                 if (dev < 0)
                         GOTO(out, err = -EINVAL);
 
-                err = obd_ioctl_popdata((void *)arg, data, sizeof(*data));
-                if (err)
+               if (copy_to_user((void __user *)arg, data, sizeof(*data)))
                         err = -EFAULT;
                 GOTO(out, err);
         }
@@ -339,18 +393,11 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg)
 
                 CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1,
                        dev);
-                err = obd_ioctl_popdata((void *)arg, data, sizeof(*data));
-                if (err)
+               if (copy_to_user((void __user *)arg, data, sizeof(*data)))
                         err = -EFAULT;
                 GOTO(out, err);
         }
 
-        case OBD_IOC_CLOSE_UUID: {
-                CDEBUG(D_IOCTL, "closing all connections to uuid %s (NOOP)\n",
-                       data->ioc_inlbuf1);
-                GOTO(out, err = 0);
-        }
-
         case OBD_IOC_GETDEVICE: {
                 int     index = data->ioc_count;
                 char    *status, *str;
@@ -368,22 +415,27 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg)
                 if (!obd)
                         GOTO(out, err = -ENOENT);
 
-                if (obd->obd_stopping)
-                        status = "ST";
-                else if (obd->obd_set_up)
-                        status = "UP";
-                else if (obd->obd_attached)
-                        status = "AT";
-                else
-                        status = "--";
+               if (obd->obd_stopping)
+                       status = "ST";
+               else if (obd->obd_inactive)
+                       status = "IN";
+               else if (obd->obd_set_up)
+                       status = "UP";
+               else if (obd->obd_attached)
+                       status = "AT";
+               else
+                       status = "--";
+
                 str = (char *)data->ioc_bulk;
                 snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d",
                          (int)index, status, obd->obd_type->typ_name,
                          obd->obd_name, obd->obd_uuid.uuid,
-                         cfs_atomic_read(&obd->obd_refcount));
-                err = obd_ioctl_popdata((void *)arg, data, len);
+                        atomic_read(&obd->obd_refcount));
 
-                GOTO(out, err = 0);
+               if (copy_to_user((void __user *)arg, data, len))
+                       err = -EFAULT;
+
+               GOTO(out, err);
         }
 
         }
@@ -429,286 +481,490 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg)
                 if (err)
                         GOTO(out, err);
 
-                err = obd_ioctl_popdata((void *)arg, data, len);
-                if (err)
+               if (copy_to_user((void __user *)arg, data, len))
                         err = -EFAULT;
                 GOTO(out, err);
         }
         }
 
- out:
-        if (buf)
-                obd_ioctl_freedata(buf, len);
-        RETURN(err);
+out:
+       OBD_FREE_LARGE(buf, len);
+       RETURN(err);
 } /* class_handle_ioctl */
 
-#ifdef __KERNEL__
-extern cfs_psdev_t obd_psdev;
-#else
-void *obd_psdev = NULL;
-#endif
-
-#define OBD_INIT_CHECK
-#ifdef OBD_INIT_CHECK
-int obd_init_checks(void)
+/*
+ * unlocked_ioctl handler of the OBD control device (see obd_psdev_fops).
+ *
+ * Fails with -EACCES unless the caller has CFS_CAP_SYS_ADMIN and with
+ * -ENOTTY for any command in the 'T' (tty) ioctl group; every other command
+ * is handed to class_handle_ioctl() and its result is returned.
+ */
+static long obd_class_ioctl(struct file *filp, unsigned int cmd,
+                           unsigned long arg)
 {
-        __u64 u64val, div64val;
-        char buf[64];
-        int len, ret = 0;
+       int err = 0;
 
-        CDEBUG(D_INFO, "LPU64=%s, LPD64=%s, LPX64=%s\n", LPU64, LPD64, LPX64);
+       ENTRY;
+       /* Every ioctl on this device requires CFS_CAP_SYS_ADMIN */
+       if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+               RETURN(err = -EACCES);
 
-        CDEBUG(D_INFO, "OBD_OBJECT_EOF = "LPX64"\n", (__u64)OBD_OBJECT_EOF);
+       if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */
+               RETURN(err = -ENOTTY);
 
-        u64val = OBD_OBJECT_EOF;
-        CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val);
-        if (u64val != OBD_OBJECT_EOF) {
-                CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
-                       u64val, (int)sizeof(u64val));
-                ret = -EINVAL;
-        }
-        len = snprintf(buf, sizeof(buf), LPX64, u64val);
-        if (len != 18) {
-                CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len);
-                ret = -EINVAL;
-        }
+       err = class_handle_ioctl(cmd, (unsigned long)arg);
 
-        div64val = OBD_OBJECT_EOF;
-        CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val);
-        if (u64val != OBD_OBJECT_EOF) {
-                CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
-                       u64val, (int)sizeof(u64val));
-                ret = -EOVERFLOW;
-        }
-        if (u64val >> 8 != OBD_OBJECT_EOF >> 8) {
-                CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
-                       u64val, (int)sizeof(u64val));
-                return -EOVERFLOW;
-        }
-        if (do_div(div64val, 256) != (u64val & 255)) {
-                CERROR("do_div("LPX64",256) != "LPU64"\n", u64val, u64val &255);
-                return -EOVERFLOW;
-        }
-        if (u64val >> 8 != div64val) {
-                CERROR("do_div("LPX64",256) "LPU64" != "LPU64"\n",
-                       u64val, div64val, u64val >> 8);
-                return -EOVERFLOW;
-        }
-        len = snprintf(buf, sizeof(buf), LPX64, u64val);
-        if (len != 18) {
-                CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len);
-                ret = -EINVAL;
-        }
-        len = snprintf(buf, sizeof(buf), LPU64, u64val);
-        if (len != 20) {
-                CWARN("LPU64 wrong length! strlen(%s)=%d != 20\n", buf, len);
-                ret = -EINVAL;
-        }
-        len = snprintf(buf, sizeof(buf), LPD64, u64val);
-        if (len != 2) {
-                CWARN("LPD64 wrong length! strlen(%s)=%d != 2\n", buf, len);
-                ret = -EINVAL;
-        }
-        if ((u64val & ~CFS_PAGE_MASK) >= CFS_PAGE_SIZE) {
-                CWARN("mask failed: u64val "LPU64" >= "LPU64"\n", u64val,
-                      (__u64)CFS_PAGE_SIZE);
-                ret = -EINVAL;
-        }
-
-        return ret;
+       RETURN(err);
 }
-#else
-#define obd_init_checks() do {} while(0)
-#endif
 
-extern spinlock_t obd_types_lock;
-extern int class_procfs_init(void);
-extern int class_procfs_clean(void);
+/*
+ * File operations of the OBD control device.  Only ioctl is implemented.
+ * The table is never modified after build time, hence const.
+ */
+static const struct file_operations obd_psdev_fops = {
+       .owner          = THIS_MODULE,
+       .unlocked_ioctl = obd_class_ioctl,
+};
+
+/*
+ * Misc character device through which the ioctls above are reached; it is
+ * registered with a dynamically assigned minor by obdclass_init().
+ */
+struct miscdevice obd_psdev = {
+       .minor  = MISC_DYNAMIC_MINOR,
+       .name   = OBD_DEV_NAME,
+       .fops   = &obd_psdev_fops,
+};
+
+/*
+ * Parse the string literal 'value' with sysfs_memparse(), using 'def_unit'
+ * as the default unit, and compare the outcome with the expected return code
+ * '__rc' and, when parsing succeeds as expected, with the size 'expect'.
+ * Any disagreement is logged with CERROR().
+ *
+ * Evaluates to the sysfs_memparse() return code, except that a successful
+ * parse producing the wrong size evaluates to -ERANGE, so callers that treat
+ * non-zero as failure notice the mismatch instead of it only being logged.
+ * 'value' must be a string literal of fewer than 23 bytes including its NUL.
+ */
+#define test_string_to_size_err(value, expect, def_unit, __rc)                \
+({                                                                            \
+       u64 __size;                                                            \
+       int __ret;                                                             \
+                                                                              \
+       BUILD_BUG_ON(sizeof(value) >= 23);                                     \
+       __ret = sysfs_memparse(value, sizeof(value) - 1, &__size, def_unit);   \
+       if (__ret != (__rc)) {                                                 \
+               CERROR("string_helper: parsing '%s' expect rc %d != got %d\n", \
+                      value, __rc, __ret);                                    \
+       } else if (!__ret && (u64)(expect) != __size) {                        \
+               CERROR("string_helper: parsing '%s' expect %llu != got %llu\n",\
+                      value, (u64)(expect), __size);                          \
+               __ret = -ERANGE;                                               \
+       }                                                                      \
+       __ret;                                                                 \
+})
+/* Same check for a string that is expected to parse successfully. */
+#define test_string_to_size_one(value, expect, def_unit)                      \
+       test_string_to_size_err(value, expect, def_unit, 0)
+
+/*
+ * Load-time self-test of 64-bit arithmetic/formatting and of size parsing.
+ *
+ * The first part checks that OBD_OBJECT_EOF survives assignment, shifting
+ * and do_div() in a __u64, that snprintf() renders it with the expected
+ * widths, and that masking with ~PAGE_MASK stays below PAGE_SIZE.  The
+ * second part feeds a list of strings to sysfs_memparse() through the
+ * test_string_to_size_*() macros, stopping after the first group that
+ * reports a problem.
+ *
+ * Returns 0 when no check fails, otherwise a non-zero value: a negative
+ * errno from the first part or from the "should be rejected" checks, or
+ * the sum of the macro results of a failing group.
+ */
+static int __init obd_init_checks(void)
+{
+       __u64 u64val, div64val;
+       char buf[64];
+       int len, ret = 0;
+
+       CDEBUG(D_INFO, "OBD_OBJECT_EOF = %#llx\n", (__u64)OBD_OBJECT_EOF);
+
+       u64val = OBD_OBJECT_EOF;
+       CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val);
+       if (u64val != OBD_OBJECT_EOF) {
+               CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n",
+                      u64val, (int)sizeof(u64val));
+               ret = -EINVAL;
+       }
+       /* "0x" plus 16 hex digits */
+       len = snprintf(buf, sizeof(buf), "%#llx", u64val);
+       if (len != 18) {
+               CERROR("u64 hex wrong length, strlen(%s)=%d != 18\n", buf, len);
+               ret = -EINVAL;
+       }
 
-#ifdef __KERNEL__
-static int __init init_obdclass(void)
-#else
-int init_obdclass(void)
-#endif
+       /* NOTE(review): the message and test below look at u64val again;
+        * div64val is first examined by the do_div() check further down.
+        */
+       div64val = OBD_OBJECT_EOF;
+       CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val);
+       if (u64val != OBD_OBJECT_EOF) {
+               CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n",
+                      u64val, (int)sizeof(u64val));
+               ret = -EOVERFLOW;
+       }
+       if (u64val >> 8 != OBD_OBJECT_EOF >> 8) {
+               CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n",
+                      u64val, (int)sizeof(u64val));
+               ret = -EOVERFLOW;
+       }
+       /* do_div() leaves the quotient in div64val and yields the remainder */
+       if (do_div(div64val, 256) != (u64val & 255)) {
+               CERROR("do_div(%#llx,256) != %llu\n", u64val, u64val & 255);
+               ret = -EOVERFLOW;
+       }
+       if (u64val >> 8 != div64val) {
+               CERROR("do_div(%#llx,256) %llu != %llu\n",
+                      u64val, div64val, u64val >> 8);
+               ret = -EOVERFLOW;
+       }
+       len = snprintf(buf, sizeof(buf), "%#llx", u64val);
+       if (len != 18) {
+               CERROR("u64 hex wrong length! strlen(%s)=%d != 18\n", buf, len);
+               ret = -EINVAL;
+       }
+       len = snprintf(buf, sizeof(buf), "%llu", u64val);
+       if (len != 20) {
+               CERROR("u64 wrong length! strlen(%s)=%d != 20\n", buf, len);
+               ret = -EINVAL;
+       }
+       /* formatted as signed, the value is expected to take two characters */
+       len = snprintf(buf, sizeof(buf), "%lld", u64val);
+       if (len != 2) {
+               CERROR("s64 wrong length! strlen(%s)=%d != 2\n", buf, len);
+               ret = -EINVAL;
+       }
+       if ((u64val & ~PAGE_MASK) >= PAGE_SIZE) {
+               CERROR("mask failed: u64val %llu >= %llu\n", u64val,
+                      (__u64)PAGE_SIZE);
+               ret = -EINVAL;
+       }
+       if (ret)
+               RETURN(ret);
+
+       /* invalid string */
+       /* a zero macro result here means the malformed string was accepted */
+       if (!test_string_to_size_err("256B34", 256, "B", -EINVAL)) {
+               CERROR("string_helpers: format should be number then units\n");
+               ret = -EINVAL;
+       }
+       if (!test_string_to_size_err("132OpQ", 132, "B", -EINVAL)) {
+               CERROR("string_helpers: invalid units should be rejected\n");
+               ret = -EINVAL;
+       }
+       if (!test_string_to_size_err("1.82B", 1, "B", -EINVAL)) {
+               CERROR("string_helpers: 'B' with '.' should be invalid\n");
+               ret = -EINVAL;
+       }
+       if (test_string_to_size_one("343\n", 343, "B")) {
+               CERROR("string_helpers: should ignore newline\n");
+               ret = -EINVAL;
+       }
+       if (ret)
+               RETURN(ret);
+
+       /* memparse unit handling */
+       /* results are summed: a non-zero total means some string failed */
+       ret = 0;
+       ret += test_string_to_size_one("0B", 0, "B");
+       ret += test_string_to_size_one("512B", 512, "B");
+       ret += test_string_to_size_one("1.067kB", 1067, "B");
+       ret += test_string_to_size_one("1.042KiB", 1067, "B");
+       ret += test_string_to_size_one("8", 8388608, "M");
+       ret += test_string_to_size_one("65536", 65536, "B");
+       ret += test_string_to_size_one("128", 131072, "K");
+       ret += test_string_to_size_one("1M", 1048576, "B");
+       ret += test_string_to_size_one("0.5T", 549755813888ULL, "T");
+       ret += test_string_to_size_one("256.5G", 275414777856ULL, "G");
+       if (ret)
+               RETURN(ret);
+
+       /* string helper values */
+       ret += test_string_to_size_one("16", 16777216, "MiB");
+       ret += test_string_to_size_one("8.39MB", 8390000, "MiB");
+       ret += test_string_to_size_one("8.00MiB", 8388608, "MiB");
+       ret += test_string_to_size_one("256GB", 256000000000ULL, "GiB");
+       ret += test_string_to_size_one("238.731GiB", 256335459385ULL, "GiB");
+       if (ret)
+               RETURN(ret);
+
+       /* huge values */
+       ret += test_string_to_size_one("0.4TB", 400000000000ULL, "TiB");
+       ret += test_string_to_size_one("12.5TiB", 13743895347200ULL, "TiB");
+       ret += test_string_to_size_one("2PB", 2000000000000000ULL, "PiB");
+       ret += test_string_to_size_one("16PiB", 18014398509481984ULL, "PiB");
+       if (ret)
+               RETURN(ret);
+
+       /* huge values should overflow */
+       /* as above, a zero macro result means the value was wrongly accepted */
+       if (!test_string_to_size_err("1000EiB", 0, "EiB", -EOVERFLOW)) {
+               CERROR("string_helpers: failed to detect binary overflow\n");
+               ret = -EINVAL;
+       }
+       if (!test_string_to_size_err("1000EB", 0, "EiB", -EOVERFLOW)) {
+               CERROR("string_helpers: failed to detect decimal overflow\n");
+               ret = -EINVAL;
+       }
+
+       return ret;
+}
+
+static int __init obdclass_init(void)
 {
-        int i, err;
-#ifdef __KERNEL__
-        int lustre_register_fs(void);
+       int err;
 
-        for (i = CAPA_SITE_CLIENT; i < CAPA_SITE_MAX; i++)
-                CFS_INIT_LIST_HEAD(&capa_list[i]);
-#endif
+       LCONSOLE_INFO("Lustre: Build Version: "LUSTRE_VERSION_STRING"\n");
+
+       libcfs_kkuc_init();
 
-        LCONSOLE_INFO("Lustre: Build Version: "BUILD_VERSION"\n");
+       err = obd_init_checks();
+       if (err)
+               return err;
 
-       spin_lock_init(&obd_types_lock);
-        obd_zombie_impexp_init();
-#ifdef LPROCFS
-        obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM,
+#ifdef CONFIG_PROC_FS
+       obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM,
                                         LPROCFS_STATS_FLAG_NONE |
                                         LPROCFS_STATS_FLAG_IRQ_SAFE);
-        if (obd_memory == NULL) {
-                CERROR("kmalloc of 'obd_memory' failed\n");
-                RETURN(-ENOMEM);
-        }
+       if (obd_memory == NULL) {
+               CERROR("kmalloc of 'obd_memory' failed\n");
+               return -ENOMEM;
+       }
 
-        lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT,
-                             LPROCFS_CNTR_AVGMINMAX,
-                             "memused", "bytes");
-        lprocfs_counter_init(obd_memory, OBD_MEMORY_PAGES_STAT,
-                             LPROCFS_CNTR_AVGMINMAX,
-                             "pagesused", "pages");
+       lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT,
+                            LPROCFS_CNTR_AVGMINMAX,
+                            "memused", "bytes");
 #endif
-        err = obd_init_checks();
-        if (err == -EOVERFLOW)
-                return err;
+       err = obd_zombie_impexp_init();
+       if (err)
+               goto cleanup_obd_memory;
 
-        class_init_uuidlist();
-        err = class_handle_init();
-        if (err)
-                return err;
+       err = class_handle_init();
+       if (err)
+               goto cleanup_zombie_impexp;
 
-        CFS_INIT_LIST_HEAD(&obd_types);
+       err = misc_register(&obd_psdev);
+       if (err) {
+               CERROR("cannot register OBD miscdevice: err = %d\n", err);
+               goto cleanup_class_handle;
+       }
 
-        err = cfs_psdev_register(&obd_psdev);
-        if (err) {
-                CERROR("cannot register %d err %d\n", OBD_DEV_MINOR, err);
-                return err;
-        }
+       /* Default the dirty page cache cap to 1/2 of system memory.
+        * For clients with less memory, a larger fraction is needed
+        * for other purposes (mostly for BGL). */
+       if (cfs_totalram_pages() <= 512 << (20 - PAGE_SHIFT))
+               obd_max_dirty_pages = cfs_totalram_pages() / 4;
+       else
+               obd_max_dirty_pages = cfs_totalram_pages() / 2;
 
-        /* This struct is already zeroed for us (static global) */
-        for (i = 0; i < class_devno_max(); i++)
-                obd_devs[i] = NULL;
-
-        /* Default the dirty page cache cap to 1/2 of system memory.
-         * For clients with less memory, a larger fraction is needed
-         * for other purposes (mostly for BGL). */
-        if (cfs_num_physpages <= 512 << (20 - CFS_PAGE_SHIFT))
-                obd_max_dirty_pages = cfs_num_physpages / 4;
-        else
-                obd_max_dirty_pages = cfs_num_physpages / 2;
-
-        err = obd_init_caches();
-        if (err)
-                return err;
-#ifdef __KERNEL__
-        err = class_procfs_init();
-        if (err)
-                return err;
-#endif
+       err = obd_init_caches();
+       if (err)
+               goto cleanup_deregister;
+
+       err = class_procfs_init();
+       if (err)
+               goto cleanup_caches;
+
+       err = lu_global_init();
+       if (err)
+               goto cleanup_class_procfs;
+
+       err = cl_global_init();
+       if (err != 0)
+               goto cleanup_lu_global;
 
-        err = lu_global_init();
-        if (err)
-                return err;
+#ifdef HAVE_SERVER_SUPPORT
+       err = dt_global_init();
+       if (err != 0)
+               goto cleanup_cl_global;
+
+       err = lu_ucred_global_init();
+       if (err != 0)
+               goto cleanup_dt_global;
+#endif /* HAVE_SERVER_SUPPORT */
 
        err = llog_info_init();
        if (err)
-               return err;
+#ifdef HAVE_SERVER_SUPPORT
+               goto cleanup_lu_ucred_global;
+#else /* !HAVE_SERVER_SUPPORT */
+               goto cleanup_cl_global;
+#endif /* HAVE_SERVER_SUPPORT */
+
+       err = lustre_register_fs();
+
+       /* simulate a late OOM situation now to require all
+        * alloc'ed/initialized resources to be freed */
+       if (OBD_FAIL_CHECK(OBD_FAIL_OBDCLASS_MODULE_LOAD)) {
+               /* fake error but filesystem has been registered */
+               lustre_unregister_fs();
+               /* force error to ensure module will be unloaded/cleaned */
+               err = -ENOMEM;
+       }
+
+       if (err)
+               goto cleanup_llog_info;
+
+       return 0;
+
+cleanup_llog_info:
+       llog_info_fini();
+
+#ifdef HAVE_SERVER_SUPPORT
+cleanup_lu_ucred_global:
+       lu_ucred_global_fini();
 
-#ifdef __KERNEL__
-        err = lustre_register_fs();
+cleanup_dt_global:
+       dt_global_fini();
+#endif /* HAVE_SERVER_SUPPORT */
+
+cleanup_cl_global:
+       cl_global_fini();
+
+cleanup_lu_global:
+       lu_global_fini();
+
+cleanup_class_procfs:
+       class_procfs_clean();
+
+cleanup_caches:
+       obd_cleanup_caches();
+
+cleanup_deregister:
+       misc_deregister(&obd_psdev);
+
+cleanup_class_handle:
+       class_handle_cleanup();
+
+cleanup_zombie_impexp:
+       obd_zombie_impexp_stop();
+
+cleanup_obd_memory:
+#ifdef CONFIG_PROC_FS
+       lprocfs_free_stats(&obd_memory);
 #endif
 
-        return err;
+       return err;
 }
 
 void obd_update_maxusage(void)
 {
-       __u64 max1, max2;
+       __u64 max;
 
-       max1 = obd_pages_sum();
-       max2 = obd_memory_sum();
+       max = obd_memory_sum();
 
        spin_lock(&obd_updatemax_lock);
-       if (max1 > obd_max_pages)
-               obd_max_pages = max1;
-       if (max2 > obd_max_alloc)
-               obd_max_alloc = max2;
+       if (max > obd_max_alloc)
+               obd_max_alloc = max;
        spin_unlock(&obd_updatemax_lock);
 }
 EXPORT_SYMBOL(obd_update_maxusage);
 
-#ifdef LPROCFS
+#ifdef CONFIG_PROC_FS
 __u64 obd_memory_max(void)
 {
        __u64 ret;
 
+       obd_update_maxusage();
        spin_lock(&obd_updatemax_lock);
        ret = obd_max_alloc;
        spin_unlock(&obd_updatemax_lock);
 
        return ret;
 }
-EXPORT_SYMBOL(obd_memory_max);
+#endif /* CONFIG_PROC_FS */
 
-__u64 obd_pages_max(void)
+static void __exit obdclass_exit(void)
 {
-       __u64 ret;
+#ifdef CONFIG_PROC_FS
+       __u64 memory_leaked;
+       __u64 memory_max;
+#endif /* CONFIG_PROC_FS */
+       ENTRY;
 
-       spin_lock(&obd_updatemax_lock);
-       ret = obd_max_pages;
-       spin_unlock(&obd_updatemax_lock);
+       lustre_unregister_fs();
 
-       return ret;
+       misc_deregister(&obd_psdev);
+       llog_info_fini();
+#ifdef HAVE_SERVER_SUPPORT
+       lu_ucred_global_fini();
+       dt_global_fini();
+#endif /* HAVE_SERVER_SUPPORT */
+       cl_global_fini();
+       lu_global_fini();
+
+       obd_cleanup_caches();
+
+       class_procfs_clean();
+
+       class_handle_cleanup();
+       class_del_uuid(NULL); /* Delete all UUIDs. */
+       obd_zombie_impexp_stop();
+
+#ifdef CONFIG_PROC_FS
+       memory_leaked = obd_memory_sum();
+       memory_max = obd_memory_max();
+
+       lprocfs_free_stats(&obd_memory);
+       /* the below message is checked in test-framework.sh check_mem_leak() */
+       CDEBUG((memory_leaked) ? D_ERROR : D_INFO,
+              "obd_memory max: %llu, leaked: %llu\n",
+              memory_max, memory_leaked);
+#endif /* CONFIG_PROC_FS */
+
+       EXIT;
 }
-EXPORT_SYMBOL(obd_pages_max);
-#endif
 
-/* liblustre doesn't call cleanup_obdclass, apparently.  we carry on in this
- * ifdef to the end of the file to cover module and versioning goo.*/
-#ifdef __KERNEL__
-static void cleanup_obdclass(void)
+void obd_heat_clear(struct obd_heat_instance *instance, int count)
 {
-        int i;
-        int lustre_unregister_fs(void);
-        __u64 memory_leaked, pages_leaked;
-        __u64 memory_max, pages_max;
-        ENTRY;
+       ENTRY;
 
-        lustre_unregister_fs();
+       memset(instance, 0, sizeof(*instance) * count);
+       RETURN_EXIT;
+}
+EXPORT_SYMBOL(obd_heat_clear);
 
-        cfs_psdev_deregister(&obd_psdev);
-        for (i = 0; i < class_devno_max(); i++) {
-                struct obd_device *obd = class_num2obd(i);
-                if (obd && obd->obd_set_up &&
-                    OBT(obd) && OBP(obd, detach)) {
-                        /* XXX should this call generic detach otherwise? */
-                        LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
-                        OBP(obd, detach)(obd);
-                }
-        }
-       llog_info_fini();
-        lu_global_fini();
+/*
+ * The file heat is calculated for every time interval period I. The access
+ * frequency during each period is counted. The file heat is only recalculated
+ * at the end of a time period.  And a percentage of the former file heat is
+ * lost when recalculated. The recursion formula to calculate the heat of the
+ * file f is as follows:
+ *
+ * Hi+1(f) = (1-P)*Hi(f)+ P*Ci
+ *
+ * Where Hi is the heat value in the period between time points i*I and
+ * (i+1)*I; Ci is the access count in the period; the symbol P refers to the
+ * weight of Ci. The larger the value of P is, the more influence Ci has on
+ * the file heat.
+ */
+void obd_heat_decay(struct obd_heat_instance *instance,  __u64 time_second,
+                   unsigned int weight, unsigned int period_second)
+{
+       u64 second;
 
-        obd_cleanup_caches();
-        obd_sysctl_clean();
+       ENTRY;
 
-        class_procfs_clean();
+       if (instance->ohi_time_second > time_second) {
+               obd_heat_clear(instance, 1);
+               RETURN_EXIT;
+       }
 
-        class_handle_cleanup();
-        class_exit_uuidlist();
-        obd_zombie_impexp_stop();
+       if (instance->ohi_time_second == 0)
+               RETURN_EXIT;
+
+       for (second = instance->ohi_time_second + period_second;
+            second < time_second;
+            second += period_second) {
+               instance->ohi_heat = instance->ohi_heat *
+                               (256 - weight) / 256 +
+                               instance->ohi_count * weight / 256;
+               instance->ohi_count = 0;
+               instance->ohi_time_second = second;
+       }
+       RETURN_EXIT;
+}
+EXPORT_SYMBOL(obd_heat_decay);
 
-        memory_leaked = obd_memory_sum();
-        pages_leaked = obd_pages_sum();
+__u64 obd_heat_get(struct obd_heat_instance *instance, unsigned int time_second,
+                  unsigned int weight, unsigned int period_second)
+{
+       ENTRY;
 
-        memory_max = obd_memory_max();
-        pages_max = obd_pages_max();
+       obd_heat_decay(instance, time_second, weight, period_second);
 
-        lprocfs_free_stats(&obd_memory);
-        CDEBUG((memory_leaked) ? D_ERROR : D_INFO,
-               "obd_memory max: "LPU64", leaked: "LPU64"\n",
-               memory_max, memory_leaked);
-        CDEBUG((pages_leaked) ? D_ERROR : D_INFO,
-               "obd_memory_pages max: "LPU64", leaked: "LPU64"\n",
-               pages_max, pages_leaked);
+       if (instance->ohi_count == 0)
+               RETURN(instance->ohi_heat);
 
-        EXIT;
+       RETURN(instance->ohi_heat * (256 - weight) / 256 +
+              instance->ohi_count * weight / 256);
 }
+EXPORT_SYMBOL(obd_heat_get);
 
-MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
-MODULE_DESCRIPTION("Lustre Class Driver Build Version: " BUILD_VERSION);
+void obd_heat_add(struct obd_heat_instance *instance,
+                 unsigned int time_second,  __u64 count,
+                 unsigned int weight, unsigned int period_second)
+{
+       ENTRY;
+
+       obd_heat_decay(instance, time_second, weight, period_second);
+       if (instance->ohi_time_second == 0) {
+               instance->ohi_time_second = time_second;
+               instance->ohi_heat = 0;
+               instance->ohi_count = count;
+       } else {
+               instance->ohi_count += count;
+       }
+       RETURN_EXIT;
+}
+EXPORT_SYMBOL(obd_heat_add);
+
+MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Class Driver");
+MODULE_VERSION(LUSTRE_VERSION_STRING);
 MODULE_LICENSE("GPL");
 
-cfs_module(obdclass, LUSTRE_VERSION_STRING, init_obdclass, cleanup_obdclass);
-#endif
+module_init(obdclass_init);
+module_exit(obdclass_exit);