Whamcloud - gitweb
LU-6142 lustre: don't take spinlock to read a 'long'.
[fs/lustre-release.git] / lustre / llite / lproc_llite.c
index e015ba6..f6a7699 100644 (file)
@@ -75,13 +75,11 @@ int llite_tunables_register(void)
                goto free_kobj;
 
        llite_root = debugfs_create_dir("llite", debugfs_lustre_root);
-       if (IS_ERR_OR_NULL(llite_root)) {
-               rc = llite_root ? PTR_ERR(llite_root) : -ENOMEM;
-               llite_root = NULL;
+       return 0;
+
 free_kobj:
-               kobject_put(llite_kobj);
-               llite_kobj = NULL;
-       }
+       kobject_put(llite_kobj);
+       llite_kobj = NULL;
 
        return rc;
 }
@@ -326,13 +324,9 @@ static ssize_t max_read_ahead_mb_show(struct kobject *kobj,
 {
        struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
                                              ll_kset.kobj);
-       unsigned long ra_max_mb;
-
-       spin_lock(&sbi->ll_lock);
-       ra_max_mb = PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages);
-       spin_unlock(&sbi->ll_lock);
 
-       return snprintf(buf, PAGE_SIZE, "%lu\n", ra_max_mb);
+       return scnprintf(buf, PAGE_SIZE, "%lu\n",
+                       PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages));
 }
 
 static ssize_t max_read_ahead_mb_store(struct kobject *kobj,
@@ -373,13 +367,9 @@ static ssize_t max_read_ahead_per_file_mb_show(struct kobject *kobj,
 {
        struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
                                              ll_kset.kobj);
-       unsigned long ra_max_file_mb;
 
-       spin_lock(&sbi->ll_lock);
-       ra_max_file_mb = PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages_per_file);
-       spin_unlock(&sbi->ll_lock);
-
-       return snprintf(buf, PAGE_SIZE, "%lu\n", ra_max_file_mb);
+       return scnprintf(buf, PAGE_SIZE, "%lu\n",
+                        PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages_per_file));
 }
 
 static ssize_t max_read_ahead_per_file_mb_store(struct kobject *kobj,
@@ -417,13 +407,9 @@ static ssize_t max_read_ahead_whole_mb_show(struct kobject *kobj,
 {
        struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
                                              ll_kset.kobj);
-       unsigned long ra_max_whole_mb;
-
-       spin_lock(&sbi->ll_lock);
-       ra_max_whole_mb = PAGES_TO_MiB(sbi->ll_ra_info.ra_max_read_ahead_whole_pages);
-       spin_unlock(&sbi->ll_lock);
 
-       return snprintf(buf, PAGE_SIZE, "%lu\n", ra_max_whole_mb);
+       return scnprintf(buf, PAGE_SIZE, "%lu\n",
+                        PAGES_TO_MiB(sbi->ll_ra_info.ra_max_read_ahead_whole_pages));
 }
 
 static ssize_t max_read_ahead_whole_mb_store(struct kobject *kobj,
@@ -467,8 +453,10 @@ static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v)
        long max_cached_mb;
        long unused_mb;
 
+       mutex_lock(&cache->ccc_max_cache_mb_lock);
        max_cached_mb = PAGES_TO_MiB(cache->ccc_lru_max);
        unused_mb = PAGES_TO_MiB(atomic_long_read(&cache->ccc_lru_left));
+       mutex_unlock(&cache->ccc_max_cache_mb_lock);
        seq_printf(m, "users: %d\n"
                      "max_cached_mb: %ld\n"
                      "used_mb: %ld\n"
@@ -522,9 +510,8 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file,
        /* Allow enough cache so clients can make well-formed RPCs */
        pages_number = max_t(long, pages_number, PTLRPC_MAX_BRW_PAGES);
 
-       spin_lock(&sbi->ll_lock);
+       mutex_lock(&cache->ccc_max_cache_mb_lock);
        diff = pages_number - cache->ccc_lru_max;
-       spin_unlock(&sbi->ll_lock);
 
        /* easy - add more LRU slots. */
        if (diff >= 0) {
@@ -534,7 +521,7 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file,
 
        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
-               RETURN(PTR_ERR(env));
+               GOTO(out_unlock, rc = PTR_ERR(env));
 
        diff = -diff;
        while (diff > 0) {
@@ -542,18 +529,21 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file,
 
                /* reduce LRU budget from free slots. */
                do {
-                       long ov, nv, retv;
+                       long lru_left_old, lru_left_new, lru_left_ret;
 
-                       ov = atomic_long_read(&cache->ccc_lru_left);
-                       if (ov == 0)
+                       lru_left_old = atomic_long_read(&cache->ccc_lru_left);
+                       if (lru_left_old == 0)
                                break;
 
-                       nv = ov > diff ? ov - diff : 0;
-                       retv = atomic_long_cmpxchg(&cache->ccc_lru_left,
-                                                  ov, nv);
-                       if (likely(ov == retv)) {
-                               diff -= ov - nv;
-                               nrpages += ov - nv;
+                       lru_left_new = lru_left_old > diff ?
+                                       lru_left_old - diff : 0;
+                       lru_left_ret =
+                               atomic_long_cmpxchg(&cache->ccc_lru_left,
+                                                   lru_left_old,
+                                                   lru_left_new);
+                       if (likely(lru_left_old == lru_left_ret)) {
+                               diff -= lru_left_old - lru_left_new;
+                               nrpages += lru_left_old - lru_left_new;
                                break;
                        }
                } while (1);
@@ -566,8 +556,11 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file,
                        break;
                }
 
+               /* Request extra free slots to avoid them all being used
+                * by other processes before this can continue shrinking.
+                */
+               tmp = diff + min_t(long, diff, MiB_TO_PAGES(1024));
                /* difficult - have to ask OSCs to drop LRU slots. */
-               tmp = diff << 1;
                rc = obd_set_info_async(env, sbi->ll_dt_exp,
                                sizeof(KEY_CACHE_LRU_SHRINK),
                                KEY_CACHE_LRU_SHRINK,
@@ -579,13 +572,13 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file,
 
 out:
        if (rc >= 0) {
-               spin_lock(&sbi->ll_lock);
                cache->ccc_lru_max = pages_number;
-               spin_unlock(&sbi->ll_lock);
                rc = count;
        } else {
                atomic_long_add(nrpages, &cache->ccc_lru_left);
        }
+out_unlock:
+       mutex_unlock(&cache->ccc_max_cache_mb_lock);
        return rc;
 }
 LDEBUGFS_SEQ_FOPS(ll_max_cached_mb);
@@ -1105,18 +1098,21 @@ static ssize_t max_read_ahead_async_active_store(struct kobject *kobj,
        if (rc)
                return rc;
 
-       if (val < 1 || val > WQ_UNBOUND_MAX_ACTIVE) {
-               CERROR("%s: cannot set max_read_ahead_async_active=%u %s than %u\n",
-                      sbi->ll_fsname, val,
-                      val < 1 ? "smaller" : "larger",
-                      val < 1 ? 1 : WQ_UNBOUND_MAX_ACTIVE);
+       /**
+        * It doesn't make any sense to make it exceed what
+        * workqueue could actually support. This can easily
+        * oversubscribe the cores but Lustre internally
+        * throttles to avoid those impacts.
+        */
+       if (val > WQ_UNBOUND_MAX_ACTIVE) {
+               CERROR("%s: cannot set max_read_ahead_async_active=%u larger than %u\n",
+                      sbi->ll_fsname, val, WQ_UNBOUND_MAX_ACTIVE);
                return -ERANGE;
        }
 
        spin_lock(&sbi->ll_lock);
        sbi->ll_ra_info.ra_async_max_active = val;
        spin_unlock(&sbi->ll_lock);
-       workqueue_set_max_active(sbi->ll_ra_info.ll_readahead_wq, val);
 
        return count;
 }
@@ -1455,9 +1451,9 @@ out_free_kernbuff:
        OBD_FREE(kernbuf, count + 1);
        return rc ? rc : count;
 }
-LPROC_SEQ_FOPS(ll_pcc);
+LDEBUGFS_SEQ_FOPS(ll_pcc);
 
-struct lprocfs_vars lprocfs_llite_obd_vars[] = {
+struct ldebugfs_vars lprocfs_llite_obd_vars[] = {
        { .name =       "site",
          .fops =       &ll_site_stats_fops                     },
        { .name =       "max_cached_mb",
@@ -1529,18 +1525,14 @@ static struct kobj_type sbi_ktype = {
        .release        = sbi_kobj_release,
 };
 
-#define LPROCFS_TYPE_LATENCY \
-       (LPROCFS_TYPE_USEC | LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV)
 static const struct llite_file_opcode {
        __u32           opcode;
        __u32           type;
        const char      *opname;
 } llite_opcode_table[LPROC_LL_FILE_OPCODES] = {
        /* file operation */
-       { LPROC_LL_READ_BYTES,  LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES,
-               "read_bytes" },
-       { LPROC_LL_WRITE_BYTES, LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES,
-               "write_bytes" },
+       { LPROC_LL_READ_BYTES,  LPROCFS_TYPE_BYTES_FULL, "read_bytes" },
+       { LPROC_LL_WRITE_BYTES, LPROCFS_TYPE_BYTES_FULL, "write_bytes" },
        { LPROC_LL_READ,        LPROCFS_TYPE_LATENCY,   "read" },
        { LPROC_LL_WRITE,       LPROCFS_TYPE_LATENCY,   "write" },
        { LPROC_LL_IOCTL,       LPROCFS_TYPE_REQS,      "ioctl" },
@@ -1557,6 +1549,7 @@ static const struct llite_file_opcode {
        { LPROC_LL_TRUNC,       LPROCFS_TYPE_LATENCY,   "truncate" },
        { LPROC_LL_FLOCK,       LPROCFS_TYPE_LATENCY,   "flock" },
        { LPROC_LL_GETATTR,     LPROCFS_TYPE_LATENCY,   "getattr" },
+       { LPROC_LL_FALLOCATE,   LPROCFS_TYPE_LATENCY, "fallocate"},
        /* dir inode operation */
        { LPROC_LL_CREATE,      LPROCFS_TYPE_LATENCY,   "create" },
        { LPROC_LL_LINK,        LPROCFS_TYPE_LATENCY,   "link" },
@@ -1618,7 +1611,7 @@ int ll_debugfs_register_super(struct super_block *sb, const char *name)
 {
        struct lustre_sb_info *lsi = s2lsi(sb);
        struct ll_sb_info *sbi = ll_s2sbi(sb);
-       int err, id, rc;
+       int err, id;
 
        ENTRY;
        LASSERT(sbi);
@@ -1626,35 +1619,21 @@ int ll_debugfs_register_super(struct super_block *sb, const char *name)
        if (IS_ERR_OR_NULL(llite_root))
                goto out_ll_kset;
 
-       sbi->ll_debugfs_entry = ldebugfs_register(name, llite_root,
-                                                 lprocfs_llite_obd_vars, sb);
-       if (IS_ERR_OR_NULL(sbi->ll_debugfs_entry)) {
-               err = sbi->ll_debugfs_entry ? PTR_ERR(sbi->ll_debugfs_entry) :
-                                             -ENOMEM;
-               sbi->ll_debugfs_entry = NULL;
-               RETURN(err);
-       }
+       sbi->ll_debugfs_entry = debugfs_create_dir(name, llite_root);
+       ldebugfs_add_vars(sbi->ll_debugfs_entry, lprocfs_llite_obd_vars, sb);
 
-       rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "dump_page_cache",0444,
-                                &vvp_dump_pgcache_file_ops, sbi);
-       if (rc)
-               CWARN("Error adding the dump_page_cache file\n");
+       debugfs_create_file("dump_page_cache", 0444, sbi->ll_debugfs_entry, sbi,
+                           &vvp_dump_pgcache_file_ops);
 
-       rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "extents_stats", 0644,
-                                &ll_rw_extents_stats_fops, sbi);
-       if (rc)
-               CWARN("Error adding the extent_stats file\n");
+       debugfs_create_file("extents_stats", 0644, sbi->ll_debugfs_entry, sbi,
+                                &ll_rw_extents_stats_fops);
 
-       rc = ldebugfs_seq_create(sbi->ll_debugfs_entry,
-                                "extents_stats_per_process", 0644,
-                                &ll_rw_extents_stats_pp_fops, sbi);
-       if (rc)
-               CWARN("Error adding the extents_stats_per_process file\n");
+       debugfs_create_file("extents_stats_per_process", 0644,
+                           sbi->ll_debugfs_entry, sbi,
+                           &ll_rw_extents_stats_pp_fops);
 
-       rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "offset_stats", 0644,
-                                &ll_rw_offset_stats_fops, sbi);
-       if (rc)
-               CWARN("Error adding the offset_stats file\n");
+       debugfs_create_file("offset_stats", 0644, sbi->ll_debugfs_entry, sbi,
+                           &ll_rw_offset_stats_fops);
 
        /* File operations stats */
        sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES,
@@ -1665,26 +1644,21 @@ int ll_debugfs_register_super(struct super_block *sb, const char *name)
        /* do counter init */
        for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) {
                u32 type = llite_opcode_table[id].type;
-               void *ptr = NULL;
+               void *ptr = "unknown";
 
                if (type & LPROCFS_TYPE_REQS)
                        ptr = "reqs";
                else if (type & LPROCFS_TYPE_BYTES)
                        ptr = "bytes";
-               else if (type & LPROCFS_TYPE_PAGES)
-                       ptr = "pages";
                else if (type & LPROCFS_TYPE_USEC)
                        ptr = "usec";
                lprocfs_counter_init(sbi->ll_stats,
-                                    llite_opcode_table[id].opcode,
-                                    (type & LPROCFS_CNTR_AVGMINMAX),
+                                    llite_opcode_table[id].opcode, type,
                                     llite_opcode_table[id].opname, ptr);
        }
 
-       err = ldebugfs_register_stats(sbi->ll_debugfs_entry, "stats",
-                                     sbi->ll_stats);
-       if (err)
-               GOTO(out_stats, err);
+       debugfs_create_file("stats", 0644, sbi->ll_debugfs_entry,
+                           sbi->ll_stats, &ldebugfs_stats_seq_fops);
 
        sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string),
                                               LPROCFS_STATS_FLAG_NONE);
@@ -1695,10 +1669,8 @@ int ll_debugfs_register_super(struct super_block *sb, const char *name)
                lprocfs_counter_init(sbi->ll_ra_stats, id, 0,
                                     ra_stat_string[id], "pages");
 
-       err = ldebugfs_register_stats(sbi->ll_debugfs_entry, "read_ahead_stats",
-                                     sbi->ll_ra_stats);
-       if (err)
-               GOTO(out_ra_stats, err);
+       debugfs_create_file("read_ahead_stats", 0644, sbi->ll_debugfs_entry,
+                           sbi->ll_ra_stats, &ldebugfs_stats_seq_fops);
 
 out_ll_kset:
        /* Yes we also register sysfs mount kset here as well */
@@ -1769,26 +1741,26 @@ static void ll_display_extents_info(struct ll_rw_extents_info *io_extents,
                 write_tot += pp_info->pp_w_hist.oh_buckets[i];
         }
 
-        for(i = 0; i < LL_HIST_MAX; i++) {
-                r = pp_info->pp_r_hist.oh_buckets[i];
-                w = pp_info->pp_w_hist.oh_buckets[i];
-                read_cum += r;
-                write_cum += w;
-               end = BIT(i + LL_HIST_START - units);
+       for(i = 0; i < LL_HIST_MAX; i++) {
+               r = pp_info->pp_r_hist.oh_buckets[i];
+               w = pp_info->pp_w_hist.oh_buckets[i];
+               read_cum += r;
+               write_cum += w;
+               end = 1 << (i + LL_HIST_START - units);
                seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4u %4u  | "
                           "%14lu %4u %4u\n", start, *unitp, end, *unitp,
-                           (i == LL_HIST_MAX - 1) ? '+' : ' ',
-                           r, pct(r, read_tot), pct(read_cum, read_tot),
-                           w, pct(w, write_tot), pct(write_cum, write_tot));
-                start = end;
-               if (start == BIT(10)) {
-                        start = 1;
-                        units += 10;
-                        unitp++;
-                }
-                if (read_cum == read_tot && write_cum == write_tot)
-                        break;
-        }
+                          (i == LL_HIST_MAX - 1) ? '+' : ' ',
+                          r, pct(r, read_tot), pct(read_cum, read_tot),
+                          w, pct(w, write_tot), pct(write_cum, write_tot));
+               start = end;
+               if (start == (1 << 10)) {
+                       start = 1;
+                       units += 10;
+                       unitp++;
+               }
+               if (read_cum == read_tot && write_cum == write_tot)
+                       break;
+       }
 }
 
 static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v)
@@ -1949,7 +1921,7 @@ void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
                 lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist);
         }
 
-       for (i = 0; (count >= BIT(LL_HIST_START + i)) &&
+       for (i = 0; (count >= 1 << (LL_HIST_START + i)) &&
             (i < (LL_HIST_MAX - 1)); i++);
        if (rw == 0) {
                io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++;