lustre/osd-zfs/udmu.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  19  *
  20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  21  * CA 95054 USA or visit www.sun.com if you need additional information or
  22  * have any questions.
  23  *
  24  * GPL HEADER END
  25  */
  26 /*
  27  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  28  * Use is subject to license terms.
  29  */
  30 /*
  31  * Copyright (c) 2012, Intel Corporation.
  32  * Use is subject to license terms.
  33  */
  34 /*
  35  * This file is part of Lustre, http://www.lustre.org/
  36  * Lustre is a trademark of Sun Microsystems, Inc.
  37  *
  38  * lustre/osd-zfs/udmu.c
  39  * Module that interacts with the ZFS DMU and provides an abstraction
  40  * to the rest of Lustre.
  41  *
  42  * Author: Alex Zhuravlev <bzzz@whamcloud.com>
  43  * Author: Atul Vidwansa <atul.vidwansa@sun.com>
  44  * Author: Manoj Joseph <manoj.joseph@sun.com>
  45  * Author: Mike Pershin <tappro@whamcloud.com>
  46  */
  47
  48 #include <sys/dnode.h>
  49 #include <sys/dbuf.h>
  50 #include <sys/spa.h>
  51 #include <sys/stat.h>
  52 #include <sys/zap.h>
  53 #include <sys/spa_impl.h>
  54 #include <sys/zfs_znode.h>
  55 #include <sys/dmu_tx.h>
  56 #include <sys/dmu_objset.h>
  57 #include <sys/dsl_prop.h>
  58 #include <sys/sa_impl.h>
  59 #include <sys/txg.h>
  60
  61 #include <lustre/lustre_idl.h>  /* OBD_OBJECT_EOF */
  62 #include <lustre/lustre_user.h> /* struct obd_statfs */
  63
  64 #include "udmu.h"
  65
  66 int udmu_blk_insert_cost(void)
  67 {
  68         int max_blockshift, nr_blkptrshift;
  69
  70         /* max_blockshift is the log2 of the number of blocks needed to reach
  71          * the maximum filesize (that's to say 2^64) */
  72         max_blockshift = DN_MAX_OFFSET_SHIFT - SPA_MAXBLOCKSHIFT;
  73
  74         /* nr_blkptrshift is the log2 of the number of block pointers that can
  75          * be stored in an indirect block */
  76         CLASSERT(DN_MAX_INDBLKSHIFT > SPA_BLKPTRSHIFT);
  77         nr_blkptrshift = DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT;
  78
  79         /* max_blockshift / nr_blkptrshift is thus the maximum depth of the
  80          * tree. We add +1 for rounding purpose.
  81          * The tree depth times the indirect block size gives us the maximum
  82          * cost of inserting a block in the tree */
  83         return (max_blockshift / nr_blkptrshift + 1) * (1 << DN_MAX_INDBLKSHIFT);
  84 }
  85
  86 int udmu_objset_open(char *osname, udmu_objset_t *uos)
  87 {
  88         uint64_t refdbytes, availbytes, usedobjs, availobjs;
  89         uint64_t version = ZPL_VERSION;
  90         uint64_t sa_obj;
  91         int      error;
  92
  93         memset(uos, 0, sizeof(udmu_objset_t));
  94
  95         error = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, uos, &uos->os);
  96         if (error) {
  97                 uos->os = NULL;
  98                 goto out;
  99         }
 100
 101         /* Check ZFS version */
 102         error = zap_lookup(uos->os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
 103                            &version);
 104         if (error) {
 105                 CERROR("%s: Error looking up ZPL VERSION\n", osname);
 106                 /*
 107                  * We can't return ENOENT because that would mean the objset
 108                  * didn't exist.
 109                  */
 110                 error = EIO;
 111                 goto out;
 112         }
 113
 114         error = zap_lookup(uos->os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
 115                            &sa_obj);
 116         if (error)
 117                 goto out;
 118
 119         error = sa_setup(uos->os, sa_obj, zfs_attr_table, ZPL_END,
 120                          &uos->z_attr_table);
 121         if (error)
 122                 goto out;
 123
 124         error = zap_lookup(uos->os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 125                            &uos->root);
 126         if (error) {
 127                 CERROR("%s: Error looking up ZFS root object.\n", osname);
 128                 error = EIO;
 129                 goto out;
 130         }
 131         ASSERT(uos->root != 0);
 132
 133         /* Check that user/group usage tracking is supported */
 134         if (!dmu_objset_userused_enabled(uos->os) ||
 135                 DMU_USERUSED_DNODE(uos->os)->dn_type != DMU_OT_USERGROUP_USED ||
 136                 DMU_GROUPUSED_DNODE(uos->os)->dn_type != DMU_OT_USERGROUP_USED) {
 137                 CERROR("%s: Space accounting not supported by this target, "
 138                         "aborting\n", osname);
 139                 error = ENOTSUPP;
 140                 goto out;
 141         }
 142
 143         /*
 144          * as DMU doesn't maintain f_files absolutely actual (it's updated
 145          * at flush, not when object is create/destroyed) we've implemented
 146          * own counter which is initialized from on-disk at mount, then is
 147          * being maintained by DMU OSD
 148          */
 149         dmu_objset_space(uos->os, &refdbytes, &availbytes, &usedobjs,
 150                          &availobjs);
 151         uos->objects = usedobjs;
 152         spin_lock_init(&uos->lock);
 153
 154 out:
 155         if (error && uos->os != NULL)
 156                 dmu_objset_disown(uos->os, uos);
 157
 158         return error;
 159 }
 160
 161 void udmu_objset_close(udmu_objset_t *uos)
 162 {
 163         ASSERT(uos->os != NULL);
 164
 165         /*
 166          * Force a txg sync.  This should not be needed, neither for
 167          * correctness nor safety.  Presumably, we are only doing
 168          * this to force commit callbacks to be called sooner.
 169          */
 170         txg_wait_synced(dmu_objset_pool(uos->os), 0ULL);
 171
 172         /* close the object set */
 173         dmu_objset_disown(uos->os, uos);
 174
 175         uos->os = NULL;
 176 }
 177
 178 /* Estimate the number of objects from a number of blocks */
 179 static uint64_t udmu_objs_count_estimate(uint64_t refdbytes,
 180                                         uint64_t usedobjs,
 181                                         uint64_t nrblocks)
 182 {
 183         uint64_t est_objs, est_refdblocks, est_usedobjs;
 184
 185         /* Compute an nrblocks estimate based on the actual number of
 186          * dnodes that could fit in the space.  Since we don't know the
 187          * overhead associated with each dnode (xattrs, SAs, VDEV overhead,
 188          * etc) just using DNODE_SHIFT isn't going to give a good estimate.
 189          * Instead, compute an estimate based on the average space usage per
 190          * dnode, with an upper and lower cap.
 191          *
 192          * In case there aren't many dnodes or blocks used yet, add a small
 193          * correction factor using OSD_DNODE_EST_SHIFT.  This correction
 194          * factor gradually disappears as the number of real dnodes grows.
 195          * This also avoids the need to check for divide-by-zero later.
 196          */
 197         CLASSERT(OSD_DNODE_MIN_BLKSHIFT > 0);
 198         CLASSERT(OSD_DNODE_EST_BLKSHIFT > 0);
 199
 200         est_refdblocks = (refdbytes >> SPA_MAXBLOCKSHIFT) +
 201                 (OSD_DNODE_EST_COUNT << OSD_DNODE_EST_BLKSHIFT);
 202         est_usedobjs   = usedobjs + OSD_DNODE_EST_COUNT;
 203
 204         /* Average space/dnode more than maximum dnode size, use max dnode
 205          * size to estimate free dnodes from adjusted free blocks count.
 206          * OSTs typically use more than one block dnode so this case applies. */
 207         if (est_usedobjs <= est_refdblocks * 2) {
 208                 est_objs = nrblocks;
 209
 210         /* Average space/dnode smaller than min dnode size (probably due to
 211          * metadnode compression), use min dnode size to estimate the number of
 212          * objects.
 213          * An MDT typically uses below 512 bytes/dnode so this case applies. */
 214         } else if (est_usedobjs >= (est_refdblocks << OSD_DNODE_MIN_BLKSHIFT)) {
 215                 est_objs = nrblocks << OSD_DNODE_MIN_BLKSHIFT;
 216
 217                 /* Between the extremes, we try to use the average size of
 218                  * existing dnodes to compute the number of dnodes that fit
 219                  * into nrblocks:
 220                  *
 221                  * est_objs = nrblocks * (est_usedobjs / est_refblocks);
 222                  *
 223                  * but this may overflow 64 bits or become 0 if not handled well
 224                  *
 225                  * We know nrblocks is below (64 - 17 = 47) bits from
 226                  * SPA_MAXBLKSHIFT, and est_usedobjs is under 48 bits due to
 227                  * DN_MAX_OBJECT_SHIFT, which means that multiplying them may
 228                  * get as large as 2 ^ 95.
 229                  *
 230                  * We also know (est_usedobjs / est_refdblocks) is between 2 and
 231                  * 256, due to above checks, so we can safely compute this first.
 232                  * We care more about accuracy on the MDT (many dnodes/block)
 233                  * which is good because this is where truncation errors are
 234                  * smallest.  This adds 8 bits to nrblocks so we can use 7 bits
 235                  * to compute a fixed-point fraction and nrblocks can still fit
 236                  * in 64 bits. */
 237         } else {
 238                 unsigned dnodes_per_block = (est_usedobjs << 7)/est_refdblocks;
 239
 240                 est_objs = (nrblocks * dnodes_per_block) >> 7;
 241         }
 242         return est_objs;
 243 }
 244
 245 int udmu_objset_statfs(udmu_objset_t *uos, struct obd_statfs *osfs)
 246 {
 247         uint64_t refdbytes, availbytes, usedobjs, availobjs;
 248         uint64_t est_availobjs;
 249         uint64_t reserved;
 250
 251         dmu_objset_space(uos->os, &refdbytes, &availbytes, &usedobjs,
 252                         &availobjs);
 253
 254         /*
 255          * ZFS allows multiple block sizes.  For statfs, Linux makes no
 256          * proper distinction between bsize and frsize.  For calculations
 257          * of free and used blocks incorrectly uses bsize instead of frsize,
 258          * but bsize is also used as the optimal blocksize.  We return the
 259          * largest possible block size as IO size for the optimum performance
 260          * and scale the free and used blocks count appropriately.
 261          */
 262         osfs->os_bsize = 1ULL << SPA_MAXBLOCKSHIFT;
 263
 264         osfs->os_blocks = (refdbytes + availbytes) >> SPA_MAXBLOCKSHIFT;
 265         osfs->os_bfree = availbytes >> SPA_MAXBLOCKSHIFT;
 266         osfs->os_bavail = osfs->os_bfree; /* no extra root reservation */
 267
 268         /* Take replication (i.e. number of copies) into account */
 269         osfs->os_bavail /= uos->os->os_copies;
 270
 271         /*
 272          * Reserve some space so we don't run into ENOSPC due to grants not
 273          * accounting for metadata overhead in ZFS, and to avoid fragmentation.
 274          * Rather than report this via os_bavail (which makes users unhappy if
 275          * they can't fill the filesystem 100%), reduce os_blocks as well.
 276          *
 277          * Reserve 0.78% of total space, at least 4MB for small filesystems,
 278          * for internal files to be created/unlinked when space is tight.
 279          */
 280         CLASSERT(OSD_STATFS_RESERVED_BLKS > 0);
 281         if (likely(osfs->os_blocks >=
 282                         OSD_STATFS_RESERVED_BLKS << OSD_STATFS_RESERVED_SHIFT))
 283                 reserved = osfs->os_blocks >> OSD_STATFS_RESERVED_SHIFT;
 284         else
 285                 reserved = OSD_STATFS_RESERVED_BLKS;
 286
 287         osfs->os_blocks -= reserved;
 288         osfs->os_bfree  -= MIN(reserved, osfs->os_bfree);
 289         osfs->os_bavail -= MIN(reserved, osfs->os_bavail);
 290
 291         /*
 292          * The availobjs value returned from dmu_objset_space() is largely
 293          * useless, since it reports the number of objects that might
 294          * theoretically still fit into the dataset, independent of minor
 295          * issues like how much space is actually available in the pool.
 296          * Compute a better estimate in udmu_objs_count_estimate().
 297          */
 298         est_availobjs = udmu_objs_count_estimate(refdbytes, usedobjs,
 299                                                 osfs->os_bfree);
 300
 301         osfs->os_ffree = min(availobjs, est_availobjs);
 302         osfs->os_files = osfs->os_ffree + uos->objects;
 303
 304         /* ZFS XXX: fill in backing dataset FSID/UUID
 305            memcpy(osfs->os_fsid, .... );*/
 306
 307         /* We're a zfs filesystem. */
 308         osfs->os_type = UBERBLOCK_MAGIC;
 309
 310         /* ZFS XXX: fill in appropriate OS_STATE_{DEGRADED,READONLY} flags
 311            osfs->os_state = vf_to_stf(vfsp->vfs_flag);
 312            if (sb->s_flags & MS_RDONLY)
 313            osfs->os_state = OS_STATE_READONLY;
 314          */
 315
 316         osfs->os_namelen = MAXNAMELEN;
 317         osfs->os_maxbytes = OBD_OBJECT_EOF;
 318
 319         return 0;
 320 }
 321
 322 /**
 323  * Helper function to estimate the number of inodes in use for a give uid/gid
 324  * from the block usage
 325  */
 326 uint64_t udmu_objset_user_iused(udmu_objset_t *uos, uint64_t uidbytes)
 327 {
 328         uint64_t refdbytes, availbytes, usedobjs, availobjs;
 329         uint64_t uidobjs;
 330
 331         /* get fresh statfs info */
 332         dmu_objset_space(uos->os, &refdbytes, &availbytes, &usedobjs,
 333                         &availobjs);
 334
 335         /* estimate the number of objects based on the disk usage */
 336         uidobjs = udmu_objs_count_estimate(refdbytes, usedobjs,
 337                                         uidbytes >> SPA_MAXBLOCKSHIFT);
 338         if (uidbytes > 0)
 339                 /* if we have at least 1 byte, we have at least one dnode ... */
 340                 uidobjs = max_t(uint64_t, uidobjs, 1);
 341         return uidobjs;
 342 }
 343
 344 /* Get the objset name.
 345    buf must have at least MAXNAMELEN bytes */
 346 void udmu_objset_name_get(udmu_objset_t *uos, char *buf)
 347 {
 348         dmu_objset_name(uos->os, buf);
 349 }
 350
 351 static int udmu_userprop_setup(udmu_objset_t *uos, const char *prop_name,
 352                 char **os_name, char **real_prop)
 353 {
 354         if (os_name != NULL) {
 355                 *os_name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 356                 udmu_objset_name_get(uos, *os_name);
 357         }
 358
 359         *real_prop = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 360
 361         if (snprintf(*real_prop, MAXNAMELEN, "lustre:%s", prop_name) >=
 362                         MAXNAMELEN) {
 363                 if (os_name != NULL)
 364                         kmem_free(*os_name, MAXNAMELEN);
 365                 kmem_free(*real_prop, MAXNAMELEN);
 366
 367                 CERROR("property name too long: %s\n", prop_name);
 368                 return ENAMETOOLONG;
 369         }
 370
 371         return 0;
 372 }
 373
 374 static void udmu_userprop_cleanup(char **os_name, char **real_prop)
 375 {
 376         if (os_name != NULL)
 377                 kmem_free(*os_name, MAXNAMELEN);
 378         kmem_free(*real_prop, MAXNAMELEN);
 379 }
 380
 381 /* Set ZFS user property 'prop_name' of objset 'uos' to string 'val' */
 382 int udmu_userprop_set_str(udmu_objset_t *uos, const char *prop_name,
 383                 const char *val)
 384 {
 385         char *os_name;
 386         char *real_prop;
 387         int rc;
 388
 389         rc = udmu_userprop_setup(uos, prop_name, &os_name, &real_prop);
 390         if (rc != 0)
 391                 return rc;
 392
 393         rc = dsl_prop_set(os_name, real_prop, ZPROP_SRC_LOCAL, 1,
 394                         strlen(val) + 1, val);
 395         udmu_userprop_cleanup(&os_name, &real_prop);
 396
 397         return rc;
 398 }
 399
 400 /* Get ZFS user property 'prop_name' of objset 'uos' into buffer 'buf' of size
 401    'buf_size' */
 402 int udmu_userprop_get_str(udmu_objset_t *uos, const char *prop_name, char *buf,
 403                                 size_t buf_size)
 404 {
 405         char *real_prop;
 406         char *nvp_val;
 407         size_t nvp_len;
 408         nvlist_t *nvl = NULL;
 409         nvlist_t *nvl_val;
 410         nvpair_t *elem = NULL;
 411         int rc;
 412
 413         rc = udmu_userprop_setup(uos, prop_name, NULL, &real_prop);
 414         if (rc != 0)
 415                 return rc;
 416
 417         /* We can't just pass buf_size to dsl_prop_get() because it expects the
 418            exact value size (zap_lookup() requirement), so we must get all props
 419            and extract the one we want. */
 420         rc = dsl_prop_get_all(uos->os, &nvl);
 421         if (rc != 0) {
 422                 nvl = NULL;
 423                 goto out;
 424         }
 425
 426         while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
 427                 const char *name = nvpair_name(elem);
 428                 if (strcmp(name, real_prop) != 0)
 429                         continue;
 430
 431                 /* Got the property we were looking for, but the val is not the
 432                    string yet, it's an nvlist */
 433
 434                 rc = nvpair_value_nvlist(elem, &nvl_val);
 435                 if (rc != 0)
 436                         goto out;
 437
 438                 rc = nvlist_lookup_string(nvl_val, ZPROP_VALUE, &nvp_val);
 439                 if (rc != 0)
 440                         goto out;
 441
 442                 nvp_len = strlen(nvp_val);
 443                 if (buf_size < nvp_len + 1) {
 444                         rc = EOVERFLOW;
 445                         goto out;
 446                 }
 447                 strcpy(buf, nvp_val);
 448                 goto out;
 449         }
 450         /* Not found */
 451         rc = ENOENT;
 452 out:
 453         if (nvl != NULL)
 454                 nvlist_free(nvl);
 455         udmu_userprop_cleanup(NULL, &real_prop);
 456
 457         return rc;
 458 }
 459
 460 /* We don't actually have direct access to the zap_hashbits() function
 461  * so just pretend like we do for now.  If this ever breaks we can look at
 462  * it at that time. */
 463 #define zap_hashbits(zc) 48
 464 /*
 465  * ZFS hash format:
 466  * | cd (16 bits) | hash (48 bits) |
 467  * we need it in other form:
 468  * |0| hash (48 bit) | cd (15 bit) |
 469  * to be a full 64-bit ordered hash so that Lustre readdir can use it to merge
 470  * the readdir hashes from multiple directory stripes uniformly on the client.
 471  * Another point is sign bit, the hash range should be in [0, 2^63-1] because
 472  * loff_t (for llseek) needs to be a positive value.  This means the "cd" field
 473  * should only be the low 15 bits.
 474  */
 475 uint64_t udmu_zap_cursor_serialize(zap_cursor_t *zc)
 476 {
 477         uint64_t zfs_hash = zap_cursor_serialize(zc) & (~0ULL >> 1);
 478
 479         return (zfs_hash >> zap_hashbits(zc)) |
 480                 (zfs_hash << (63 - zap_hashbits(zc)));
 481 }
 482
 483 void udmu_zap_cursor_init_serialized(zap_cursor_t *zc, udmu_objset_t *uos,
 484                 uint64_t zapobj, uint64_t dirhash)
 485 {
 486         uint64_t zfs_hash = ((dirhash << zap_hashbits(zc)) & (~0ULL >> 1)) |
 487                 (dirhash >> (63 - zap_hashbits(zc)));
 488         zap_cursor_init_serialized(zc, uos->os, zapobj, zfs_hash);
 489 }
 490
 491 /*
 492  * Zap cursor APIs
 493  */
 494 int udmu_zap_cursor_init(zap_cursor_t **zc, udmu_objset_t *uos,
 495                 uint64_t zapobj, uint64_t dirhash)
 496 {
 497         zap_cursor_t *t;
 498
 499         t = kmem_alloc(sizeof(*t), KM_NOSLEEP);
 500         if (t) {
 501                 udmu_zap_cursor_init_serialized(t, uos, zapobj, dirhash);
 502                 *zc = t;
 503                 return 0;
 504         }
 505         return (ENOMEM);
 506 }
 507
 508 void udmu_zap_cursor_fini(zap_cursor_t *zc)
 509 {
 510         zap_cursor_fini(zc);
 511         kmem_free(zc, sizeof(*zc));
 512 }
 513
 514 /*
 515  * Get the object id from dmu_buf_t
 516  */
 517 int udmu_object_is_zap(dmu_buf_t *db)
 518 {
 519         dmu_buf_impl_t *dbi = (dmu_buf_impl_t *) db;
 520         dnode_t *dn;
 521         int rc;
 522
 523         DB_DNODE_ENTER(dbi);
 524
 525         dn = DB_DNODE(dbi);
 526         rc = (dn->dn_type == DMU_OT_DIRECTORY_CONTENTS ||
 527                         dn->dn_type == DMU_OT_USERGROUP_USED);
 528
 529         DB_DNODE_EXIT(dbi);
 530
 531         return rc;
 532 }
 533