lustre/ofd/ofd_dlm.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2012, 2015, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/ofd/ofd_dlm.c
  33  *
  34  * This file contains OBD Filter Device (OFD) LDLM-related code which is just
  35  * intent handling for glimpse lock.
  36  *
  37  * Author: Andreas Dilger <andreas.dilger@intel.com>
  38  * Author: Jinshan Xiong <jinshan.xiong@intel.com>
  39  * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com>
  40  * Author: Mikhail Pershin <mike.pershin@intel.com>
  41  */
  42
  43 #define DEBUG_SUBSYSTEM S_FILTER
  44
  45 #include "ofd_internal.h"
  46
  47 struct ofd_intent_args {
  48         struct list_head        gl_list;
  49         __u64                    size;
  50         bool                    no_glimpse_ast;
  51         int                     error;
  52 };
  53
  54 int ofd_dlm_init(void)
  55 {
  56         ldlm_glimpse_work_kmem = kmem_cache_create("ldlm_glimpse_work_kmem",
  57                                              sizeof(struct ldlm_glimpse_work),
  58                                              0, 0, NULL);
  59         if (ldlm_glimpse_work_kmem == NULL)
  60                 return -ENOMEM;
  61         else
  62                 return 0;
  63 }
  64
  65 void ofd_dlm_exit(void)
  66 {
  67         if (ldlm_glimpse_work_kmem) {
  68                 kmem_cache_destroy(ldlm_glimpse_work_kmem);
  69                 ldlm_glimpse_work_kmem = NULL;
  70         }
  71 }
  72
  73 /**
  74  * OFD interval callback.
  75  *
  76  * The interval_callback_t is part of interval_iterate_reverse() and is called
  77  * for each interval in tree. The OFD interval callback searches for locks
  78  * covering extents beyond the given args->size. This is used to decide if the
  79  * size is too small and needs to be updated.  Note that we are only interested
  80  * in growing the size, as truncate is the only operation which can shrink it,
  81  * and it is handled differently.  This is why we only look at locks beyond the
  82  * current size.
  83  *
  84  * It finds the highest lock (by starting point) in this interval, and adds it
  85  * to the list of locks to glimpse.  We must glimpse a list of locks - rather
  86  * than only the highest lock on the file - because lockahead creates extent
  87  * locks in advance of IO, and so breaks the assumption that the holder of the
  88  * highest lock knows the current file size.
  89  *
  90  * This assumption is normally true because locks which are created as part of
  91  * IO - rather than in advance of it - are guaranteed to be 'active', i.e.,
  92  * involved in IO, and the holder of the highest 'active' lock always knows the
  93  * current file size, because the size is either not changing or the holder of
  94  * that lock is responsible for updating it.
  95  *
  96  * So we need only glimpse until we find the first client with an 'active'
  97  * lock.
  98  *
  99  * Unfortunately, there is no way to know if a manually requested/speculative
 100  * lock is 'active' from the server side.  So when we see a potentially
 101  * speculative lock, we must send a glimpse for that lock unless we have
 102  * already sent a glimpse to the holder of that lock.
 103  *
 104  * However, *all* non-speculative locks are active.  So we can stop glimpsing
 105  * as soon as we find a non-speculative lock.  Currently, all speculative PW
 106  * locks have LDLM_FL_NO_EXPANSION set, and we use this to identify them.  This
 107  * is enforced by an assertion in osc_lock_init, which references this comment.
 108  *
 109  * If that ever changes, we will either need to find a new way to identify
 110  * active locks or we will need to consider all PW locks (we will still only
 111  * glimpse one per client).
 112  *
 113  * Note that it is safe to glimpse only the 'top' lock from each interval
 114  * because ofd_intent_cb is only called for PW extent locks, and for PW locks,
 115  * there is only one lock per interval.
 116  *
 117  * \param[in] n         interval node
 118  * \param[in,out] args  intent arguments, gl work list for identified locks
 119  *
 120  * \retval              INTERVAL_ITER_STOP if the interval is lower than
 121  *                      file size, caller stops execution
 122  * \retval              INTERVAL_ITER_CONT if callback finished successfully
 123  *                      and caller may continue execution
 124  */
 125 static enum interval_iter ofd_intent_cb(struct interval_node *n, void *args)
 126 {
 127         struct ldlm_interval     *node = (struct ldlm_interval *)n;
 128         struct ofd_intent_args   *arg = args;
 129         __u64                     size = arg->size;
 130         struct ldlm_lock         *victim_lock = NULL;
 131         struct ldlm_lock         *lck;
 132         struct ldlm_glimpse_work *gl_work = NULL;
 133         int rc = 0;
 134
 135         /* If the interval is lower than the current file size, just break. */
 136         if (interval_high(n) <= size)
 137                 GOTO(out, rc = INTERVAL_ITER_STOP);
 138
 139         /* Find the 'victim' lock from this interval */
 140         list_for_each_entry(lck, &node->li_group, l_sl_policy) {
 141
 142                 victim_lock = LDLM_LOCK_GET(lck);
 143
 144                 /* the same policy group - every lock has the
 145                  * same extent, so needn't do it any more */
 146                 break;
 147         }
 148
 149         /* l_export can be null in race with eviction - In that case, we will
 150          * not find any locks in this interval */
 151         if (!victim_lock)
 152                 GOTO(out, rc = INTERVAL_ITER_CONT);
 153
 154         /*
 155          * This check is for lock taken in ofd_destroy_by_fid() that does
 156          * not have l_glimpse_ast set. So the logic is: if there is a lock
 157          * with no l_glimpse_ast set, this object is being destroyed already.
 158          * Hence, if you are grabbing DLM locks on the server, always set
 159          * non-NULL glimpse_ast (e.g., ldlm_request.c::ldlm_glimpse_ast()).
 160          */
 161         if (victim_lock->l_glimpse_ast == NULL) {
 162                 LDLM_DEBUG(victim_lock, "no l_glimpse_ast");
 163                 arg->no_glimpse_ast = true;
 164                 GOTO(out_release, rc = INTERVAL_ITER_STOP);
 165         }
 166
 167         /* If NO_EXPANSION is not set, this is an active lock, and we don't need
 168          * to glimpse any further once we've glimpsed the client holding this
 169          * lock.  So set us up to stop.  See comment above this function. */
 170         if (!(victim_lock->l_flags & LDLM_FL_NO_EXPANSION))
 171                 rc = INTERVAL_ITER_STOP;
 172         else
 173                 rc = INTERVAL_ITER_CONT;
 174
 175         /* Check to see if we're already set up to send a glimpse to this
 176          * client; if so, don't add this lock to the glimpse list - We need
 177          * only glimpse each client once. (And if we know that client holds
 178          * an active lock, we can stop glimpsing.  So keep the rc set in the
 179          * check above.) */
 180         list_for_each_entry(gl_work, &arg->gl_list, gl_list) {
 181                 if (gl_work->gl_lock->l_export == victim_lock->l_export)
 182                         GOTO(out_release, rc);
 183         }
 184
 185         if (!OBD_FAIL_CHECK(OBD_FAIL_OST_GL_WORK_ALLOC))
 186                 OBD_SLAB_ALLOC_PTR_GFP(gl_work, ldlm_glimpse_work_kmem,
 187                                        GFP_ATOMIC);
 188
 189         if (!gl_work) {
 190                 arg->error = -ENOMEM;
 191                 GOTO(out_release, rc = INTERVAL_ITER_STOP);
 192         }
 193
 194         /* Populate the gl_work structure. */
 195         gl_work->gl_lock = victim_lock;
 196         list_add_tail(&gl_work->gl_list, &arg->gl_list);
 197         /* There is actually no need for a glimpse descriptor when glimpsing
 198          * extent locks */
 199         gl_work->gl_desc = NULL;
 200         /* This tells ldlm_work_gl_ast_lock this was allocated from a slab and
 201          * must be freed in a slab-aware manner. */
 202         gl_work->gl_flags = LDLM_GL_WORK_SLAB_ALLOCATED;
 203
 204         GOTO(out, rc);
 205
 206 out_release:
 207         /* If the victim doesn't go on the glimpse list, we must release it */
 208         LDLM_LOCK_RELEASE(victim_lock);
 209
 210 out:
 211         return rc;
 212 }
 213 /**
 214  * OFD lock intent policy
 215  *
 216  * This defines ldlm_namespace::ns_policy interface for OFD.
 217  * Intent policy is called when lock has an intent, for OFD that
 218  * means glimpse lock and policy fills Lock Value Block (LVB).
 219  *
 220  * If already granted lock is found it will be placed in \a lockp and
 221  * returned back to caller function.
 222  *
 223  * \param[in] ns         namespace
 224  * \param[in,out] lockp  pointer to the lock
 225  * \param[in] req_cookie incoming request
 226  * \param[in] mode       LDLM mode
 227  * \param[in] flags      LDLM flags
 228  * \param[in] data       opaque data, not used in OFD policy
 229  *
 230  * \retval              ELDLM_LOCK_REPLACED if already granted lock was found
 231  *                      and placed in \a lockp
 232  * \retval              ELDLM_LOCK_ABORTED in other cases except error
 233  * \retval              negative errno on error
 234  */
 235 int ofd_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock **lockp,
 236                       void *req_cookie, enum ldlm_mode mode, __u64 flags,
 237                       void *data)
 238 {
 239         struct ptlrpc_request *req = req_cookie;
 240         struct ldlm_lock *lock = *lockp;
 241         struct ldlm_resource *res = lock->l_resource;
 242         ldlm_processing_policy policy;
 243         struct ost_lvb *res_lvb, *reply_lvb;
 244         struct ldlm_reply *rep;
 245         enum ldlm_error err;
 246         int idx, rc;
 247         struct ldlm_interval_tree *tree;
 248         struct ofd_intent_args arg;
 249         __u32 repsize[3] = {
 250                 [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
 251                 [DLM_LOCKREPLY_OFF]   = sizeof(*rep),
 252                 [DLM_REPLY_REC_OFF]   = sizeof(*reply_lvb)
 253         };
 254         struct ldlm_glimpse_work *pos, *tmp;
 255         ENTRY;
 256
 257         INIT_LIST_HEAD(&arg.gl_list);
 258         arg.no_glimpse_ast = false;
 259         arg.error = 0;
 260         lock->l_lvb_type = LVB_T_OST;
 261         policy = ldlm_get_processing_policy(res);
 262         LASSERT(policy != NULL);
 263         LASSERT(req != NULL);
 264
 265         rc = lustre_pack_reply(req, 3, repsize, NULL);
 266         if (rc)
 267                 RETURN(req->rq_status = rc);
 268
 269         rep = lustre_msg_buf(req->rq_repmsg, DLM_LOCKREPLY_OFF, sizeof(*rep));
 270         LASSERT(rep != NULL);
 271
 272         reply_lvb = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF,
 273                                    sizeof(*reply_lvb));
 274         LASSERT(reply_lvb != NULL);
 275
 276         /* Call the extent policy function to see if our request can be
 277          * granted, or is blocked.
 278          * If the OST lock has LDLM_FL_HAS_INTENT set, it means a glimpse
 279          * lock, and should not be granted if the lock will be blocked.
 280          */
 281
 282         if (flags & LDLM_FL_BLOCK_NOWAIT) {
 283                 OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_AGL_DELAY, 5);
 284
 285                 if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_AGL_NOLOCK))
 286                         RETURN(ELDLM_LOCK_ABORTED);
 287         }
 288
 289         LASSERT(ns == ldlm_res_to_ns(res));
 290         lock_res(res);
 291
 292         /* Check if this is a resend case (MSG_RESENT is set on RPC) and a
 293          * lock was found by ldlm_handle_enqueue(); if so no need to grant
 294          * it again. */
 295         if (flags & LDLM_FL_RESENT) {
 296                 rc = LDLM_ITER_CONTINUE;
 297         } else {
 298                 __u64 tmpflags = 0;
 299                 rc = policy(lock, &tmpflags, LDLM_PROCESS_RESCAN, &err, NULL);
 300                 check_res_locked(res);
 301         }
 302
 303         /* The lock met with no resistance; we're finished. */
 304         if (rc == LDLM_ITER_CONTINUE) {
 305                 if (OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_GLIMPSE, 2)) {
 306                         ldlm_resource_unlink_lock(lock);
 307                         err = ELDLM_LOCK_ABORTED;
 308                 } else {
 309                         err = ELDLM_LOCK_REPLACED;
 310                 }
 311                 unlock_res(res);
 312                 RETURN(err);
 313         } else if (flags & LDLM_FL_BLOCK_NOWAIT) {
 314                 /* LDLM_FL_BLOCK_NOWAIT means it is for AGL. Do not send glimpse
 315                  * callback for glimpse size. The real size user will trigger
 316                  * the glimpse callback when necessary. */
 317                 unlock_res(res);
 318                 RETURN(ELDLM_LOCK_ABORTED);
 319         }
 320
 321         /* Do not grant any lock, but instead send GL callbacks.  The extent
 322          * policy nicely created a list of all PW locks for us.  We will choose
 323          * the highest of those which are larger than the size in the LVB, if
 324          * any, and perform a glimpse callback. */
 325         res_lvb = res->lr_lvb_data;
 326         LASSERT(res_lvb != NULL);
 327         *reply_lvb = *res_lvb;
 328
 329         /*
 330          * ->ns_lock guarantees that no new locks are granted, and,
 331          *  therefore, that res->lr_lvb_data cannot increase beyond the
 332          *  end of already granted lock. As a result, it is safe to
 333          *  check against "stale" reply_lvb->lvb_size value without
 334          *  res->lr_lvb_sem.
 335          */
 336         arg.size = reply_lvb->lvb_size;
 337
 338         /* Check for PW locks beyond the size in the LVB, build the list
 339          * of locks to glimpse (arg.gl_list) */
 340         for (idx = 0; idx < LCK_MODE_NUM; idx++) {
 341                 tree = &res->lr_itree[idx];
 342                 if (tree->lit_mode == LCK_PR)
 343                         continue;
 344
 345                 interval_iterate_reverse(tree->lit_root, ofd_intent_cb, &arg);
 346                 if (arg.error) {
 347                         unlock_res(res);
 348                         GOTO(out, rc = arg.error);
 349                 }
 350         }
 351         unlock_res(res);
 352
 353         /* There were no PW locks beyond the size in the LVB; finished. */
 354         if (list_empty(&arg.gl_list))
 355                 RETURN(ELDLM_LOCK_ABORTED);
 356
 357         if (arg.no_glimpse_ast) {
 358                 /* We are racing with unlink(); just return -ENOENT */
 359                 rep->lock_policy_res1 = ptlrpc_status_hton(-ENOENT);
 360                 GOTO(out, ELDLM_LOCK_ABORTED);
 361         }
 362
 363         /* this will update the LVB */
 364         ldlm_glimpse_locks(res, &arg.gl_list);
 365
 366         lock_res(res);
 367         *reply_lvb = *res_lvb;
 368         unlock_res(res);
 369
 370 out:
 371         /* If the list is not empty, we failed to glimpse some locks and
 372          * must clean up.  Usually due to a race with unlink.*/
 373         list_for_each_entry_safe(pos, tmp, &arg.gl_list, gl_list) {
 374                 list_del(&pos->gl_list);
 375                 LDLM_LOCK_RELEASE(pos->gl_lock);
 376                 OBD_SLAB_FREE_PTR(pos, ldlm_glimpse_work_kmem);
 377         }
 378
 379         RETURN(rc < 0 ? rc : ELDLM_LOCK_ABORTED);
 380 }
 381