/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
 *   Author: Peter Braam <braam@clusterfs.com>
 *   Author: Phil Schwan <phil@clusterfs.com>
 *
 *   This file is part of the Lustre file system, http://www.lustre.org
 *   Lustre is a trademark of Cluster File Systems, Inc.
 *
 *   You may have signed or agreed to another license before downloading
 *   this software.  If so, you are bound by the terms and conditions
 *   of that agreement, and the following does not apply to you.  See the
 *   LICENSE file included with this distribution for more information.
 *
 *   If you did not agree to a different license, then this copy of Lustre
 *   is open source software; you can redistribute it and/or modify it
 *   under the terms of version 2 of the GNU General Public License as
 *   published by the Free Software Foundation.
 *
 *   In either case, Lustre is distributed in the hope that it will be
 *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   license text for more details.
 */

#define DEBUG_SUBSYSTEM S_LDLM
#ifndef __KERNEL__
# include <liblustre.h>
#else
# include <libcfs/libcfs.h>
# include <libcfs/kp30.h>
#endif

#include <lustre_dlm.h>
#include <obd_support.h>
#include <obd.h>
#include <lustre_lib.h>

#include "ldlm_internal.h"

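/* Cap on how far ldlm_extent_internal_policy_fixup() lets a PW/CW lock grow
 * upwards past the requested end when many conflicting locks are present
 * (see the "conflicting > 32" check below): 32MB - 1. */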
#define LDLM_MAX_GROWN_EXTENT (32 * 1024 * 1024 - 1)

/* fixup the ldlm_extent after expanding */
static void ldlm_extent_internal_policy_fixup(struct ldlm_lock *req,
                                              struct ldlm_extent *new_ex,
                                              int conflicting)
{
        ldlm_mode_t req_mode = req->l_req_mode;
        __u64 req_start = req->l_req_extent.start;
        __u64 req_end = req->l_req_extent.end;
        __u64 req_align, mask;

        if (conflicting > 32 && (req_mode == LCK_PW || req_mode == LCK_CW)) {
                if (req_end < req_start + LDLM_MAX_GROWN_EXTENT)
                        new_ex->end = min(req_start + LDLM_MAX_GROWN_EXTENT,
                                          new_ex->end);
        }

        if (new_ex->start == 0 && new_ex->end == OBD_OBJECT_EOF) {
                EXIT;
                return;
        }

        /* we need to ensure that the lock extent is properly aligned to what
         * the client requested.  We align it to the lowest common denominator
         * of the client's requested lock start and end alignment. */
        mask = 0x1000ULL;
        req_align = (req_end + 1) | req_start;
        if (req_align != 0) {
                while ((req_align & mask) == 0)
                        mask <<= 1;
        }
        mask -= 1;
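        /* For example, a request for [0x10000, 0x1ffff] gives
         * req_align = 0x20000 | 0x10000 = 0x30000, so the loop stops at
         * mask = 0x10000 and the decrement above leaves mask = 0xffff: the
         * expanded extent is then trimmed below to 64KB boundaries, which
         * still contain the originally requested range. */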
        /* We can only shrink the lock, not grow it.
         * This should never cause lock to be smaller than requested,
         * since requested lock was already aligned on these boundaries. */
        new_ex->start = ((new_ex->start - 1) | mask) + 1;
        new_ex->end = ((new_ex->end + 1) & ~mask) - 1;
        LASSERTF(new_ex->start <= req_start,
                 "mask "LPX64" grant start "LPU64" req start "LPU64"\n",
                 mask, new_ex->start, req_start);
        LASSERTF(new_ex->end >= req_end,
                 "mask "LPX64" grant end "LPU64" req end "LPU64"\n",
                 mask, new_ex->end, req_end);
}

/* The purpose of this function is to return:
 * - the maximum extent
 * - containing the requested extent
 * - and not overlapping existing conflicting extents outside the requested one
 *
 * Use interval tree to expand the lock extent for granted lock.
 */
static void ldlm_extent_internal_policy_granted(struct ldlm_lock *req,
                                                struct ldlm_extent *new_ex)
{
        struct ldlm_resource *res = req->l_resource;
        ldlm_mode_t req_mode = req->l_req_mode;
        __u64 req_start = req->l_req_extent.start;
        __u64 req_end = req->l_req_extent.end;
        struct ldlm_interval_tree *tree;
        struct interval_node_extent limiter = { new_ex->start, new_ex->end };
        int conflicting = 0;
        int idx;
        ENTRY;

        lockmode_verify(req_mode);

        /* use the interval trees to handle the granted extent locks */
        for (idx = 0; idx < LCK_MODE_NUM; idx++) {
                struct interval_node_extent ext = { req_start, req_end };

                tree = &res->lr_itree[idx];
                if (lockmode_compat(tree->lit_mode, req_mode))
                        continue;

                conflicting += tree->lit_size;
                if (conflicting > 4)
                        limiter.start = req_start;

                if (interval_is_overlapped(tree->lit_root, &ext))
                        printk("req_mode = %d, tree->lit_mode = %d, tree->lit_size = %d\n",
                               req_mode, tree->lit_mode, tree->lit_size);
                interval_expand(tree->lit_root, &ext, &limiter);
                limiter.start = max(limiter.start, ext.start);
                limiter.end = min(limiter.end, ext.end);
                if (limiter.start == req_start && limiter.end == req_end)
                        break;
        }

        new_ex->start = limiter.start;
        new_ex->end = limiter.end;
        LASSERT(new_ex->start <= req_start);
        LASSERT(new_ex->end >= req_end);

        ldlm_extent_internal_policy_fixup(req, new_ex, conflicting);
        EXIT;
}

/* The purpose of this function is to return:
 * - the maximum extent
 * - containing the requested extent
 * - and not overlapping existing conflicting extents outside the requested one
 */
static void
ldlm_extent_internal_policy_waiting(struct ldlm_lock *req,
                                    struct ldlm_extent *new_ex)
{
        struct list_head *tmp;
        struct ldlm_resource *res = req->l_resource;
        ldlm_mode_t req_mode = req->l_req_mode;
        __u64 req_start = req->l_req_extent.start;
        __u64 req_end = req->l_req_extent.end;
        int conflicting = 0;
        ENTRY;

        lockmode_verify(req_mode);

        /* for waiting locks */
        list_for_each(tmp, &res->lr_waiting) {
                struct ldlm_lock *lock;
                struct ldlm_extent *l_extent;

                lock = list_entry(tmp, struct ldlm_lock, l_res_link);
                l_extent = &lock->l_policy_data.l_extent;

                /* We already hit the minimum requested size, search no more */
                if (new_ex->start == req_start && new_ex->end == req_end) {
                        EXIT;
                        return;
                }

                /* Don't conflict with ourselves */
                if (req == lock)
                        continue;

                /* Locks are compatible, overlap doesn't matter */
                /* Until bug 20 is fixed, try to avoid granting overlapping
                 * locks on one client (they take a long time to cancel) */
                if (lockmode_compat(lock->l_req_mode, req_mode) &&
                    lock->l_export != req->l_export)
                        continue;

                /* If this is a high-traffic lock, don't grow downwards at all
                 * or grow upwards too much */
                ++conflicting;
                if (conflicting > 4)
                        new_ex->start = req_start;

                /* If lock doesn't overlap new_ex, skip it. */
                if (!ldlm_extent_overlap(l_extent, new_ex))
                        continue;

                /* The locks conflict in their requested extents and we can't
                 * satisfy both, so ignore it.  Either we will ping-pong this
                 * extent (we would regardless of what extent we granted) or
                 * the lock is unused and shouldn't limit our extent growth. */
                if (ldlm_extent_overlap(&lock->l_req_extent,
                                        &req->l_req_extent))
                        continue;

                /* We grow extents downwards only as far as they don't overlap
                 * with already-granted locks, on the assumption that clients
                 * will be writing beyond the initial requested end and would
                 * then need to enqueue a new lock beyond the previous request.
                 * l_req_extent->end strictly < req_start, checked above. */
                if (l_extent->start < req_start && new_ex->start != req_start) {
                        if (l_extent->end >= req_start)
                                new_ex->start = req_start;
                        else
                                new_ex->start = min(l_extent->end+1, req_start);
                }

                /* If we need to cancel this lock anyway because our request
                 * overlaps the granted lock, we grow up to its requested
                 * extent start instead of limiting this extent, assuming that
                 * clients are writing forwards and the lock had overgrown
                 * its extent downwards before we enqueued our request. */
                if (l_extent->end > req_end) {
                        if (l_extent->start <= req_end)
                                new_ex->end = max(lock->l_req_extent.start - 1,
                                                  req_end);
                        else
                                new_ex->end = max(l_extent->start - 1, req_end);
                }
        }

        ldlm_extent_internal_policy_fixup(req, new_ex, conflicting);
        EXIT;
}


/* In order to determine the largest possible extent we can grant, we need
 * to scan all of the queues. */
static void ldlm_extent_policy(struct ldlm_resource *res,
                               struct ldlm_lock *lock, int *flags)
{
        struct ldlm_extent new_ex = { .start = 0, .end = OBD_OBJECT_EOF };

        if (lock->l_export == NULL)
                /*
                 * this is a local lock taken by the server (e.g., as a part
                 * of OST-side locking, or unlink handling). Expansion doesn't
                 * make a lot of sense for local locks, because they are
                 * dropped immediately on operation completion and would only
                 * conflict with other threads.
                 */
                return;

        if (lock->l_policy_data.l_extent.start == 0 &&
            lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF)
                /* fast-path whole file locks */
                return;

        ldlm_extent_internal_policy_granted(lock, &new_ex);
        ldlm_extent_internal_policy_waiting(lock, &new_ex);

        if (new_ex.start != lock->l_policy_data.l_extent.start ||
            new_ex.end != lock->l_policy_data.l_extent.end) {
                *flags |= LDLM_FL_LOCK_CHANGED;
                lock->l_policy_data.l_extent.start = new_ex.start;
                lock->l_policy_data.l_extent.end = new_ex.end;
        }
}

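/* Track whether this resource looks contended: if more than
 * ns_contended_locks conflicting locks were seen for the current request,
 * remember the time, and report the resource as contended for the following
 * ns_contention_time seconds. */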
static int ldlm_check_contention(struct ldlm_lock *lock, int contended_locks)
{
        struct ldlm_resource *res = lock->l_resource;
        cfs_time_t now = cfs_time_current();

        CDEBUG(D_DLMTRACE, "contended locks = %d\n", contended_locks);
        if (contended_locks > res->lr_namespace->ns_contended_locks)
                res->lr_contention_time = now;
        return cfs_time_before(now, cfs_time_add(res->lr_contention_time,
                cfs_time_seconds(res->lr_namespace->ns_contention_time)));
}

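/* Arguments passed to ldlm_extent_compat_cb() when walking a granted-lock
 * interval tree: the work list and the lock being enqueued, the mode of the
 * tree being scanned, plus pointers used to return the contended-lock count
 * and the compatibility result. */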
struct ldlm_extent_compat_args {
        struct list_head *work_list;
        struct ldlm_lock *lock;
        ldlm_mode_t mode;
        int *locks;
        int *compat;
};

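/* Interval tree callback: for every granted lock attached to the visited
 * interval, queue a blocking AST work item, count the lock as contended
 * (unless it is a whole-file LCK_PR glimpse lock) and mark the request as
 * incompatible. */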
static enum interval_iter ldlm_extent_compat_cb(struct interval_node *n,
                                                void *data)
{
        struct ldlm_extent_compat_args *priv = data;
        struct ldlm_interval *node = to_ldlm_interval(n);
        struct ldlm_extent *extent;
        struct list_head *work_list = priv->work_list;
        struct ldlm_lock *lock, *enq = priv->lock;
        ldlm_mode_t mode = priv->mode;
        int count = 0;
        ENTRY;

        LASSERT(!list_empty(&node->li_group));

        list_for_each_entry(lock, &node->li_group, l_sl_policy) {
                /* interval tree is for granted lock */
                LASSERTF(mode == lock->l_granted_mode,
                         "mode = %s, lock->l_granted_mode = %s\n",
                         ldlm_lockname[mode],
                         ldlm_lockname[lock->l_granted_mode]);
                count++;
                if (lock->l_blocking_ast)
                        ldlm_add_ast_work_item(lock, enq, work_list);
        }

        /* don't count conflicting glimpse locks */
        extent = ldlm_interval_extent(node);
        if (!(mode == LCK_PR &&
            extent->start == 0 && extent->end == OBD_OBJECT_EOF))
                *priv->locks += count;

        if (priv->compat)
                *priv->compat = 0;

        RETURN(INTERVAL_ITER_CONT);
}

/* Determine if the lock is compatible with all locks on the queue.
 * We stop walking the queue if we hit ourselves so we don't take
 * conflicting locks enqueued after us into account, or we'd wait forever.
 *
 * Returns:
 *   0 if the lock is not compatible
 *   1 if the lock is compatible
 *   2 if this group lock is compatible and requires no further checking
 *   negative error, such as EWOULDBLOCK for group locks
 */
static int
ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req,
                         int *flags, ldlm_error_t *err,
                         struct list_head *work_list, int *contended_locks)
{
        struct list_head *tmp;
        struct ldlm_lock *lock;
        struct ldlm_resource *res = req->l_resource;
        ldlm_mode_t req_mode = req->l_req_mode;
        __u64 req_start = req->l_req_extent.start;
        __u64 req_end = req->l_req_extent.end;
        int compat = 1;
        int scan = 0;
        int check_contention;
        ENTRY;

        lockmode_verify(req_mode);

        /* Using the interval tree for granted locks */
        if (queue == &res->lr_granted) {
                struct ldlm_interval_tree *tree;
                struct ldlm_extent_compat_args data = {.work_list = work_list,
                                               .lock = req,
                                               .locks = contended_locks,
                                               .compat = &compat };
                struct interval_node_extent ex = { .start = req_start,
                                                   .end = req_end };
                int idx, rc;

                for (idx = 0; idx < LCK_MODE_NUM; idx++) {
                        tree = &res->lr_itree[idx];
                        if (tree->lit_root == NULL) /* empty tree, skip it */
                                continue;

                        data.mode = tree->lit_mode;
                        if (lockmode_compat(req_mode, tree->lit_mode)) {
                                struct ldlm_interval *node;
                                struct ldlm_extent *extent;

                                if (req_mode != LCK_GROUP)
                                        continue;

                                /* group lock, grant it immediately if
                                 * compatible */
                                node = to_ldlm_interval(tree->lit_root);
                                extent = ldlm_interval_extent(node);
                                if (req->l_policy_data.l_extent.gid ==
                                    extent->gid)
                                        RETURN(2);
                        }

                        if (tree->lit_mode == LCK_GROUP) {
                                if (*flags & LDLM_FL_BLOCK_NOWAIT) {
                                        compat = -EWOULDBLOCK;
                                        goto destroylock;
                                }

                                *flags |= LDLM_FL_NO_TIMEOUT;
                                if (!work_list)
                                        RETURN(0);

                                /* if the work list is not NULL, add all
                                   locks in the tree to the work list */
                                compat = 0;
                                interval_iterate(tree->lit_root,
                                                 ldlm_extent_compat_cb, &data);
                                continue;
                        }

                        if (!work_list) {
                                rc = interval_is_overlapped(tree->lit_root,&ex);
                                if (rc)
                                        RETURN(0);
                        } else {
                                interval_search(tree->lit_root, &ex,
                                                ldlm_extent_compat_cb, &data);
                                if (!list_empty(work_list) && compat)
                                        compat = 0;
                        }
                }
        } else { /* for waiting queue */
                list_for_each(tmp, queue) {
                        check_contention = 1;

                        lock = list_entry(tmp, struct ldlm_lock, l_res_link);

                        if (req == lock)
                                break;

                        if (unlikely(scan)) {
                                /* We only get here if we are queuing a GROUP
                                   lock and met some incompatible one. The main
                                   idea of this code is to insert the GROUP
                                   lock past compatible GROUP locks in the
                                   waiting queue, or, if there are none, in
                                   front of the first non-GROUP lock */
                                if (lock->l_req_mode != LCK_GROUP) {
                                        /* Ok, we hit a non-GROUP lock; there
                                         * should be no more GROUP locks later
                                         * on, so queue in front of the first
                                         * non-GROUP lock */

                                        ldlm_resource_insert_lock_after(lock, req);
                                        list_del_init(&lock->l_res_link);
                                        ldlm_resource_insert_lock_after(req, lock);
                                        compat = 0;
                                        break;
                                }
                                if (req->l_policy_data.l_extent.gid ==
                                    lock->l_policy_data.l_extent.gid) {
                                        /* found it */
                                        ldlm_resource_insert_lock_after(lock, req);
                                        compat = 0;
                                        break;
                                }
                                continue;
                        }

                        /* locks are compatible, overlap doesn't matter */
                        if (lockmode_compat(lock->l_req_mode, req_mode)) {
                                if (req_mode == LCK_PR &&
                                    ((lock->l_policy_data.l_extent.start <=
                                      req->l_policy_data.l_extent.start) &&
                                     (lock->l_policy_data.l_extent.end >=
                                      req->l_policy_data.l_extent.end))) {
                                        /* If we met a PR lock just like us or
                                           wider, and nobody down the list
                                           conflicted with it, that means we
                                           can skip processing of the rest of
                                           the list and safely place ourselves
                                           at the end of the list, or grant
                                           (depending on whether we met
                                           conflicting locks earlier in the
                                           list).
                                           In the case of a 1st enqueue only we
                                           continue traversing if there is
                                           something conflicting down the list
                                           because we need to make sure that
                                           something is marked as AST_SENT as
                                           well; in the case of an empty
                                           worklist we would exit on the first
                                           conflict met. */
                                        /* There IS a case where such a flag is
                                           not set for a lock, yet it blocks
                                           something. Luckily for us this is
                                           only during destroy, so the lock is
                                           exclusive. So here we are safe */
                                        if (!(lock->l_flags & LDLM_FL_AST_SENT)) {
                                                RETURN(compat);
                                        }
                                }

                                /* non-group locks are compatible, overlap
                                   doesn't matter */
                                if (likely(req_mode != LCK_GROUP))
                                        continue;

                                /* If we are trying to get a GROUP lock and
                                   there is another one of this kind, we need
                                   to compare gids */
                                if (req->l_policy_data.l_extent.gid ==
                                    lock->l_policy_data.l_extent.gid) {
                                        /* If an existing lock with a matching
                                           gid is granted, we grant the new one
                                           too. */
                                        if (lock->l_req_mode == lock->l_granted_mode)
                                                RETURN(2);

                                        /* Otherwise we are scanning the queue
                                         * of waiting locks and it means the
                                         * current request would block along
                                         * with the existing lock (that is
                                         * already blocked).
                                         * If we are in nonblocking mode,
                                         * return immediately */
                                        if (*flags & LDLM_FL_BLOCK_NOWAIT) {
                                                compat = -EWOULDBLOCK;
                                                goto destroylock;
                                        }
                                        /* If this group lock is compatible with another
                                         * group lock on the waiting list, they must be
                                         * together in the list, so they can be granted
                                         * at the same time.  Otherwise the later lock
                                         * can get stuck behind another, incompatible,
                                         * lock. */
                                        ldlm_resource_insert_lock_after(lock, req);
                                        /* Because 'lock' is not granted, we can stop
                                         * processing this queue and return immediately.
                                         * There is no need to check the rest of the
                                         * list. */
                                        RETURN(0);
                                }
                        }

                        if (unlikely(req_mode == LCK_GROUP &&
                                     (lock->l_req_mode != lock->l_granted_mode))) {
                                scan = 1;
                                compat = 0;
                                if (lock->l_req_mode != LCK_GROUP) {
                                        /* Ok, we hit a non-GROUP lock; there
                                           should be no more GROUP locks later
                                           on, so queue in front of the first
                                           non-GROUP lock */

                                        ldlm_resource_insert_lock_after(lock, req);
                                        list_del_init(&lock->l_res_link);
                                        ldlm_resource_insert_lock_after(req, lock);
                                        break;
                                }
                                if (req->l_policy_data.l_extent.gid ==
                                    lock->l_policy_data.l_extent.gid) {
                                        /* found it */
                                        ldlm_resource_insert_lock_after(lock, req);
                                        break;
                                }
                                continue;
                        }

                        if (unlikely(lock->l_req_mode == LCK_GROUP)) {
                                /* If the compared lock is GROUP, then the
                                 * requested one is PR/PW, so this is not
                                 * compatible; the extent range does not
                                 * matter */
                                if (*flags & LDLM_FL_BLOCK_NOWAIT) {
                                        compat = -EWOULDBLOCK;
                                        goto destroylock;
                                } else {
                                        *flags |= LDLM_FL_NO_TIMEOUT;
                                }
                        } else if (lock->l_policy_data.l_extent.end < req_start ||
                                   lock->l_policy_data.l_extent.start > req_end) {
                                /* if a non-group lock doesn't overlap, skip it */
                                continue;
                        } else if (lock->l_req_extent.end < req_start ||
                                   lock->l_req_extent.start > req_end) {
                                /* false contention, the requests don't really
                                 * overlap */
                                check_contention = 0;
                        }

                        if (!work_list)
                                RETURN(0);

                        /* don't count conflicting glimpse locks */
                        if (lock->l_req_mode == LCK_PR &&
                            lock->l_policy_data.l_extent.start == 0 &&
                            lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF)
                                check_contention = 0;

                        *contended_locks += check_contention;

                        compat = 0;
                        if (lock->l_blocking_ast)
                                ldlm_add_ast_work_item(lock, req, work_list);
                }
        }

        if (ldlm_check_contention(req, *contended_locks) &&
            compat == 0 &&
            (*flags & LDLM_FL_DENY_ON_CONTENTION) &&
            req->l_req_mode != LCK_GROUP &&
            req_end - req_start <=
            req->l_resource->lr_namespace->ns_max_nolock_size)
                GOTO(destroylock, compat = -EUSERS);

        RETURN(compat);
destroylock:
        list_del_init(&req->l_res_link);
        ldlm_lock_destroy_nolock(req);
        *err = compat;
        RETURN(compat);
}

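/* Discard all blocking-AST work items queued on bl_list: clear each lock's
 * AST_SENT flag, release the reference on its blocking lock and drop the
 * list's own reference on the lock.  Used when the enqueue fails after
 * blocking ASTs were collected but before they were sent. */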
static void discard_bl_list(struct list_head *bl_list)
{
        struct list_head *tmp, *pos;
        ENTRY;

        list_for_each_safe(pos, tmp, bl_list) {
                struct ldlm_lock *lock =
                        list_entry(pos, struct ldlm_lock, l_bl_ast);

                list_del_init(&lock->l_bl_ast);
                LASSERT(lock->l_flags & LDLM_FL_AST_SENT);
                lock->l_flags &= ~LDLM_FL_AST_SENT;
                LASSERT(lock->l_bl_ast_run == 0);
                LASSERT(lock->l_blocking_lock);
                LDLM_LOCK_PUT(lock->l_blocking_lock);
                lock->l_blocking_lock = NULL;
                LDLM_LOCK_PUT(lock);
        }
        EXIT;
}

/* If first_enq is 0 (i.e., called from ldlm_reprocess_queue):
 *   - blocking ASTs have already been sent
 *   - must call this function with the ns lock held
 *
 * If first_enq is 1 (i.e., called from ldlm_lock_enqueue):
 *   - blocking ASTs have not been sent
 *   - must call this function with the ns lock held once */
int ldlm_process_extent_lock(struct ldlm_lock *lock, int *flags, int first_enq,
                             ldlm_error_t *err, struct list_head *work_list)
{
        struct ldlm_resource *res = lock->l_resource;
        CFS_LIST_HEAD(rpc_list);
        int rc, rc2;
        int contended_locks = 0;
        ENTRY;

        LASSERT(list_empty(&res->lr_converting));
        LASSERT(!(*flags & LDLM_FL_DENY_ON_CONTENTION) ||
                !(lock->l_flags & LDLM_AST_DISCARD_DATA));
        check_res_locked(res);
        *err = ELDLM_OK;

        if (!first_enq) {
                /* Careful observers will note that we don't handle -EWOULDBLOCK
                 * here, but it's ok for a non-obvious reason -- compat_queue
                 * can only return -EWOULDBLOCK if (flags & BLOCK_NOWAIT).
                 * flags should always be zero here, and if that ever stops
                 * being true, we want to find out. */
                LASSERT(*flags == 0);
                rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags,
                                              err, NULL, &contended_locks);
                if (rc == 1) {
                        rc = ldlm_extent_compat_queue(&res->lr_waiting, lock,
                                                      flags, err, NULL,
                                                      &contended_locks);
                }
                if (rc == 0)
                        RETURN(LDLM_ITER_STOP);

                ldlm_resource_unlink_lock(lock);

                if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_EVICT_RACE))
                        ldlm_extent_policy(res, lock, flags);
                ldlm_grant_lock(lock, work_list);
                RETURN(LDLM_ITER_CONTINUE);
        }

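        /* First enqueue: check the request against both the granted and the
         * waiting queues, collecting blocking-AST work items for conflicting
         * locks into rpc_list.  If both queues report the lock compatible
         * (rc + rc2 == 2) it is granted right away; otherwise the lock goes
         * on the waiting list and the collected blocking ASTs are sent. */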
 restart:
        contended_locks = 0;
        rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, err,
                                      &rpc_list, &contended_locks);
        if (rc < 0)
                GOTO(out, rc); /* lock was destroyed */
        if (rc == 2)
                goto grant;

        rc2 = ldlm_extent_compat_queue(&res->lr_waiting, lock, flags, err,
                                       &rpc_list, &contended_locks);
        if (rc2 < 0)
                GOTO(out, rc = rc2); /* lock was destroyed */

        if (rc + rc2 == 2) {
        grant:
                ldlm_extent_policy(res, lock, flags);
                ldlm_resource_unlink_lock(lock);
                ldlm_grant_lock(lock, NULL);
        } else {
                /* If either of the compat_queue()s returned failure, then we
                 * have ASTs to send and must go onto the waiting list.
                 *
                 * bug 2322: we used to unlink and re-add here, which was a
                 * terrible folly -- if we goto restart, we could get
                 * re-ordered!  Causes deadlock, because ASTs aren't sent! */
                if (list_empty(&lock->l_res_link))
                        ldlm_resource_add_lock(res, &res->lr_waiting, lock);
                unlock_res(res);
                rc = ldlm_run_ast_work(&rpc_list, LDLM_WORK_BL_AST);
                lock_res(res);

                if (rc == -ERESTART) {
                        /* lock was granted while resource was unlocked. */
                        if (lock->l_granted_mode == lock->l_req_mode) {
                                /* bug 11300: if the lock has been granted,
                                 * break earlier because otherwise, we will go
                                 * to restart and ldlm_resource_unlink will be
                                 * called and it causes the interval node to be
                                 * freed. Then we will fail at
                                 * ldlm_extent_add_lock() */
                                *flags &= ~(LDLM_FL_BLOCK_GRANTED | LDLM_FL_BLOCK_CONV |
                                            LDLM_FL_BLOCK_WAIT);
                                GOTO(out, rc = 0);
                        }

                        GOTO(restart, -ERESTART);
                }

                *flags |= LDLM_FL_BLOCK_GRANTED;
                /* this way we force client to wait for the lock
                 * endlessly once the lock is enqueued -bzzz */
                *flags |= LDLM_FL_NO_TIMEOUT;

        }
        RETURN(0);
out:
        if (!list_empty(&rpc_list)) {
                LASSERT(!(lock->l_flags & LDLM_AST_DISCARD_DATA));
                discard_bl_list(&rpc_list);
        }
        RETURN(rc);
}

/* When a lock is cancelled by a client, the KMS (known minimum size) may
 * undergo change if this is the "highest lock".  This function returns the
 * new KMS value.
 * Caller must hold ns_lock already.
 *
 * NB: A lock on [x,y] protects a KMS of up to y + 1 bytes! */
__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms)
{
        struct ldlm_resource *res = lock->l_resource;
        struct list_head *tmp;
        struct ldlm_lock *lck;
        __u64 kms = 0;
        ENTRY;

        /* don't let another thread in ldlm_extent_shift_kms race in
         * just after we finish and take our lock into account in its
         * calculation of the kms */
        lock->l_flags |= LDLM_FL_KMS_IGNORE;

        list_for_each(tmp, &res->lr_granted) {
                lck = list_entry(tmp, struct ldlm_lock, l_res_link);

                if (lck->l_flags & LDLM_FL_KMS_IGNORE)
                        continue;

                if (lck->l_policy_data.l_extent.end >= old_kms)
                        RETURN(old_kms);

                /* This extent _has_ to be smaller than old_kms (checked above)
                 * so kms can only ever be smaller or the same as old_kms. */
                if (lck->l_policy_data.l_extent.end + 1 > kms)
                        kms = lck->l_policy_data.l_extent.end + 1;
        }
        LASSERTF(kms <= old_kms, "kms "LPU64" old_kms "LPU64"\n", kms, old_kms);

        RETURN(kms);
}

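/* Slab cache for the ldlm_interval nodes that link extent locks into the
 * per-mode interval trees; one node may be shared by several locks covering
 * the same extent. */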
cfs_mem_cache_t *ldlm_interval_slab;
struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock)
{
        struct ldlm_interval *node;
        ENTRY;

        LASSERT(lock->l_resource->lr_type == LDLM_EXTENT);
        OBD_SLAB_ALLOC(node, ldlm_interval_slab, CFS_ALLOC_IO, sizeof(*node));
        if (node == NULL)
                RETURN(NULL);

        CFS_INIT_LIST_HEAD(&node->li_group);
        ldlm_interval_attach(node, lock);
        RETURN(node);
}

void ldlm_interval_free(struct ldlm_interval *node)
{
        if (node) {
                LASSERT(list_empty(&node->li_group));
                OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
        }
}


/* interval tree, for LDLM_EXTENT. */
void ldlm_interval_attach(struct ldlm_interval *n,
                          struct ldlm_lock *l)
{
        LASSERT(l->l_tree_node == NULL);
        LASSERT(l->l_resource->lr_type == LDLM_EXTENT);

        list_add_tail(&l->l_sl_policy, &n->li_group);
        l->l_tree_node = n;
}

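/* Detach a lock from its interval node.  Returns the node if the lock was
 * its last user (so the caller can erase it from the tree and free it), or
 * NULL if other locks still share the node. */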
struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l)
{
        struct ldlm_interval *n = l->l_tree_node;

        if (n == NULL)
                return NULL;

        LASSERT(!list_empty(&n->li_group));
        l->l_tree_node = NULL;
        list_del_init(&l->l_sl_policy);

        return (list_empty(&n->li_group) ? n : NULL);
}

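/* Convert a lock mode bit (LCK_EX, LCK_PW, ...) into its index in the
 * lr_itree[] array, i.e. the position of the single set bit; for example
 * LCK_EX (0x1) maps to 0 and LCK_PW (0x2) maps to 1. */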
static inline int lock_mode_to_index(ldlm_mode_t mode)
{
        int index;

        LASSERT(mode != 0);
        LASSERT(IS_PO2(mode));
        for (index = -1; mode; index++, mode >>= 1) ;
        LASSERT(index < LCK_MODE_NUM);
        return index;
}

void ldlm_extent_add_lock(struct ldlm_resource *res,
                          struct ldlm_lock *lock)
{
        struct interval_node *found, **root;
        struct ldlm_interval *node;
        struct ldlm_extent *extent;
        int idx;

        LASSERT(lock->l_granted_mode == lock->l_req_mode);

        node = lock->l_tree_node;
        LASSERT(node != NULL);

        idx = lock_mode_to_index(lock->l_granted_mode);
        LASSERT(lock->l_granted_mode == 1 << idx);
        LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode);

        /* initialize the node extent */
        extent = &lock->l_policy_data.l_extent;
        interval_set(&node->li_node, extent->start, extent->end);

        root = &res->lr_itree[idx].lit_root;
        found = interval_insert(&node->li_node, root);
        if (found) { /* The policy group found. */
                struct ldlm_interval *tmp = ldlm_interval_detach(lock);
                LASSERT(tmp != NULL);
                ldlm_interval_free(tmp);
                ldlm_interval_attach(to_ldlm_interval(found), lock);
        }
        res->lr_itree[idx].lit_size++;

        /* even though we use an interval tree to manage the extent locks, we
         * also add the locks to the granted list, e.g. for debugging purposes */
        ldlm_resource_add_lock(res, &res->lr_granted, lock);
}

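/* Remove a granted extent lock from its per-mode interval tree, freeing the
 * interval node if this lock was its last user. */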
void ldlm_extent_unlink_lock(struct ldlm_lock *lock)
{
        struct ldlm_resource *res = lock->l_resource;
        struct ldlm_interval *node;
        struct ldlm_interval_tree *tree;
        int idx;

        if (lock->l_granted_mode != lock->l_req_mode)
                return;

        LASSERT(lock->l_tree_node != NULL);
        idx = lock_mode_to_index(lock->l_granted_mode);
        LASSERT(lock->l_granted_mode == 1 << idx);
        tree = &res->lr_itree[idx];

        LASSERT(tree->lit_root != NULL); /* assure the tree is not null */

        tree->lit_size--;
        node = ldlm_interval_detach(lock);
        if (node) {
                interval_erase(&node->li_node, &tree->lit_root);
                ldlm_interval_free(node);
        }
}