lustre/ptlrpc/sec_bulk.c

   1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   2  * vim:expandtab:shiftwidth=8:tabstop=8:
   3  *
   4  * GPL HEADER START
   5  *
   6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License version 2 only,
  10  * as published by the Free Software Foundation.
  11  *
  12  * This program is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * General Public License version 2 for more details (a copy is included
  16  * in the LICENSE file that accompanied this code).
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * version 2 along with this program; If not, see
  20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  21  *
  22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  23  * CA 95054 USA or visit www.sun.com if you need additional information or
  24  * have any questions.
  25  *
  26  * GPL HEADER END
  27  */
  28 /*
  29  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  30  * Use is subject to license terms.
  31  *
  32  * Copyright (c) 2011, Whamcloud, Inc.
  33  */
  34 /*
  35  * This file is part of Lustre, http://www.lustre.org/
  36  * Lustre is a trademark of Sun Microsystems, Inc.
  37  *
  38  * lustre/ptlrpc/sec_bulk.c
  39  *
  40  * Author: Eric Mei <ericm@clusterfs.com>
  41  */
  42
  43 #ifndef EXPORT_SYMTAB
  44 #define EXPORT_SYMTAB
  45 #endif
  46 #define DEBUG_SUBSYSTEM S_SEC
  47
  48 #include <libcfs/libcfs.h>
  49 #ifndef __KERNEL__
  50 #include <liblustre.h>
  51 #include <libcfs/list.h>
  52 #else
  53 #include <linux/crypto.h>
  54 #endif
  55
  56 #include <obd.h>
  57 #include <obd_cksum.h>
  58 #include <obd_class.h>
  59 #include <obd_support.h>
  60 #include <lustre_net.h>
  61 #include <lustre_import.h>
  62 #include <lustre_dlm.h>
  63 #include <lustre_sec.h>
  64
  65 #include "ptlrpc_internal.h"
  66
  67 /****************************************
  68  * bulk encryption page pools           *
  69  ****************************************/
  70
  71 #ifdef __KERNEL__
  72
     /* number of page pointers that fit into one page */
  73 #define PTRS_PER_PAGE   (CFS_PAGE_SIZE / sizeof(void *))
     /* pages tracked by one pool: each pool is an array of page pointers
      * allocated with CFS_PAGE_SIZE bytes */
  74 #define PAGES_PER_POOL  (PTRS_PER_PAGE)
  75
     /* upper bound of epp_idle_idx */
  76 #define IDLE_IDX_MAX            (100)
     /* weight given to the previous idle index when a new sample is
      * averaged in */
  77 #define IDLE_IDX_WEIGHT         (3)
  78
     /* seconds without pool access after which the shrinker sets the idle
      * index to IDLE_IDX_MAX */
  79 #define CACHE_QUIESCENT_PERIOD  (20)
  80
     /*
      * Global state of the bulk encryption page pools (single instance,
      * page_pools). Free pages are kept in epp_pools[pool][slot]; the pool
      * and slot indices are derived from the page counters with
      * PAGES_PER_POOL.
      */
  81 static struct ptlrpc_enc_page_pool {
  82         /*
  83          * constants, set once in sptlrpc_enc_pool_init()
  84          */
  85         unsigned long    epp_max_pages;   /* maximum pages the pools can hold */
  86         unsigned int     epp_max_pools;   /* number of slots in epp_pools */
  87
  88         /*
  89          * wait queue used when there are not enough free pages.
  90          */
  91         cfs_waitq_t      epp_waitq;       /* waiting threads */
  92         unsigned int     epp_waitqlen;    /* wait queue length */
  93         unsigned long    epp_pages_short; /* # of pages wanted by in-queue users */
  94         unsigned int     epp_growing:1;   /* set while pages are being added */
  95
  96         /*
  97          * indicates how idle the pools are, from 0 to IDLE_IDX_MAX.
  98          * it is recomputed each time pages are taken from the pools,
  99          * not based on time. this means that if the system has been
 100          * idle for a while the idle_idx might still be low, because no
 101          * activity happened in the pools.
 102          */
 103         unsigned long    epp_idle_idx;
 104
 105         /* time (seconds) of the last shrink and of the last page request */
 106         long             epp_last_shrink;
 107         long             epp_last_access;
 108
 109         /*
 110          * in-pool pages bookkeeping
 111          */
 112         cfs_spinlock_t   epp_lock;        /* held while pool state is changed */
 113         unsigned long    epp_total_pages; /* total pages in pools */
 114         unsigned long    epp_free_pages;  /* current pages available */
 115
 116         /*
 117          * statistics
 118          */
 119         unsigned long    epp_st_max_pages;      /* highest # of pages ever reached */
 120         unsigned int     epp_st_grows;          /* # of grows */
 121         unsigned int     epp_st_grow_fails;     /* # of add pages failures */
 122         unsigned int     epp_st_shrinks;        /* # of shrinks */
 123         unsigned long    epp_st_access;         /* # of accesses */
 124         unsigned long    epp_st_missings;       /* # of cache misses */
 125         unsigned long    epp_st_lowfree;        /* lowest free pages reached */
 126         unsigned int     epp_st_max_wqlen;      /* highest waitqueue length */
 127         cfs_time_t       epp_st_max_wait;       /* longest wait, in jiffies */
 128         /*
 129          * pointers to pools
 130          */
 131         cfs_page_t    ***epp_pools;
 132 } page_pools;
 133
 134 /*
 135  * memory shrinker
      *
      * pools_shrinker_seeks is the seeks value handed to cfs_set_shrinker()
      * by sptlrpc_enc_pool_init(); pools_shrinker keeps the handle returned
      * there until sptlrpc_enc_pool_fini() removes it.
 136  */
 137 const int pools_shrinker_seeks = CFS_DEFAULT_SEEKS;
 138 static struct cfs_shrinker *pools_shrinker = NULL;
 139
 140
 141 /*
 142  * /proc/fs/lustre/sptlrpc/encrypt_page_pools
 143  */
 144 int sptlrpc_proc_read_enc_pool(char *page, char **start, off_t off, int count,
 145                                int *eof, void *data)
 146 {
 147         int     rc;
 148
 149         cfs_spin_lock(&page_pools.epp_lock);
 150
 151         rc = snprintf(page, count,
 152                       "physical pages:          %lu\n"
 153                       "pages per pool:          %lu\n"
 154                       "max pages:               %lu\n"
 155                       "max pools:               %u\n"
 156                       "total pages:             %lu\n"
 157                       "total free:              %lu\n"
 158                       "idle index:              %lu/100\n"
 159                       "last shrink:             %lds\n"
 160                       "last access:             %lds\n"
 161                       "max pages reached:       %lu\n"
 162                       "grows:                   %u\n"
 163                       "grows failure:           %u\n"
 164                       "shrinks:                 %u\n"
 165                       "cache access:            %lu\n"
 166                       "cache missing:           %lu\n"
 167                       "low free mark:           %lu\n"
 168                       "max waitqueue depth:     %u\n"
 169                       "max wait time:           "CFS_TIME_T"/%u\n"
 170                       ,
 171                       cfs_num_physpages,
 172                       PAGES_PER_POOL,
 173                       page_pools.epp_max_pages,
 174                       page_pools.epp_max_pools,
 175                       page_pools.epp_total_pages,
 176                       page_pools.epp_free_pages,
 177                       page_pools.epp_idle_idx,
 178                       cfs_time_current_sec() - page_pools.epp_last_shrink,
 179                       cfs_time_current_sec() - page_pools.epp_last_access,
 180                       page_pools.epp_st_max_pages,
 181                       page_pools.epp_st_grows,
 182                       page_pools.epp_st_grow_fails,
 183                       page_pools.epp_st_shrinks,
 184                       page_pools.epp_st_access,
 185                       page_pools.epp_st_missings,
 186                       page_pools.epp_st_lowfree,
 187                       page_pools.epp_st_max_wqlen,
 188                       page_pools.epp_st_max_wait, CFS_HZ
 189                      );
 190
 191         cfs_spin_unlock(&page_pools.epp_lock);
 192         return rc;
 193 }
 194
 195 static void enc_pools_release_free_pages(long npages)
 196 {
 197         int     p_idx, g_idx;
 198         int     p_idx_max1, p_idx_max2;
 199
 200         LASSERT(npages > 0);
 201         LASSERT(npages <= page_pools.epp_free_pages);
 202         LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages);
 203
 204         /* max pool index before the release */
 205         p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL;
 206
 207         page_pools.epp_free_pages -= npages;
 208         page_pools.epp_total_pages -= npages;
 209
 210         /* max pool index after the release */
 211         p_idx_max1 = page_pools.epp_total_pages == 0 ? -1 :
 212                      ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL);
 213
 214         p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
 215         g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
 216         LASSERT(page_pools.epp_pools[p_idx]);
 217
 218         while (npages--) {
 219                 LASSERT(page_pools.epp_pools[p_idx]);
 220                 LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL);
 221
 222                 cfs_free_page(page_pools.epp_pools[p_idx][g_idx]);
 223                 page_pools.epp_pools[p_idx][g_idx] = NULL;
 224
 225                 if (++g_idx == PAGES_PER_POOL) {
 226                         p_idx++;
 227                         g_idx = 0;
 228                 }
 229         };
 230
 231         /* free unused pools */
 232         while (p_idx_max1 < p_idx_max2) {
 233                 LASSERT(page_pools.epp_pools[p_idx_max2]);
 234                 OBD_FREE(page_pools.epp_pools[p_idx_max2], CFS_PAGE_SIZE);
 235                 page_pools.epp_pools[p_idx_max2] = NULL;
 236                 p_idx_max2--;
 237         }
 238 }
 239
 240 /*
 241  * could be called frequently for query (@nr_to_scan == 0).
 242  * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool.
 243  */
 244 static int enc_pools_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
 245 {
 246         if (unlikely(shrink_param(sc, nr_to_scan) != 0)) {
 247                 cfs_spin_lock(&page_pools.epp_lock);
 248                 shrink_param(sc, nr_to_scan) = min_t(unsigned long,
 249                                                    shrink_param(sc, nr_to_scan),
 250                                                    page_pools.epp_free_pages -
 251                                                    PTLRPC_MAX_BRW_PAGES);
 252                 if (shrink_param(sc, nr_to_scan) > 0) {
 253                         enc_pools_release_free_pages(shrink_param(sc,
 254                                                                   nr_to_scan));
 255                         CDEBUG(D_SEC, "released %ld pages, %ld left\n",
 256                                (long)shrink_param(sc, nr_to_scan),
 257                                page_pools.epp_free_pages);
 258
 259                         page_pools.epp_st_shrinks++;
 260                         page_pools.epp_last_shrink = cfs_time_current_sec();
 261                 }
 262                 cfs_spin_unlock(&page_pools.epp_lock);
 263         }
 264
 265         /*
 266          * if no pool access for a long time, we consider it's fully idle.
 267          * a little race here is fine.
 268          */
 269         if (unlikely(cfs_time_current_sec() - page_pools.epp_last_access >
 270                      CACHE_QUIESCENT_PERIOD)) {
 271                 cfs_spin_lock(&page_pools.epp_lock);
 272                 page_pools.epp_idle_idx = IDLE_IDX_MAX;
 273                 cfs_spin_unlock(&page_pools.epp_lock);
 274         }
 275
 276         LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX);
 277         return max((int) page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES, 0) *
 278                (IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX;
 279 }
 280
 281 static inline
 282 int npages_to_npools(unsigned long npages)
 283 {
 284         return (int) ((npages + PAGES_PER_POOL - 1) / PAGES_PER_POOL);
 285 }
 286
 287 /*
 288  * return how many pages cleaned up.
 289  */
 290 static unsigned long enc_pools_cleanup(cfs_page_t ***pools, int npools)
 291 {
 292         unsigned long cleaned = 0;
 293         int           i, j;
 294
 295         for (i = 0; i < npools; i++) {
 296                 if (pools[i]) {
 297                         for (j = 0; j < PAGES_PER_POOL; j++) {
 298                                 if (pools[i][j]) {
 299                                         cfs_free_page(pools[i][j]);
 300                                         cleaned++;
 301                                 }
 302                         }
 303                         OBD_FREE(pools[i], CFS_PAGE_SIZE);
 304                         pools[i] = NULL;
 305                 }
 306         }
 307
 308         return cleaned;
 309 }
 310
 311 /*
 312  * merge @npools pointed by @pools which contains @npages new pages
 313  * into current pools.
 314  *
 315  * we have options to avoid most memory copy with some tricks. but we choose
 316  * the simplest way to avoid complexity. It's not frequently called.
 317  */
 318 static void enc_pools_insert(cfs_page_t ***pools, int npools, int npages)
 319 {
 320         int     freeslot;
 321         int     op_idx, np_idx, og_idx, ng_idx;
 322         int     cur_npools, end_npools;
 323
 324         LASSERT(npages > 0);
 325         LASSERT(page_pools.epp_total_pages+npages <= page_pools.epp_max_pages);
 326         LASSERT(npages_to_npools(npages) == npools);
 327         LASSERT(page_pools.epp_growing);
 328
 329         cfs_spin_lock(&page_pools.epp_lock);
 330
 331         /*
 332          * (1) fill all the free slots of current pools.
 333          */
 334         /* free slots are those left by rent pages, and the extra ones with
 335          * index >= total_pages, locate at the tail of last pool. */
 336         freeslot = page_pools.epp_total_pages % PAGES_PER_POOL;
 337         if (freeslot != 0)
 338                 freeslot = PAGES_PER_POOL - freeslot;
 339         freeslot += page_pools.epp_total_pages - page_pools.epp_free_pages;
 340
 341         op_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
 342         og_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
 343         np_idx = npools - 1;
 344         ng_idx = (npages - 1) % PAGES_PER_POOL;
 345
 346         while (freeslot) {
 347                 LASSERT(page_pools.epp_pools[op_idx][og_idx] == NULL);
 348                 LASSERT(pools[np_idx][ng_idx] != NULL);
 349
 350                 page_pools.epp_pools[op_idx][og_idx] = pools[np_idx][ng_idx];
 351                 pools[np_idx][ng_idx] = NULL;
 352
 353                 freeslot--;
 354
 355                 if (++og_idx == PAGES_PER_POOL) {
 356                         op_idx++;
 357                         og_idx = 0;
 358                 }
 359                 if (--ng_idx < 0) {
 360                         if (np_idx == 0)
 361                                 break;
 362                         np_idx--;
 363                         ng_idx = PAGES_PER_POOL - 1;
 364                 }
 365         }
 366
 367         /*
 368          * (2) add pools if needed.
 369          */
 370         cur_npools = (page_pools.epp_total_pages + PAGES_PER_POOL - 1) /
 371                      PAGES_PER_POOL;
 372         end_npools = (page_pools.epp_total_pages + npages + PAGES_PER_POOL -1) /
 373                      PAGES_PER_POOL;
 374         LASSERT(end_npools <= page_pools.epp_max_pools);
 375
 376         np_idx = 0;
 377         while (cur_npools < end_npools) {
 378                 LASSERT(page_pools.epp_pools[cur_npools] == NULL);
 379                 LASSERT(np_idx < npools);
 380                 LASSERT(pools[np_idx] != NULL);
 381
 382                 page_pools.epp_pools[cur_npools++] = pools[np_idx];
 383                 pools[np_idx++] = NULL;
 384         }
 385
 386         page_pools.epp_total_pages += npages;
 387         page_pools.epp_free_pages += npages;
 388         page_pools.epp_st_lowfree = page_pools.epp_free_pages;
 389
 390         if (page_pools.epp_total_pages > page_pools.epp_st_max_pages)
 391                 page_pools.epp_st_max_pages = page_pools.epp_total_pages;
 392
 393         CDEBUG(D_SEC, "add %d pages to total %lu\n", npages,
 394                page_pools.epp_total_pages);
 395
 396         cfs_spin_unlock(&page_pools.epp_lock);
 397 }
 398
 399 static int enc_pools_add_pages(int npages)
 400 {
 401         static CFS_DECLARE_MUTEX(sem_add_pages);
 402         cfs_page_t   ***pools;
 403         int             npools, alloced = 0;
 404         int             i, j, rc = -ENOMEM;
 405
 406         if (npages < PTLRPC_MAX_BRW_PAGES)
 407                 npages = PTLRPC_MAX_BRW_PAGES;
 408
 409         cfs_down(&sem_add_pages);
 410
 411         if (npages + page_pools.epp_total_pages > page_pools.epp_max_pages)
 412                 npages = page_pools.epp_max_pages - page_pools.epp_total_pages;
 413         LASSERT(npages > 0);
 414
 415         page_pools.epp_st_grows++;
 416
 417         npools = npages_to_npools(npages);
 418         OBD_ALLOC(pools, npools * sizeof(*pools));
 419         if (pools == NULL)
 420                 goto out;
 421
 422         for (i = 0; i < npools; i++) {
 423                 OBD_ALLOC(pools[i], CFS_PAGE_SIZE);
 424                 if (pools[i] == NULL)
 425                         goto out_pools;
 426
 427                 for (j = 0; j < PAGES_PER_POOL && alloced < npages; j++) {
 428                         pools[i][j] = cfs_alloc_page(CFS_ALLOC_IO |
 429                                                      CFS_ALLOC_HIGH);
 430                         if (pools[i][j] == NULL)
 431                                 goto out_pools;
 432
 433                         alloced++;
 434                 }
 435         }
 436         LASSERT(alloced == npages);
 437
 438         enc_pools_insert(pools, npools, npages);
 439         CDEBUG(D_SEC, "added %d pages into pools\n", npages);
 440         rc = 0;
 441
 442 out_pools:
 443         enc_pools_cleanup(pools, npools);
 444         OBD_FREE(pools, npools * sizeof(*pools));
 445 out:
 446         if (rc) {
 447                 page_pools.epp_st_grow_fails++;
 448                 CERROR("Failed to allocate %d enc pages\n", npages);
 449         }
 450
 451         cfs_up(&sem_add_pages);
 452         return rc;
 453 }
 454
 455 static inline void enc_pools_wakeup(void)
 456 {
 457         LASSERT_SPIN_LOCKED(&page_pools.epp_lock);
 458         LASSERT(page_pools.epp_waitqlen >= 0);
 459
 460         if (unlikely(page_pools.epp_waitqlen)) {
 461                 LASSERT(cfs_waitq_active(&page_pools.epp_waitq));
 462                 cfs_waitq_broadcast(&page_pools.epp_waitq);
 463         }
 464 }
 465
 466 static int enc_pools_should_grow(int page_needed, long now)
 467 {
 468         /* don't grow if someone else is growing the pools right now,
 469          * or the pools has reached its full capacity
 470          */
 471         if (page_pools.epp_growing ||
 472             page_pools.epp_total_pages == page_pools.epp_max_pages)
 473                 return 0;
 474
 475         /* if total pages is not enough, we need to grow */
 476         if (page_pools.epp_total_pages < page_needed)
 477                 return 1;
 478
 479         /*
 480          * we wanted to return 0 here if there was a shrink just happened
 481          * moment ago, but this may cause deadlock if both client and ost
 482          * live on single node.
 483          */
 484 #if 0
 485         if (now - page_pools.epp_last_shrink < 2)
 486                 return 0;
 487 #endif
 488
 489         /*
 490          * here we perhaps need consider other factors like wait queue
 491          * length, idle index, etc. ?
 492          */
 493
 494         /* grow the pools in any other cases */
 495         return 1;
 496 }
 497
 498 /*
 499  * we allocate the requested pages atomically.
 500  */
 501 int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc)
 502 {
 503         cfs_waitlink_t  waitlink;
 504         unsigned long   this_idle = -1;
 505         cfs_time_t      tick = 0;
 506         long            now;
 507         int             p_idx, g_idx;
 508         int             i;
 509
 510         LASSERT(desc->bd_iov_count > 0);
 511         LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages);
 512
 513         /* resent bulk, enc iov might have been allocated previously */
 514         if (desc->bd_enc_iov != NULL)
 515                 return 0;
 516
 517         OBD_ALLOC(desc->bd_enc_iov,
 518                   desc->bd_iov_count * sizeof(*desc->bd_enc_iov));
 519         if (desc->bd_enc_iov == NULL)
 520                 return -ENOMEM;
 521
 522         cfs_spin_lock(&page_pools.epp_lock);
 523
 524         page_pools.epp_st_access++;
 525 again:
 526         if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) {
 527                 if (tick == 0)
 528                         tick = cfs_time_current();
 529
 530                 now = cfs_time_current_sec();
 531
 532                 page_pools.epp_st_missings++;
 533                 page_pools.epp_pages_short += desc->bd_iov_count;
 534
 535                 if (enc_pools_should_grow(desc->bd_iov_count, now)) {
 536                         page_pools.epp_growing = 1;
 537
 538                         cfs_spin_unlock(&page_pools.epp_lock);
 539                         enc_pools_add_pages(page_pools.epp_pages_short / 2);
 540                         cfs_spin_lock(&page_pools.epp_lock);
 541
 542                         page_pools.epp_growing = 0;
 543
 544                         enc_pools_wakeup();
 545                 } else {
 546                         if (++page_pools.epp_waitqlen >
 547                             page_pools.epp_st_max_wqlen)
 548                                 page_pools.epp_st_max_wqlen =
 549                                                 page_pools.epp_waitqlen;
 550
 551                         cfs_set_current_state(CFS_TASK_UNINT);
 552                         cfs_waitlink_init(&waitlink);
 553                         cfs_waitq_add(&page_pools.epp_waitq, &waitlink);
 554
 555                         cfs_spin_unlock(&page_pools.epp_lock);
 556                         cfs_waitq_wait(&waitlink, CFS_TASK_UNINT);
 557                         cfs_waitq_del(&page_pools.epp_waitq, &waitlink);
 558                         LASSERT(page_pools.epp_waitqlen > 0);
 559                         cfs_spin_lock(&page_pools.epp_lock);
 560                         page_pools.epp_waitqlen--;
 561                 }
 562
 563                 LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count);
 564                 page_pools.epp_pages_short -= desc->bd_iov_count;
 565
 566                 this_idle = 0;
 567                 goto again;
 568         }
 569
 570         /* record max wait time */
 571         if (unlikely(tick != 0)) {
 572                 tick = cfs_time_current() - tick;
 573                 if (tick > page_pools.epp_st_max_wait)
 574                         page_pools.epp_st_max_wait = tick;
 575         }
 576
 577         /* proceed with rest of allocation */
 578         page_pools.epp_free_pages -= desc->bd_iov_count;
 579
 580         p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
 581         g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
 582
 583         for (i = 0; i < desc->bd_iov_count; i++) {
 584                 LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL);
 585                 desc->bd_enc_iov[i].kiov_page =
 586                                         page_pools.epp_pools[p_idx][g_idx];
 587                 page_pools.epp_pools[p_idx][g_idx] = NULL;
 588
 589                 if (++g_idx == PAGES_PER_POOL) {
 590                         p_idx++;
 591                         g_idx = 0;
 592                 }
 593         }
 594
 595         if (page_pools.epp_free_pages < page_pools.epp_st_lowfree)
 596                 page_pools.epp_st_lowfree = page_pools.epp_free_pages;
 597
 598         /*
 599          * new idle index = (old * weight + new) / (weight + 1)
 600          */
 601         if (this_idle == -1) {
 602                 this_idle = page_pools.epp_free_pages * IDLE_IDX_MAX /
 603                             page_pools.epp_total_pages;
 604         }
 605         page_pools.epp_idle_idx = (page_pools.epp_idle_idx * IDLE_IDX_WEIGHT +
 606                                    this_idle) /
 607                                   (IDLE_IDX_WEIGHT + 1);
 608
 609         page_pools.epp_last_access = cfs_time_current_sec();
 610
 611         cfs_spin_unlock(&page_pools.epp_lock);
 612         return 0;
 613 }
 614 EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages);
 615
 616 void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc)
 617 {
 618         int     p_idx, g_idx;
 619         int     i;
 620
 621         if (desc->bd_enc_iov == NULL)
 622                 return;
 623
 624         LASSERT(desc->bd_iov_count > 0);
 625
 626         cfs_spin_lock(&page_pools.epp_lock);
 627
 628         p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
 629         g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
 630
 631         LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <=
 632                 page_pools.epp_total_pages);
 633         LASSERT(page_pools.epp_pools[p_idx]);
 634
 635         for (i = 0; i < desc->bd_iov_count; i++) {
 636                 LASSERT(desc->bd_enc_iov[i].kiov_page != NULL);
 637                 LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]);
 638                 LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL);
 639
 640                 page_pools.epp_pools[p_idx][g_idx] =
 641                                         desc->bd_enc_iov[i].kiov_page;
 642
 643                 if (++g_idx == PAGES_PER_POOL) {
 644                         p_idx++;
 645                         g_idx = 0;
 646                 }
 647         }
 648
 649         page_pools.epp_free_pages += desc->bd_iov_count;
 650
 651         enc_pools_wakeup();
 652
 653         cfs_spin_unlock(&page_pools.epp_lock);
 654
 655         OBD_FREE(desc->bd_enc_iov,
 656                  desc->bd_iov_count * sizeof(*desc->bd_enc_iov));
 657         desc->bd_enc_iov = NULL;
 658 }
 659 EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages);
 660
 661 /*
 662  * we don't do much stuff for add_user/del_user anymore, except adding some
 663  * initial pages in add_user() if current pools are empty, rest would be
 664  * handled by the pools's self-adaption.
 665  */
 666 int sptlrpc_enc_pool_add_user(void)
 667 {
 668         int     need_grow = 0;
 669
 670         cfs_spin_lock(&page_pools.epp_lock);
 671         if (page_pools.epp_growing == 0 && page_pools.epp_total_pages == 0) {
 672                 page_pools.epp_growing = 1;
 673                 need_grow = 1;
 674         }
 675         cfs_spin_unlock(&page_pools.epp_lock);
 676
 677         if (need_grow) {
 678                 enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES +
 679                                     PTLRPC_MAX_BRW_PAGES);
 680
 681                 cfs_spin_lock(&page_pools.epp_lock);
 682                 page_pools.epp_growing = 0;
 683                 enc_pools_wakeup();
 684                 cfs_spin_unlock(&page_pools.epp_lock);
 685         }
 686         return 0;
 687 }
 688 EXPORT_SYMBOL(sptlrpc_enc_pool_add_user);
 689
 690 int sptlrpc_enc_pool_del_user(void)
 691 {
 692         return 0;
 693 }
 694 EXPORT_SYMBOL(sptlrpc_enc_pool_del_user);
 695
 696 static inline void enc_pools_alloc(void)
 697 {
 698         LASSERT(page_pools.epp_max_pools);
 699         OBD_ALLOC_LARGE(page_pools.epp_pools,
 700                         page_pools.epp_max_pools *
 701                         sizeof(*page_pools.epp_pools));
 702 }
 703
 704 static inline void enc_pools_free(void)
 705 {
 706         LASSERT(page_pools.epp_max_pools);
 707         LASSERT(page_pools.epp_pools);
 708
 709         OBD_FREE_LARGE(page_pools.epp_pools,
 710                        page_pools.epp_max_pools *
 711                        sizeof(*page_pools.epp_pools));
 712 }
 713
 714 int sptlrpc_enc_pool_init(void)
 715 {
 716         /*
 717          * maximum capacity is 1/8 of total physical memory.
 718          * is the 1/8 a good number?
 719          */
 720         page_pools.epp_max_pages = cfs_num_physpages / 8;
 721         page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages);
 722
 723         cfs_waitq_init(&page_pools.epp_waitq);
 724         page_pools.epp_waitqlen = 0;
 725         page_pools.epp_pages_short = 0;
 726
 727         page_pools.epp_growing = 0;
 728
 729         page_pools.epp_idle_idx = 0;
 730         page_pools.epp_last_shrink = cfs_time_current_sec();
 731         page_pools.epp_last_access = cfs_time_current_sec();
 732
 733         cfs_spin_lock_init(&page_pools.epp_lock);
 734         page_pools.epp_total_pages = 0;
 735         page_pools.epp_free_pages = 0;
 736
 737         page_pools.epp_st_max_pages = 0;
 738         page_pools.epp_st_grows = 0;
 739         page_pools.epp_st_grow_fails = 0;
 740         page_pools.epp_st_shrinks = 0;
 741         page_pools.epp_st_access = 0;
 742         page_pools.epp_st_missings = 0;
 743         page_pools.epp_st_lowfree = 0;
 744         page_pools.epp_st_max_wqlen = 0;
 745         page_pools.epp_st_max_wait = 0;
 746
 747         enc_pools_alloc();
 748         if (page_pools.epp_pools == NULL)
 749                 return -ENOMEM;
 750
 751         pools_shrinker = cfs_set_shrinker(pools_shrinker_seeks,
 752                                           enc_pools_shrink);
 753         if (pools_shrinker == NULL) {
 754                 enc_pools_free();
 755                 return -ENOMEM;
 756         }
 757
 758         return 0;
 759 }
 760
 761 void sptlrpc_enc_pool_fini(void)
 762 {
 763         unsigned long cleaned, npools;
 764
 765         LASSERT(pools_shrinker);
 766         LASSERT(page_pools.epp_pools);
 767         LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages);
 768
 769         cfs_remove_shrinker(pools_shrinker);
 770
 771         npools = npages_to_npools(page_pools.epp_total_pages);
 772         cleaned = enc_pools_cleanup(page_pools.epp_pools, npools);
 773         LASSERT(cleaned == page_pools.epp_total_pages);
 774
 775         enc_pools_free();
 776
 777         if (page_pools.epp_st_access > 0) {
 778                 CDEBUG(D_SEC,
 779                        "max pages %lu, grows %u, grow fails %u, shrinks %u, "
 780                        "access %lu, missing %lu, max qlen %u, max wait "
 781                        CFS_TIME_T"/%d\n",
 782                        page_pools.epp_st_max_pages, page_pools.epp_st_grows,
 783                        page_pools.epp_st_grow_fails,
 784                        page_pools.epp_st_shrinks, page_pools.epp_st_access,
 785                        page_pools.epp_st_missings, page_pools.epp_st_max_wqlen,
 786                        page_pools.epp_st_max_wait, CFS_HZ);
 787         }
 788 }
 789
 790 #else /* !__KERNEL__ */
 791
 792 int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc)
 793 {
 794         return 0;
 795 }
 796
 797 void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc)
 798 {
 799 }
 800
 801 int sptlrpc_enc_pool_init(void)
 802 {
 803         return 0;
 804 }
 805
 806 void sptlrpc_enc_pool_fini(void)
 807 {
 808 }
 809 #endif
 810
 811 /****************************************
 812  * Helpers to assist policy modules to  *
 813  * implement checksum functionality     *
 814  ****************************************/
 815
     /*
      * Table of the bulk hash algorithms, indexed by BULK_HASH_ALG_*.
      * The lookup helpers in this file read the sht_name and sht_tfm_name
      * members of an entry.
      * NOTE(review): the initializers are positional; presumably they are
      * { name, crypto transform name, digest size in bytes } - confirm
      * against the declaration of struct sptlrpc_hash_type.
      */
 816 static struct sptlrpc_hash_type hash_types[] = {
 817         [BULK_HASH_ALG_NULL]    = { "null",     "null",         0 },
 818         [BULK_HASH_ALG_ADLER32] = { "adler32",  "adler32",      4 },
 819         [BULK_HASH_ALG_CRC32]   = { "crc32",    "crc32",        4 },
 820         [BULK_HASH_ALG_MD5]     = { "md5",      "md5",          16 },
 821         [BULK_HASH_ALG_SHA1]    = { "sha1",     "sha1",         20 },
 822         [BULK_HASH_ALG_SHA256]  = { "sha256",   "sha256",       32 },
 823         [BULK_HASH_ALG_SHA384]  = { "sha384",   "sha384",       48 },
 824         [BULK_HASH_ALG_SHA512]  = { "sha512",   "sha512",       64 },
 825 };
 826
 827 const struct sptlrpc_hash_type *sptlrpc_get_hash_type(__u8 hash_alg)
 828 {
 829         struct sptlrpc_hash_type *ht;
 830
 831         if (hash_alg < BULK_HASH_ALG_MAX) {
 832                 ht = &hash_types[hash_alg];
 833                 if (ht->sht_tfm_name)
 834                         return ht;
 835         }
 836         return NULL;
 837 }
 838 EXPORT_SYMBOL(sptlrpc_get_hash_type);
 839
 840 const char * sptlrpc_get_hash_name(__u8 hash_alg)
 841 {
 842         const struct sptlrpc_hash_type *ht;
 843
 844         ht = sptlrpc_get_hash_type(hash_alg);
 845         if (ht)
 846                 return ht->sht_name;
 847         else
 848                 return "unknown";
 849 }
 850 EXPORT_SYMBOL(sptlrpc_get_hash_name);
 851
 852 __u8 sptlrpc_get_hash_alg(const char *algname)
 853 {
 854         int     i;
 855
 856         for (i = 0; i < BULK_HASH_ALG_MAX; i++)
 857                 if (!strcmp(hash_types[i].sht_name, algname))
 858                         break;
 859         return i;
 860 }
 861 EXPORT_SYMBOL(sptlrpc_get_hash_alg);
 862
 863 int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed)
 864 {
 865         struct ptlrpc_bulk_sec_desc *bsd;
 866         int                          size = msg->lm_buflens[offset];
 867
 868         bsd = lustre_msg_buf(msg, offset, sizeof(*bsd));
 869         if (bsd == NULL) {
 870                 CERROR("Invalid bulk sec desc: size %d\n", size);
 871                 return -EINVAL;
 872         }
 873
 874         if (swabbed) {
 875                 __swab32s(&bsd->bsd_nob);
 876         }
 877
 878         if (unlikely(bsd->bsd_version != 0)) {
 879                 CERROR("Unexpected version %u\n", bsd->bsd_version);
 880                 return -EPROTO;
 881         }
 882
 883         if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) {
 884                 CERROR("Invalid type %u\n", bsd->bsd_type);
 885                 return -EPROTO;
 886         }
 887
 888         /* FIXME more sanity check here */
 889
 890         if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL &&
 891                      bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG &&
 892                      bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)) {
 893                 CERROR("Invalid svc %u\n", bsd->bsd_svc);
 894                 return -EPROTO;
 895         }
 896
 897         return 0;
 898 }
 899 EXPORT_SYMBOL(bulk_sec_desc_unpack);
 900
 901 #ifdef __KERNEL__
 902
 903 #ifdef HAVE_ADLER
 904 static int do_bulk_checksum_adler32(struct ptlrpc_bulk_desc *desc, void *buf)
 905 {
 906         struct page    *page;
 907         int             off;
 908         char           *ptr;
 909         __u32           adler32 = 1;
 910         int             len, i;
 911
 912         for (i = 0; i < desc->bd_iov_count; i++) {
 913                 page = desc->bd_iov[i].kiov_page;
 914                 off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
 915                 ptr = cfs_kmap(page) + off;
 916                 len = desc->bd_iov[i].kiov_len;
 917
 918                 adler32 = adler32(adler32, ptr, len);
 919
 920                 cfs_kunmap(page);
 921         }
 922
 923         adler32 = cpu_to_le32(adler32);
 924         memcpy(buf, &adler32, sizeof(adler32));
 925         return 0;
 926 }
 927 #endif
 928
 929 static int do_bulk_checksum_crc32(struct ptlrpc_bulk_desc *desc, void *buf)
 930 {
 931         struct page    *page;
 932         int             off;
 933         char           *ptr;
 934         __u32           crc32 = ~0;
 935         int             len, i;
 936
 937         for (i = 0; i < desc->bd_iov_count; i++) {
 938                 page = desc->bd_iov[i].kiov_page;
 939                 off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
 940                 ptr = cfs_kmap(page) + off;
 941                 len = desc->bd_iov[i].kiov_len;
 942
 943                 crc32 = crc32_le(crc32, ptr, len);
 944
 945                 cfs_kunmap(page);
 946         }
 947
 948         crc32 = cpu_to_le32(crc32);
 949         memcpy(buf, &crc32, sizeof(crc32));
 950         return 0;
 951 }
 952
 953 int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
 954                               void *buf, int buflen)
 955 {
 956         struct hash_desc    hdesc;
 957         int                 hashsize;
 958         char                hashbuf[64];
 959         struct scatterlist  sl;
 960         int                 i;
 961
 962         LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX);
 963         LASSERT(buflen >= 4);
 964
 965         switch (alg) {
 966         case BULK_HASH_ALG_ADLER32:
 967 #ifdef HAVE_ADLER
 968                 return do_bulk_checksum_adler32(desc, buf);
 969 #else
 970                 CERROR("Adler32 not supported\n");
 971                 return -EINVAL;
 972 #endif
 973         case BULK_HASH_ALG_CRC32:
 974                 return do_bulk_checksum_crc32(desc, buf);
 975         }
 976
 977         hdesc.tfm = ll_crypto_alloc_hash(hash_types[alg].sht_tfm_name, 0, 0);
 978         if (hdesc.tfm == NULL) {
 979                 CERROR("Unable to allocate TFM %s\n", hash_types[alg].sht_name);
 980                 return -ENOMEM;
 981         }
 982
 983         hdesc.flags = 0;
 984         ll_crypto_hash_init(&hdesc);
 985
 986         hashsize = ll_crypto_hash_digestsize(hdesc.tfm);
 987
 988         for (i = 0; i < desc->bd_iov_count; i++) {
 989                 sg_set_page(&sl, desc->bd_iov[i].kiov_page,
 990                              desc->bd_iov[i].kiov_len,
 991                              desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK);
 992                 ll_crypto_hash_update(&hdesc, &sl, sl.length);
 993         }
 994
 995         if (hashsize > buflen) {
 996                 ll_crypto_hash_final(&hdesc, hashbuf);
 997                 memcpy(buf, hashbuf, buflen);
 998         } else {
 999                 ll_crypto_hash_final(&hdesc, buf);
1000         }
1001
1002         ll_crypto_free_hash(hdesc.tfm);
1003         return 0;
1004 }
1005 EXPORT_SYMBOL(sptlrpc_get_bulk_checksum);
1006
1007 #else /* !__KERNEL__ */
1008
1009 int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
1010                               void *buf, int buflen)
1011 {
1012         __u32   csum32;
1013         int     i;
1014
1015         LASSERT(alg == BULK_HASH_ALG_ADLER32 || alg == BULK_HASH_ALG_CRC32);
1016
1017         if (alg == BULK_HASH_ALG_ADLER32)
1018                 csum32 = 1;
1019         else
1020                 csum32 = ~0;
1021
1022         for (i = 0; i < desc->bd_iov_count; i++) {
1023                 unsigned char *ptr = desc->bd_iov[i].iov_base;
1024                 int len = desc->bd_iov[i].iov_len;
1025
1026                 switch (alg) {
1027                 case BULK_HASH_ALG_ADLER32:
1028 #ifdef HAVE_ADLER
1029                         csum32 = adler32(csum32, ptr, len);
1030 #else
1031                         CERROR("Adler32 not supported\n");
1032                         return -EINVAL;
1033 #endif
1034                         break;
1035                 case BULK_HASH_ALG_CRC32:
1036                         csum32 = crc32_le(csum32, ptr, len);
1037                         break;
1038                 }
1039         }
1040
1041         csum32 = cpu_to_le32(csum32);
1042         memcpy(buf, &csum32, sizeof(csum32));
1043         return 0;
1044 }
1045
1046 #endif /* __KERNEL__ */