lustre/osc/osc_request.c

   1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   2  * vim:expandtab:shiftwidth=8:tabstop=8:
   3  *
   4  *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
   5  *   Author Peter Braam <braam@clusterfs.com>
   6  *
   7  *   This file is part of Lustre, http://www.lustre.org.
   8  *
   9  *   Lustre is free software; you can redistribute it and/or
  10  *   modify it under the terms of version 2 of the GNU General Public
  11  *   License as published by the Free Software Foundation.
  12  *
  13  *   Lustre is distributed in the hope that it will be useful,
  14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  *   GNU General Public License for more details.
  17  *
  18  *   You should have received a copy of the GNU General Public License
  19  *   along with Lustre; if not, write to the Free Software
  20  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  21  *
   22  *  For testing and management it is treated as an obd_device,
   23  *  although it does not export a full OBD method table (the
   24  *  requests are coming in over the wire, so object target modules
   25  *  do not have a full method table.)
  26  *
  27  */
  28
  29 #ifndef EXPORT_SYMTAB
  30 # define EXPORT_SYMTAB
  31 #endif
  32 #define DEBUG_SUBSYSTEM S_OSC
  33
  34 #ifdef __KERNEL__
  35 # include <linux/version.h>
  36 # include <linux/module.h>
  37 # include <linux/mm.h>
  38 # include <linux/highmem.h>
  39 # include <linux/ctype.h>
  40 # include <linux/init.h>
  41 # if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  42 #  include <linux/workqueue.h>
  43 #  include <linux/smp_lock.h>
  44 # else
  45 #  include <linux/locks.h>
  46 # endif
  47 #else /* __KERNEL__ */
  48 # include <liblustre.h>
  49 #endif
  50
  51 #include <linux/lustre_dlm.h>
  52 #include <linux/kp30.h>
  53 #include <linux/lustre_net.h>
  54 #include <lustre/lustre_user.h>
  55 #include <linux/obd_ost.h>
  56 #include <linux/obd_lov.h>
  57
  58 #ifdef  __CYGWIN__
  59 # include <ctype.h>
  60 #endif
  61
  62 #include <linux/lustre_ha.h>
  63 #include <linux/lprocfs_status.h>
  64 #include <linux/lustre_log.h>
  65 #include "osc_internal.h"
  66
  67 /* Pack OSC object metadata for disk storage (LE byte order). */
  68 static int osc_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
  69                       struct lov_stripe_md *lsm)
  70 {
  71         int lmm_size;
  72         ENTRY;
  73
  74         lmm_size = sizeof(**lmmp);
  75         if (!lmmp)
  76                 RETURN(lmm_size);
  77
  78         if (*lmmp && !lsm) {
  79                 OBD_FREE(*lmmp, lmm_size);
  80                 *lmmp = NULL;
  81                 RETURN(0);
  82         }
  83
  84         if (!*lmmp) {
  85                 OBD_ALLOC(*lmmp, lmm_size);
  86                 if (!*lmmp)
  87                         RETURN(-ENOMEM);
  88         }
  89
  90         if (lsm) {
  91                 LASSERT(lsm->lsm_object_id);
  92                 LASSERT(lsm->lsm_object_gr);
  93                 (*lmmp)->lmm_object_id = cpu_to_le64(lsm->lsm_object_id);
  94                 (*lmmp)->lmm_object_gr = cpu_to_le64(lsm->lsm_object_gr);
  95         }
  96
  97         RETURN(lmm_size);
  98 }
  99
 100 /* Unpack OSC object metadata from disk storage (LE byte order). */
 101 static int osc_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
 102                         struct lov_mds_md *lmm, int lmm_bytes)
 103 {
 104         int lsm_size;
 105         ENTRY;
 106
 107         if (lmm != NULL) {
 108                 if (lmm_bytes < sizeof (*lmm)) {
 109                         CERROR("lov_mds_md too small: %d, need %d\n",
 110                                lmm_bytes, (int)sizeof(*lmm));
 111                         RETURN(-EINVAL);
 112                 }
 113                 /* XXX LOV_MAGIC etc check? */
 114
 115                 if (lmm->lmm_object_id == 0) {
 116                         CERROR("lov_mds_md: zero lmm_object_id\n");
 117                         RETURN(-EINVAL);
 118                 }
 119         }
 120
 121         lsm_size = lov_stripe_md_size(1);
 122         if (lsmp == NULL)
 123                 RETURN(lsm_size);
 124
 125         if (*lsmp != NULL && lmm == NULL) {
 126                 OBD_FREE(*lsmp, lsm_size);
 127                 *lsmp = NULL;
 128                 RETURN(0);
 129         }
 130
 131         if (*lsmp == NULL) {
 132                 OBD_ALLOC(*lsmp, lsm_size);
 133                 if (*lsmp == NULL)
 134                         RETURN(-ENOMEM);
 135                 loi_init((*lsmp)->lsm_oinfo);
 136         }
 137
 138         if (lmm != NULL) {
 139                 /* XXX zero *lsmp? */
 140                 (*lsmp)->lsm_object_id = le64_to_cpu (lmm->lmm_object_id);
 141                 (*lsmp)->lsm_object_gr = le64_to_cpu (lmm->lmm_object_gr);
 142                 LASSERT((*lsmp)->lsm_object_id);
 143                 LASSERT((*lsmp)->lsm_object_gr);
 144         }
 145
 146         (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES;
 147
 148         RETURN(lsm_size);
 149 }
 150
 151 static int osc_getattr_interpret(struct ptlrpc_request *req,
 152                                  struct osc_getattr_async_args *aa, int rc)
 153 {
 154         struct ost_body *body;
 155         ENTRY;
 156
 157         if (rc != 0)
 158                 RETURN(rc);
 159
 160         body = lustre_swab_repbuf(req, 0, sizeof(*body), lustre_swab_ost_body);
 161         if (body) {
 162                 CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
 163                 memcpy(aa->aa_oa, &body->oa, sizeof(*aa->aa_oa));
 164
 165                 /* This should really be sent by the OST */
 166                 aa->aa_oa->o_blksize = PTLRPC_MAX_BRW_SIZE;
 167                 aa->aa_oa->o_valid |= OBD_MD_FLBLKSZ;
 168         } else {
 169                 CERROR("can't unpack ost_body\n");
 170                 rc = -EPROTO;
 171                 aa->aa_oa->o_valid = 0;
 172         }
 173
 174         RETURN(rc);
 175 }
 176
 177 static int osc_getattr_async(struct obd_export *exp, struct obdo *oa,
 178                              struct lov_stripe_md *md,
 179                              struct ptlrpc_request_set *set)
 180 {
 181         struct ptlrpc_request *request;
 182         struct ost_body *body;
 183         int size = sizeof(*body);
 184         struct osc_getattr_async_args *aa;
 185         ENTRY;
 186
 187         request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OBD_VERSION,
 188                                   OST_GETATTR, 1, &size, NULL);
 189         if (!request)
 190                 RETURN(-ENOMEM);
 191
 192         body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
 193         memcpy(&body->oa, oa, sizeof(*oa));
 194
 195         request->rq_replen = lustre_msg_size(1, &size);
 196         request->rq_interpret_reply = osc_getattr_interpret;
 197
 198         LASSERT (sizeof (*aa) <= sizeof (request->rq_async_args));
 199         aa = (struct osc_getattr_async_args *)&request->rq_async_args;
 200         aa->aa_oa = oa;
 201
 202         ptlrpc_set_add_req (set, request);
 203         RETURN (0);
 204 }
 205
 206 static int osc_getattr(struct obd_export *exp, struct obdo *oa,
 207                        struct lov_stripe_md *md)
 208 {
 209         struct ptlrpc_request *request;
 210         struct ost_body *body;
 211         int rc, size = sizeof(*body);
 212         ENTRY;
 213
 214         request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OBD_VERSION,
 215                                   OST_GETATTR, 1, &size, NULL);
 216         if (!request)
 217                 RETURN(-ENOMEM);
 218
 219         body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
 220         memcpy(&body->oa, oa, sizeof(*oa));
 221
 222         request->rq_replen = lustre_msg_size(1, &size);
 223
 224         rc = ptlrpc_queue_wait(request);
 225         if (rc) {
 226                 CERROR("%s failed: rc = %d\n", __FUNCTION__, rc);
 227                 GOTO(out, rc);
 228         }
 229
 230         body = lustre_swab_repbuf(request, 0, sizeof (*body),
 231                                   lustre_swab_ost_body);
 232         if (body == NULL) {
 233                 CERROR ("can't unpack ost_body\n");
 234                 GOTO (out, rc = -EPROTO);
 235         }
 236
 237         CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
 238         memcpy(oa, &body->oa, sizeof(*oa));
 239
 240         /* This should really be sent by the OST */
 241         oa->o_blksize = PTLRPC_MAX_BRW_SIZE;
 242         oa->o_valid |= OBD_MD_FLBLKSZ;
 243
 244         EXIT;
 245  out:
 246         ptlrpc_req_finished(request);
 247         return rc;
 248 }
 249
 250 static int osc_setattr(struct obd_export *exp, struct obdo *oa,
 251                        struct lov_stripe_md *md, struct obd_trans_info *oti)
 252 {
 253         struct ptlrpc_request *request;
 254         struct ost_body *body;
 255         int rc, size = sizeof(*body);
 256         ENTRY;
 257
 258         LASSERT(!(oa->o_valid & OBD_MD_FLGROUP) || oa->o_gr > 0);
 259
 260         request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OBD_VERSION,
 261                                   OST_SETATTR, 1, &size, NULL);
 262         if (!request)
 263                 RETURN(-ENOMEM);
 264
 265         body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof(*body));
 266         memcpy(&body->oa, oa, sizeof(*oa));
 267
 268         request->rq_replen = lustre_msg_size(1, &size);
 269
 270         rc = ptlrpc_queue_wait(request);
 271         if (rc)
 272                 GOTO(out, rc);
 273
 274         body = lustre_swab_repbuf(request, 0, sizeof(*body),
 275                                   lustre_swab_ost_body);
 276         if (body == NULL)
 277                 GOTO(out, rc = -EPROTO);
 278
 279         memcpy(oa, &body->oa, sizeof(*oa));
 280
 281         EXIT;
 282 out:
 283         ptlrpc_req_finished(request);
 284         RETURN(0);
 285 }
 286
 287 int osc_real_create(struct obd_export *exp, struct obdo *oa,
 288                     struct lov_stripe_md **ea, struct obd_trans_info *oti)
 289 {
 290         struct ptlrpc_request *request;
 291         struct ost_body *body;
 292         struct lov_stripe_md *lsm;
 293         int rc, size = sizeof(*body);
 294         ENTRY;
 295
 296         LASSERT(oa);
 297         LASSERT(ea);
 298
 299         lsm = *ea;
 300         if (!lsm) {
 301                 rc = obd_alloc_memmd(exp, &lsm);
 302                 if (rc < 0)
 303                         RETURN(rc);
 304         }
 305
 306         request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OBD_VERSION,
 307                                   OST_CREATE, 1, &size, NULL);
 308         if (!request)
 309                 GOTO(out, rc = -ENOMEM);
 310
 311         body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
 312         memcpy(&body->oa, oa, sizeof(body->oa));
 313
 314         request->rq_replen = lustre_msg_size(1, &size);
 315         if (oa->o_valid & OBD_MD_FLINLINE) {
 316                 LASSERT((oa->o_valid & OBD_MD_FLFLAGS) &&
 317                         oa->o_flags == OBD_FL_DELORPHAN);
 318                 DEBUG_REQ(D_HA, request,
 319                           "delorphan from OST integration");
 320                 /* Don't resend the delorphan request */
 321                 request->rq_no_resend = request->rq_no_delay = 1;
 322         }
 323
 324         rc = ptlrpc_queue_wait(request);
 325         if (rc)
 326                 GOTO(out_req, rc);
 327
 328         body = lustre_swab_repbuf(request, 0, sizeof(*body),
 329                                   lustre_swab_ost_body);
 330         if (body == NULL) {
 331                 CERROR ("can't unpack ost_body\n");
 332                 GOTO (out_req, rc = -EPROTO);
 333         }
 334
 335         memcpy(oa, &body->oa, sizeof(*oa));
 336
 337         /* This should really be sent by the OST */
 338         oa->o_blksize = PTLRPC_MAX_BRW_SIZE;
 339         oa->o_valid |= OBD_MD_FLBLKSZ;
 340
 341         /* XXX LOV STACKING: the lsm that is passed to us from LOV does not
 342          * have valid lsm_oinfo data structs, so don't go touching that.
 343          * This needs to be fixed in a big way.
 344          */
 345         lsm->lsm_object_id = oa->o_id;
 346         lsm->lsm_object_gr = oa->o_gr;
 347         *ea = lsm;
 348
 349         if (oti != NULL) {
 350                 oti->oti_transno = request->rq_repmsg->transno;
 351
 352                 if (oa->o_valid & OBD_MD_FLCOOKIE) {
 353                         if (!oti->oti_logcookies)
 354                                 oti_alloc_cookies(oti, 1);
 355                         memcpy(oti->oti_logcookies, obdo_logcookie(oa),
 356                                sizeof(oti->oti_onecookie));
 357                 }
 358         }
 359
 360         CDEBUG(D_HA, "transno: "LPD64"\n", request->rq_repmsg->transno);
 361         EXIT;
 362 out_req:
 363         ptlrpc_req_finished(request);
 364 out:
 365         if (rc && !*ea)
 366                 obd_free_memmd(exp, &lsm);
 367         return rc;
 368 }
 369
 370 static int osc_punch(struct obd_export *exp, struct obdo *oa,
 371                      struct lov_stripe_md *md, obd_size start,
 372                      obd_size end, struct obd_trans_info *oti)
 373 {
 374         struct ptlrpc_request *request;
 375         struct ost_body *body;
 376         int rc, size = sizeof(*body);
 377         ENTRY;
 378
 379         if (!oa) {
 380                 CERROR("oa NULL\n");
 381                 RETURN(-EINVAL);
 382         }
 383
 384         request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OBD_VERSION,
 385                                   OST_PUNCH, 1, &size, NULL);
 386         if (!request)
 387                 RETURN(-ENOMEM);
 388
 389         body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
 390         memcpy(&body->oa, oa, sizeof(*oa));
 391
 392         /* overload the size and blocks fields in the oa with start/end */
 393         body->oa.o_size = start;
 394         body->oa.o_blocks = end;
 395         body->oa.o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
 396
 397         request->rq_replen = lustre_msg_size(1, &size);
 398
 399         rc = ptlrpc_queue_wait(request);
 400         if (rc)
 401                 GOTO(out, rc);
 402
 403         body = lustre_swab_repbuf (request, 0, sizeof (*body),
 404                                    lustre_swab_ost_body);
 405         if (body == NULL) {
 406                 CERROR ("can't unpack ost_body\n");
 407                 GOTO (out, rc = -EPROTO);
 408         }
 409
 410         memcpy(oa, &body->oa, sizeof(*oa));
 411
 412         EXIT;
 413  out:
 414         ptlrpc_req_finished(request);
 415         return rc;
 416 }
 417
 418 static int osc_sync(struct obd_export *exp, struct obdo *oa,
 419                     struct lov_stripe_md *md, obd_size start, obd_size end)
 420 {
 421         struct ptlrpc_request *request;
 422         struct ost_body *body;
 423         int rc, size = sizeof(*body);
 424         ENTRY;
 425
 426         if (!oa) {
 427                 CERROR("oa NULL\n");
 428                 RETURN(-EINVAL);
 429         }
 430
 431         request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OBD_VERSION,
 432                                   OST_SYNC, 1, &size, NULL);
 433         if (!request)
 434                 RETURN(-ENOMEM);
 435
 436         body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
 437         memcpy(&body->oa, oa, sizeof(*oa));
 438
 439         /* overload the size and blocks fields in the oa with start/end */
 440         body->oa.o_size = start;
 441         body->oa.o_blocks = end;
 442         body->oa.o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
 443
 444         request->rq_replen = lustre_msg_size(1, &size);
 445
 446         rc = ptlrpc_queue_wait(request);
 447         if (rc)
 448                 GOTO(out, rc);
 449
 450         body = lustre_swab_repbuf(request, 0, sizeof(*body),
 451                                   lustre_swab_ost_body);
 452         if (body == NULL) {
 453                 CERROR ("can't unpack ost_body\n");
 454                 GOTO (out, rc = -EPROTO);
 455         }
 456
 457         memcpy(oa, &body->oa, sizeof(*oa));
 458
 459         EXIT;
 460  out:
 461         ptlrpc_req_finished(request);
 462         return rc;
 463 }
 464
 465 static int osc_destroy(struct obd_export *exp, struct obdo *oa,
 466                        struct lov_stripe_md *ea, struct obd_trans_info *oti)
 467 {
 468         struct ptlrpc_request *request;
 469         struct ost_body *body;
 470         int rc, size = sizeof(*body);
 471         ENTRY;
 472
 473         if (!oa) {
 474                 CERROR("oa NULL\n");
 475                 RETURN(-EINVAL);
 476         }
 477
 478         request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OBD_VERSION,
 479                                   OST_DESTROY, 1, &size, NULL);
 480         if (!request)
 481                 RETURN(-ENOMEM);
 482
 483         body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
 484
 485         if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE) {
 486                 memcpy(obdo_logcookie(oa), oti->oti_logcookies,
 487                        sizeof(*oti->oti_logcookies));
 488                 oti->oti_logcookies++;
 489         }
 490
 491         memcpy(&body->oa, oa, sizeof(*oa));
 492         request->rq_replen = lustre_msg_size(1, &size);
 493
 494         rc = ptlrpc_queue_wait(request);
 495
 496         if (rc == -ENOENT)
 497                 rc = 0;
 498         if (rc)
 499                 GOTO(out, rc);
 500
 501         body = lustre_swab_repbuf(request, 0, sizeof(*body),
 502                                   lustre_swab_ost_body);
 503         if (body == NULL) {
 504                 CERROR ("Can't unpack body\n");
 505                 GOTO (out, rc = -EPROTO);
 506         }
 507
 508         memcpy(oa, &body->oa, sizeof(*oa));
 509
 510         EXIT;
 511  out:
 512         ptlrpc_req_finished(request);
 513         return rc;
 514 }
 515
 516 static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
 517                                 long writing_bytes)
 518 {
 519         obd_valid bits = OBD_MD_FLBLOCKS|OBD_MD_FLGRANT;
 520
 521         LASSERT(!(oa->o_valid & bits));
 522
 523         oa->o_valid |= bits;
 524         spin_lock(&cli->cl_loi_list_lock);
 525         oa->o_dirty = cli->cl_dirty;
 526         oa->o_undirty = cli->cl_dirty_max - oa->o_dirty;
 527         oa->o_grant = cli->cl_avail_grant;
 528         oa->o_dropped = cli->cl_lost_grant;
 529         cli->cl_lost_grant = 0;
 530         spin_unlock(&cli->cl_loi_list_lock);
 531         CDEBUG(D_CACHE,"dirty: "LPU64" undirty: %u dropped %u grant: "LPU64"\n",
 532                oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant);
 533 }
 534
 535 /* caller must hold loi_list_lock */
 536 static void osc_consume_write_grant(struct client_obd *cli,
 537                                     struct osc_async_page *oap)
 538 {
 539         cli->cl_dirty += PAGE_SIZE;
 540         cli->cl_avail_grant -= PAGE_SIZE;
 541         oap->oap_brw_flags |= OBD_BRW_FROM_GRANT;
 542         CDEBUG(D_CACHE, "using %lu grant credits for oap %p\n", PAGE_SIZE, oap);
 543         LASSERT(cli->cl_avail_grant >= 0);
 544 }
 545
 546 static unsigned long rpcs_in_flight(struct client_obd *cli)
 547 {
 548         return cli->cl_r_in_flight + cli->cl_w_in_flight;
 549 }
 550
 551 /* caller must hold loi_list_lock */
 552 void osc_wake_cache_waiters(struct client_obd *cli)
 553 {
 554         struct list_head *l, *tmp;
 555         struct osc_cache_waiter *ocw;
 556
 557         list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
 558                 /* if we can't dirty more, we must wait until some is written */
 559                 if (cli->cl_dirty + PAGE_SIZE > cli->cl_dirty_max) {
 560                         CDEBUG(D_CACHE, "no dirty room: dirty: %ld max %ld\n",
 561                                cli->cl_dirty, cli->cl_dirty_max);
 562                         return;
 563                 }
 564
 565                 /* if still dirty cache but no grant wait for pending RPCs that
 566                  * may yet return us some grant before doing sync writes */
 567                 if (cli->cl_w_in_flight && cli->cl_avail_grant < PAGE_SIZE) {
 568                         CDEBUG(D_CACHE, "%u BRW writes in flight, no grant\n",
 569                                cli->cl_w_in_flight);
 570                 }
 571                 ocw = list_entry(l, struct osc_cache_waiter, ocw_entry);
 572                 list_del_init(&ocw->ocw_entry);
 573                 if (cli->cl_avail_grant < PAGE_SIZE) {
 574                         /* no more RPCs in flight to return grant, do sync IO */
 575                         ocw->ocw_rc = -EDQUOT;
 576                         CDEBUG(D_INODE, "wake oap %p for sync\n", ocw->ocw_oap);
 577                 } else {
 578                         osc_consume_write_grant(cli, ocw->ocw_oap);
 579                 }
 580
 581                 wake_up(&ocw->ocw_waitq);
 582         }
 583
 584         EXIT;
 585 }
 586
 587 static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
 588 {
 589         spin_lock(&cli->cl_loi_list_lock);
 590         CDEBUG(D_CACHE, "got "LPU64" extra grant\n", body->oa.o_grant);
 591         cli->cl_avail_grant += body->oa.o_grant;
 592         /* waiters are woken in brw_interpret_oap */
 593         spin_unlock(&cli->cl_loi_list_lock);
 594 }
 595
 596 /* We assume that the reason this OSC got a short read is because it read
 597  * beyond the end of a stripe file; i.e. lustre is reading a sparse file
 598  * via the LOV, and it _knows_ it's reading inside the file, it's just that
 599  * this stripe never got written at or beyond this stripe offset yet. */
 600 static void handle_short_read(int nob_read, obd_count page_count,
 601                               struct brw_page *pga)
 602 {
 603         char *ptr;
 604
 605         /* skip bytes read OK */
 606         while (nob_read > 0) {
 607                 LASSERT (page_count > 0);
 608
 609                 if (pga->count > nob_read) {
 610                         /* EOF inside this page */
 611                         ptr = kmap(pga->pg) + (pga->page_offset & ~PAGE_MASK);
 612                         memset(ptr + nob_read, 0, pga->count - nob_read);
 613                         kunmap(pga->pg);
 614                         page_count--;
 615                         pga++;
 616                         break;
 617                 }
 618
 619                 nob_read -= pga->count;
 620                 page_count--;
 621                 pga++;
 622         }
 623
 624         /* zero remaining pages */
 625         while (page_count-- > 0) {
 626                 ptr = kmap(pga->pg) + (pga->page_offset & ~PAGE_MASK);
 627                 memset(ptr, 0, pga->count);
 628                 kunmap(pga->pg);
 629                 pga++;
 630         }
 631 }
 632
 633 static int check_write_rcs(struct ptlrpc_request *request,
 634                            int requested_nob, int niocount,
 635                            obd_count page_count, struct brw_page *pga)
 636 {
 637         int    *remote_rcs, i;
 638
 639         /* return error if any niobuf was in error */
 640         remote_rcs = lustre_swab_repbuf(request, 1,
 641                                         sizeof(*remote_rcs) * niocount, NULL);
 642         if (remote_rcs == NULL) {
 643                 CERROR("Missing/short RC vector on BRW_WRITE reply\n");
 644                 return(-EPROTO);
 645         }
 646         if (lustre_msg_swabbed(request->rq_repmsg))
 647                 for (i = 0; i < niocount; i++)
 648                         __swab32s(&remote_rcs[i]);
 649
 650         for (i = 0; i < niocount; i++) {
 651                 if (remote_rcs[i] < 0)
 652                         return(remote_rcs[i]);
 653
 654                 if (remote_rcs[i] != 0) {
 655                         CERROR("rc[%d] invalid (%d) req %p\n",
 656                                 i, remote_rcs[i], request);
 657                         return(-EPROTO);
 658                 }
 659         }
 660
 661         if (request->rq_bulk->bd_nob_transferred != requested_nob) {
 662                 CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
 663                        requested_nob, request->rq_bulk->bd_nob_transferred);
 664                 return(-EPROTO);
 665         }
 666
 667         return (0);
 668 }
 669
 670 static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2)
 671 {
 672         if (p1->flag != p2->flag) {
 673                 unsigned mask = ~OBD_BRW_FROM_GRANT;
 674
 675                 /* warn if we try to combine flags that we don't know to be
 676                  * safe to combine */
 677                 if ((p1->flag & mask) != (p2->flag & mask))
 678                         CERROR("is it ok to have flags 0x%x and 0x%x in the "
 679                                "same brw?\n", p1->flag, p2->flag);
 680                 return 0;
 681         }
 682
 683         return (p1->disk_offset + p1->count == p2->disk_offset);
 684 }
 685
 686 #if CHECKSUM_BULK
 687 static obd_count cksum_pages(int nob, obd_count page_count,
 688                              struct brw_page *pga)
 689 {
 690         obd_count cksum = 0;
 691         char *ptr;
 692
 693         while (nob > 0) {
 694                 LASSERT (page_count > 0);
 695
 696                 ptr = kmap(pga->pg);
 697                 ost_checksum(&cksum, ptr + (pga->off & (PAGE_SIZE - 1)),
 698                              pga->count > nob ? nob : pga->count);
 699                 kunmap(pga->pg);
 700
 701                 nob -= pga->count;
 702                 page_count--;
 703                 pga++;
 704         }
 705
 706         return (cksum);
 707 }
 708 #endif
 709
 710 static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa,
 711                                 struct lov_stripe_md *lsm, obd_count page_count,
 712                                 struct brw_page *pga, int *requested_nobp,
 713                                 int *niocountp, struct ptlrpc_request **reqp)
 714 {
 715         struct ptlrpc_request   *req;
 716         struct ptlrpc_bulk_desc *desc;
 717         struct client_obd       *cli = &imp->imp_obd->u.cli;
 718         struct ost_body         *body;
 719         struct obd_ioobj        *ioobj;
 720         struct niobuf_remote    *niobuf;
 721         int                      niocount;
 722         int                      size[3];
 723         int                      i;
 724         int                      requested_nob;
 725         int                      opc;
 726         int                      rc;
 727
 728         opc = ((cmd & OBD_BRW_WRITE) != 0) ? OST_WRITE : OST_READ;
 729
 730         for (niocount = i = 1; i < page_count; i++)
 731                 if (!can_merge_pages(&pga[i - 1], &pga[i]))
 732                         niocount++;
 733
 734         size[0] = sizeof(*body);
 735         size[1] = sizeof(*ioobj);
 736         size[2] = niocount * sizeof(*niobuf);
 737
 738         req = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, opc, 3, size, NULL);
 739         if (req == NULL)
 740                 return (-ENOMEM);
 741
 742         if (opc == OST_WRITE)
 743                 desc = ptlrpc_prep_bulk_imp (req, page_count,
 744                                              BULK_GET_SOURCE, OST_BULK_PORTAL);
 745         else
 746                 desc = ptlrpc_prep_bulk_imp (req, page_count,
 747                                              BULK_PUT_SINK, OST_BULK_PORTAL);
 748         if (desc == NULL)
 749                 GOTO(out, rc = -ENOMEM);
 750         /* NB request now owns desc and will free it when it gets freed */
 751
 752         body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body));
 753         ioobj = lustre_msg_buf(req->rq_reqmsg, 1, sizeof(*ioobj));
 754         niobuf = lustre_msg_buf(req->rq_reqmsg, 2, niocount * sizeof(*niobuf));
 755
 756         memcpy(&body->oa, oa, sizeof(*oa));
 757
 758         obdo_to_ioobj(oa, ioobj);
 759         ioobj->ioo_bufcnt = niocount;
 760
 761         LASSERT (page_count > 0);
 762
 763         for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
 764                 struct brw_page *pg = &pga[i];
 765                 struct brw_page *pg_prev = pg - 1;
 766
 767                 LASSERT(pg->count > 0);
 768                 LASSERTF((pg->page_offset & ~PAGE_MASK)+ pg->count <= PAGE_SIZE,
 769                          "i: %d pg: %p pg_off: "LPU64", count: %u\n", i, pg,
 770                          pg->page_offset, pg->count);
 771                 LASSERTF(i == 0 || pg->disk_offset > pg_prev->disk_offset,
 772                          "i %d p_c %u pg %p [pri %lu ind %lu] off "LPU64
 773                          " prev_pg %p [pri %lu ind %lu] off "LPU64"\n",
 774                          i, page_count,
 775                          pg->pg, pg->pg->private, pg->pg->index, pg->disk_offset,
 776                          pg_prev->pg, pg_prev->pg->private, pg_prev->pg->index,
 777                          pg_prev->disk_offset);
 778
 779                 ptlrpc_prep_bulk_page(desc, pg->pg,
 780                                       pg->page_offset & ~PAGE_MASK, pg->count);
 781                 requested_nob += pg->count;
 782
 783                 if (i > 0 && can_merge_pages(pg_prev, pg)) {
 784                         niobuf--;
 785                         niobuf->len += pg->count;
 786                 } else {
 787                         niobuf->offset = pg->disk_offset;
 788                         niobuf->len    = pg->count;
 789                         niobuf->flags  = pg->flag;
 790                 }
 791         }
 792
 793         LASSERT((void *)(niobuf - niocount) ==
 794                 lustre_msg_buf(req->rq_reqmsg, 2, niocount * sizeof(*niobuf)));
 795         osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
 796
 797         /* size[0] still sizeof (*body) */
 798         if (opc == OST_WRITE) {
 799 #if CHECKSUM_BULK
 800                 body->oa.o_valid |= OBD_MD_FLCKSUM;
 801                 body->oa.o_cksum = cksum_pages(requested_nob, page_count, pga);
 802 #endif
 803                 /* 1 RC per niobuf */
 804                 size[1] = sizeof(__u32) * niocount;
 805                 req->rq_replen = lustre_msg_size(2, size);
 806         } else {
 807                 /* 1 RC for the whole I/O */
 808                 req->rq_replen = lustre_msg_size(1, size);
 809         }
 810
 811         *niocountp = niocount;
 812         *requested_nobp = requested_nob;
 813         *reqp = req;
 814         return (0);
 815
 816  out:
 817         ptlrpc_req_finished (req);
 818         return (rc);
 819 }
 820
 821 static int osc_brw_fini_request(struct ptlrpc_request *req, struct obdo *oa,
 822                                 int requested_nob, int niocount,
 823                                 obd_count page_count, struct brw_page *pga,
 824                                 int rc)
 825 {
 826         struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
 827         struct ost_body *body;
 828         ENTRY;
 829
 830         if (rc < 0)
 831                 RETURN(rc);
 832
 833         body = lustre_swab_repbuf(req, 0, sizeof(*body), lustre_swab_ost_body);
 834         if (body == NULL) {
 835                 CERROR ("Can't unpack body\n");
 836                 RETURN(-EPROTO);
 837         }
 838
 839         osc_update_grant(cli, body);
 840         memcpy(oa, &body->oa, sizeof(*oa));
 841
 842         if (req->rq_reqmsg->opc == OST_WRITE) {
 843                 if (rc > 0) {
 844                         CERROR ("Unexpected +ve rc %d\n", rc);
 845                         RETURN(-EPROTO);
 846                 }
 847                 LASSERT (req->rq_bulk->bd_nob == requested_nob);
 848
 849                 RETURN(check_write_rcs(req, requested_nob, niocount,
 850                                        page_count, pga));
 851         }
 852
 853         if (rc > requested_nob) {
 854                 CERROR("Unexpected rc %d (%d requested)\n", rc, requested_nob);
 855                 RETURN(-EPROTO);
 856         }
 857
 858         if (rc != req->rq_bulk->bd_nob_transferred) {
 859                 CERROR ("Unexpected rc %d (%d transferred)\n",
 860                         rc, req->rq_bulk->bd_nob_transferred);
 861                 return (-EPROTO);
 862         }
 863
 864         if (rc < requested_nob)
 865                 handle_short_read(rc, page_count, pga);
 866
 867 #if CHECKSUM_BULK
 868         if (oa->o_valid & OBD_MD_FLCKSUM) {
 869                 const struct ptlrpc_peer *peer =
 870                         &req->rq_import->imp_connection->c_peer;
 871                 static int cksum_counter;
 872                 obd_count server_cksum = oa->o_cksum;
 873                 obd_count cksum = cksum_pages(rc, page_count, pga);
 874                 char str[PTL_NALFMT_SIZE];
 875
 876                 ptlrpc_peernid2str(peer, str);
 877
 878                 cksum_counter++;
 879                 if (server_cksum != cksum) {
 880                         CERROR("Bad checksum: server %x, client %x, server NID "
 881                                LPX64" (%s)\n", server_cksum, cksum,
 882                                peer->peer_id.nid, str);
 883                         cksum_counter = 0;
 884                         oa->o_cksum = cksum;
 885                 } else if ((cksum_counter & (-cksum_counter)) == cksum_counter){
 886                         CWARN("Checksum %u from "LPX64" (%s) OK: %x\n",
 887                               cksum_counter, peer->peer_id.nid, str, cksum);
 888                 }
 889         } else {
 890                 static int cksum_missed;
 891
 892                 cksum_missed++;
 893                 if ((cksum_missed & (-cksum_missed)) == cksum_missed)
 894                         CERROR("Request checksum %u from "LPX64", no reply\n",
 895                                cksum_missed,
 896                                req->rq_import->imp_connection->c_peer.peer_id.nid);
 897         }
 898 #endif
 899         RETURN(0);
 900 }
 901
 902 static int osc_brw_internal(int cmd, struct obd_export *exp,struct obdo *oa,
 903                             struct lov_stripe_md *lsm,
 904                             obd_count page_count, struct brw_page *pga)
 905 {
 906         int                    requested_nob;
 907         int                    niocount;
 908         struct ptlrpc_request *request;
 909         int                    rc;
 910         ENTRY;
 911
 912 restart_bulk:
 913         rc = osc_brw_prep_request(cmd, class_exp2cliimp(exp), oa, lsm,
 914                                   page_count, pga, &requested_nob, &niocount,
 915                                   &request);
 916         if (rc != 0)
 917                 return (rc);
 918
 919         rc = ptlrpc_queue_wait(request);
 920
 921         if (rc == -ETIMEDOUT && request->rq_resend) {
 922                 DEBUG_REQ(D_HA, request,  "BULK TIMEOUT");
 923                 ptlrpc_req_finished(request);
 924                 goto restart_bulk;
 925         }
 926
 927         rc = osc_brw_fini_request(request, oa, requested_nob, niocount,
 928                                   page_count, pga, rc);
 929
 930         ptlrpc_req_finished(request);
 931         RETURN (rc);
 932 }
 933
 934 static int brw_interpret(struct ptlrpc_request *request,
 935                          struct osc_brw_async_args *aa, int rc)
 936 {
 937         struct obdo *oa      = aa->aa_oa;
 938         int requested_nob    = aa->aa_requested_nob;
 939         int niocount         = aa->aa_nio_count;
 940         obd_count page_count = aa->aa_page_count;
 941         struct brw_page *pga = aa->aa_pga;
 942         ENTRY;
 943
 944         rc = osc_brw_fini_request(request, oa, requested_nob, niocount,
 945                                   page_count, pga, rc);
 946         RETURN (rc);
 947 }
 948
 949 static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa,
 950                           struct lov_stripe_md *lsm, obd_count page_count,
 951                           struct brw_page *pga, struct ptlrpc_request_set *set)
 952 {
 953         struct ptlrpc_request     *request;
 954         int                        requested_nob;
 955         int                        nio_count;
 956         struct osc_brw_async_args *aa;
 957         int                        rc;
 958         ENTRY;
 959
 960         rc = osc_brw_prep_request(cmd, class_exp2cliimp(exp), oa, lsm,
 961                                   page_count, pga, &requested_nob, &nio_count,
 962                                   &request);
 963         if (rc == 0) {
 964                 LASSERT(sizeof(*aa) <= sizeof(request->rq_async_args));
 965                 aa = (struct osc_brw_async_args *)&request->rq_async_args;
 966                 aa->aa_oa = oa;
 967                 aa->aa_requested_nob = requested_nob;
 968                 aa->aa_nio_count = nio_count;
 969                 aa->aa_page_count = page_count;
 970                 aa->aa_pga = pga;
 971
 972                 request->rq_interpret_reply = brw_interpret;
 973                 ptlrpc_set_add_req(set, request);
 974         }
 975         RETURN (rc);
 976 }
 977
/* Fallback min_t() for builds whose headers have not already defined it.
 * Both arguments are first assigned to temporaries of the given type, so
 * each is evaluated once and the comparison is done in that type.  Relies on
 * the GCC statement-expression extension. */
#ifndef min_t
#define min_t(type,x,y) \
        ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
#endif
 982
 983 /*
 984  * ugh, we want disk allocation on the target to happen in offset order.  we'll
 985  * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do
 986  * fine for our small page arrays and doesn't require allocation.  its an
 987  * insertion sort that swaps elements that are strides apart, shrinking the
 988  * stride down until its '1' and the array is sorted.
 989  */
 990 static void sort_brw_pages(struct brw_page *array, int num)
 991 {
 992         int stride, i, j;
 993         struct brw_page tmp;
 994
 995         if (num == 1)
 996                 return;
 997         for (stride = 1; stride < num ; stride = (stride * 3) + 1)
 998                 ;
 999
1000         do {
1001                 stride /= 3;
1002                 for (i = stride ; i < num ; i++) {
1003                         tmp = array[i];
1004                         j = i;
1005                         while (j >= stride && array[j - stride].disk_offset >
1006                                 tmp.disk_offset) {
1007                                 array[j] = array[j - stride];
1008                                 j -= stride;
1009                         }
1010                         array[j] = tmp;
1011                 }
1012         } while (stride > 1);
1013 }
1014
1015 /* make sure we the regions we're passing to elan don't violate its '4
1016  * fragments' constraint.  portal headers are a fragment, all full
1017  * PAGE_SIZE long pages count as 1 fragment, and each partial page
1018  * counts as a fragment.  I think.  see bug 934. */
1019 static obd_count check_elan_limit(struct brw_page *pg, obd_count pages)
1020 {
1021         int frags_left = 3;
1022         int saw_whole_frag = 0;
1023         int i;
1024
1025         for (i = 0 ; frags_left && i < pages ; pg++, i++) {
1026                 if (pg->count == PAGE_SIZE) {
1027                         if (!saw_whole_frag) {
1028                                 saw_whole_frag = 1;
1029                                 frags_left--;
1030                         }
1031                 } else {
1032                         frags_left--;
1033                 }
1034         }
1035         return i;
1036 }
1037
1038 static int osc_brw(int cmd, struct obd_export *exp, struct obdo *oa,
1039                    struct lov_stripe_md *md, obd_count page_count,
1040                    struct brw_page *pga, struct obd_trans_info *oti)
1041 {
1042         ENTRY;
1043
1044         if (cmd == OBD_BRW_CHECK) {
1045                 /* The caller just wants to know if there's a chance that this
1046                  * I/O can succeed */
1047                 struct obd_import *imp = class_exp2cliimp(exp);
1048
1049                 if (imp == NULL || imp->imp_invalid)
1050                         RETURN(-EIO);
1051                 RETURN(0);
1052         }
1053
1054         while (page_count) {
1055                 obd_count pages_per_brw;
1056                 int rc;
1057
1058                 if (page_count > PTLRPC_MAX_BRW_PAGES)
1059                         pages_per_brw = PTLRPC_MAX_BRW_PAGES;
1060                 else
1061                         pages_per_brw = page_count;
1062
1063                 sort_brw_pages(pga, pages_per_brw);
1064                 pages_per_brw = check_elan_limit(pga, pages_per_brw);
1065
1066                 rc = osc_brw_internal(cmd, exp, oa, md, pages_per_brw, pga);
1067
1068                 if (rc != 0)
1069                         RETURN(rc);
1070
1071                 page_count -= pages_per_brw;
1072                 pga += pages_per_brw;
1073         }
1074         RETURN(0);
1075 }
1076
1077 static int osc_brw_async(int cmd, struct obd_export *exp, struct obdo *oa,
1078                          struct lov_stripe_md *md, obd_count page_count,
1079                          struct brw_page *pga, struct ptlrpc_request_set *set,
1080                          struct obd_trans_info *oti)
1081 {
1082         ENTRY;
1083
1084         if (cmd == OBD_BRW_CHECK) {
1085                 /* The caller just wants to know if there's a chance that this
1086                  * I/O can succeed */
1087                 struct obd_import *imp = class_exp2cliimp(exp);
1088
1089                 if (imp == NULL || imp->imp_invalid)
1090                         RETURN(-EIO);
1091                 RETURN(0);
1092         }
1093
1094         while (page_count) {
1095                 obd_count pages_per_brw;
1096                 int rc;
1097
1098                 if (page_count > PTLRPC_MAX_BRW_PAGES)
1099                         pages_per_brw = PTLRPC_MAX_BRW_PAGES;
1100                 else
1101                         pages_per_brw = page_count;
1102
1103                 sort_brw_pages(pga, pages_per_brw);
1104                 pages_per_brw = check_elan_limit(pga, pages_per_brw);
1105
1106                 rc = async_internal(cmd, exp, oa, md, pages_per_brw, pga, set);
1107
1108                 if (rc != 0)
1109                         RETURN(rc);
1110
1111                 page_count -= pages_per_brw;
1112                 pga += pages_per_brw;
1113         }
1114         RETURN(0);
1115 }
1116
/* Forward declarations of file-local helpers that the async page code
 * below calls before their definitions appear. */
static void osc_check_rpcs(struct client_obd *cli);
static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap,
                           int sent);
static void loi_list_maint(struct client_obd *cli, struct lov_oinfo *loi);
static void lop_update_pending(struct client_obd *cli,
                               struct loi_oap_pages *lop, int cmd, int delta);
1123
1124 /* this is called when a sync waiter receives an interruption.  Its job is to
1125  * get the caller woken as soon as possible.  If its page hasn't been put in an
1126  * rpc yet it can dequeue immediately.  Otherwise it has to mark the rpc as
1127  * desiring interruption which will forcefully complete the rpc once the rpc
1128  * has timed out */
1129 static void osc_occ_interrupted(struct oig_callback_context *occ)
1130 {
1131         struct osc_async_page *oap;
1132         struct loi_oap_pages *lop;
1133         struct lov_oinfo *loi;
1134         ENTRY;
1135
1136         /* XXX member_of() */
1137         oap = list_entry(occ, struct osc_async_page, oap_occ);
1138
1139         spin_lock(&oap->oap_cli->cl_loi_list_lock);
1140
1141         oap->oap_interrupted = 1;
1142
1143         /* ok, it's been put in an rpc. */
1144         if (oap->oap_request != NULL) {
1145                 ptlrpc_mark_interrupted(oap->oap_request);
1146                 ptlrpcd_wake(oap->oap_request);
1147                 GOTO(unlock, 0);
1148         }
1149
1150         /* we don't get interruption callbacks until osc_trigger_sync_io()
1151          * has been called and put the sync oaps in the pending/urgent lists.*/
1152         if (!list_empty(&oap->oap_pending_item)) {
1153                 list_del_init(&oap->oap_pending_item);
1154                 if (oap->oap_async_flags & ASYNC_URGENT)
1155                         list_del_init(&oap->oap_urgent_item);
1156
1157                 loi = oap->oap_loi;
1158                 lop = (oap->oap_cmd == OBD_BRW_WRITE) ?
1159                         &loi->loi_write_lop : &loi->loi_read_lop;
1160                 lop_update_pending(oap->oap_cli, lop, oap->oap_cmd, -1);
1161                 loi_list_maint(oap->oap_cli, oap->oap_loi);
1162
1163                 oig_complete_one(oap->oap_oig, &oap->oap_occ, 0);
1164                 oap->oap_oig = NULL;
1165         }
1166
1167 unlock:
1168         spin_unlock(&oap->oap_cli->cl_loi_list_lock);
1169 }
1170
1171 /* this must be called holding the loi list lock to give coverage to exit_cache,
1172  * async_flag maintenance, and oap_request */
1173 static void osc_ap_completion(struct client_obd *cli, struct obdo *oa,
1174                               struct osc_async_page *oap, int sent, int rc)
1175 {
1176         osc_exit_cache(cli, oap, sent);
1177         oap->oap_async_flags = 0;
1178         oap->oap_interrupted = 0;
1179
1180         if (oap->oap_request != NULL) {
1181                 ptlrpc_req_finished(oap->oap_request);
1182                 oap->oap_request = NULL;
1183         }
1184
1185         if (rc == 0 && oa != NULL)
1186                 oap->oap_loi->loi_blocks = oa->o_blocks;
1187
1188         if (oap->oap_oig) {
1189                 oig_complete_one(oap->oap_oig, &oap->oap_occ, rc);
1190                 oap->oap_oig = NULL;
1191                 EXIT;
1192                 return;
1193         }
1194
1195         oap->oap_caller_ops->ap_completion(oap->oap_caller_data, oap->oap_cmd,
1196                                            oa, rc);
1197 }
1198
1199 static int brw_interpret_oap(struct ptlrpc_request *request,
1200                              struct osc_brw_async_args *aa, int rc)
1201 {
1202         struct osc_async_page *oap;
1203         struct client_obd *cli;
1204         struct list_head *pos, *n;
1205         struct timeval now;
1206         ENTRY;
1207
1208         do_gettimeofday(&now);
1209         rc = osc_brw_fini_request(request, aa->aa_oa, aa->aa_requested_nob,
1210                                   aa->aa_nio_count, aa->aa_page_count,
1211                                   aa->aa_pga, rc);
1212
1213         CDEBUG(D_INODE, "request %p aa %p rc %d\n", request, aa, rc);
1214
1215         cli = aa->aa_cli;
1216         /* in failout recovery we ignore writeback failure and want
1217          * to just tell llite to unlock the page and continue */
1218         if (request->rq_reqmsg->opc == OST_WRITE &&
1219             (cli->cl_import == NULL || cli->cl_import->imp_invalid)) {
1220                 CDEBUG(D_INODE, "flipping to rc 0 imp %p inv %d\n",
1221                        cli->cl_import,
1222                        cli->cl_import ? cli->cl_import->imp_invalid : -1);
1223                 rc = 0;
1224         }
1225
1226         spin_lock(&cli->cl_loi_list_lock);
1227
1228         if (request->rq_reqmsg->opc == OST_WRITE)
1229                 lprocfs_stime_record(&cli->cl_write_stime, &now,
1230                                      &request->rq_rpcd_start);
1231         else
1232                 lprocfs_stime_record(&cli->cl_read_stime, &now,
1233                                      &request->rq_rpcd_start);
1234
1235
1236
1237         /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters
1238          * is called so we know whether to go to sync BRWs or wait for more
1239          * RPCs to complete */
1240         if (request->rq_reqmsg->opc == OST_WRITE)
1241                 cli->cl_w_in_flight--;
1242         else
1243                 cli->cl_r_in_flight--;
1244
1245         /* the caller may re-use the oap after the completion call so
1246          * we need to clean it up a little */
1247         list_for_each_safe(pos, n, &aa->aa_oaps) {
1248                 oap = list_entry(pos, struct osc_async_page, oap_rpc_item);
1249
1250                 //CDEBUG(D_INODE, "page %p index %lu oap %p\n",
1251                        //oap->oap_page, oap->oap_page->index, oap);
1252
1253                 list_del_init(&oap->oap_rpc_item);
1254                 osc_ap_completion(cli, aa->aa_oa, oap, 1, rc);
1255         }
1256
1257         osc_wake_cache_waiters(cli);
1258         osc_check_rpcs(cli);
1259
1260         spin_unlock(&cli->cl_loi_list_lock);
1261
1262         obdo_free(aa->aa_oa);
1263         OBD_FREE(aa->aa_pga, aa->aa_page_count * sizeof(struct brw_page));
1264
1265         RETURN(0);
1266 }
1267
1268 static struct ptlrpc_request *osc_build_req(struct client_obd *cli,
1269                                             struct list_head *rpc_list,
1270                                             int page_count, int cmd)
1271 {
1272         struct ptlrpc_request *req;
1273         struct brw_page *pga = NULL;
1274         int requested_nob, nio_count;
1275         struct osc_brw_async_args *aa;
1276         struct obdo *oa = NULL;
1277         struct obd_async_page_ops *ops = NULL;
1278         void *caller_data = NULL;
1279         struct list_head *pos;
1280         int i, rc;
1281
1282         LASSERT(!list_empty(rpc_list));
1283
1284         OBD_ALLOC(pga, sizeof(*pga) * page_count);
1285         if (pga == NULL)
1286                 RETURN(ERR_PTR(-ENOMEM));
1287
1288         oa = obdo_alloc();
1289         if (oa == NULL)
1290                 GOTO(out, req = ERR_PTR(-ENOMEM));
1291
1292         i = 0;
1293         list_for_each(pos, rpc_list) {
1294                 struct osc_async_page *oap;
1295
1296                 oap = list_entry(pos, struct osc_async_page, oap_rpc_item);
1297                 if (ops == NULL) {
1298                         ops = oap->oap_caller_ops;
1299                         caller_data = oap->oap_caller_data;
1300                 }
1301                 pga[i].disk_offset = oap->oap_obj_off + oap->oap_page_off;
1302                 pga[i].page_offset = pga[i].disk_offset;
1303                 pga[i].pg = oap->oap_page;
1304                 pga[i].count = oap->oap_count;
1305                 pga[i].flag = oap->oap_brw_flags;
1306                 CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n",
1307                        pga[i].pg, oap->oap_page->index, oap, pga[i].flag);
1308                 i++;
1309         }
1310
1311         /* always get the data for the obdo for the rpc */
1312         LASSERT(ops != NULL);
1313         ops->ap_fill_obdo(caller_data, cmd, oa);
1314
1315         sort_brw_pages(pga, page_count);
1316         rc = osc_brw_prep_request(cmd, cli->cl_import, oa, NULL, page_count,
1317                                   pga, &requested_nob, &nio_count, &req);
1318         if (rc != 0) {
1319                 CERROR("prep_req failed: %d\n", rc);
1320                 GOTO(out, req = ERR_PTR(rc));
1321         }
1322
1323         LASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
1324         aa = (struct osc_brw_async_args *)&req->rq_async_args;
1325         aa->aa_oa = oa;
1326         aa->aa_requested_nob = requested_nob;
1327         aa->aa_nio_count = nio_count;
1328         aa->aa_page_count = page_count;
1329         aa->aa_pga = pga;
1330         aa->aa_cli = cli;
1331
1332 out:
1333         if (IS_ERR(req)) {
1334                 if (oa)
1335                         obdo_free(oa);
1336                 if (pga)
1337                         OBD_FREE(pga, sizeof(*pga) * page_count);
1338         }
1339         RETURN(req);
1340 }
1341
1342 static void lop_update_pending(struct client_obd *cli,
1343                                struct loi_oap_pages *lop, int cmd, int delta)
1344 {
1345         lop->lop_num_pending += delta;
1346         if (cmd == OBD_BRW_WRITE)
1347                 cli->cl_pending_w_pages += delta;
1348         else
1349                 cli->cl_pending_r_pages += delta;
1350 }
1351
/* Gather pending pages of one object into a single rpc and hand it to
 * ptlrpcd.
 *
 * The loi lock is held across this function but it's allowed to release
 * and reacquire it during its work; the lock is dropped while the request is
 * built and is held again on every return path that dropped it.
 *
 * \param cli  client whose lists and counters are updated
 * \param loi  object the pages belong to
 * \param cmd  OBD_BRW_READ or OBD_BRW_WRITE
 * \param lop  the object's read or write page lists matching \a cmd
 *
 * \retval 1    an rpc was built and queued
 * \retval 0    no page could be put in an rpc
 * \retval < 0  building the request failed; the pages were put back on the
 *              pending list (or completed if they had been interrupted)
 */
static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
                            int cmd, struct loi_oap_pages *lop)
{
        struct ptlrpc_request *request;
        obd_count page_count = 0;
        struct list_head *tmp, *pos;
        struct osc_async_page *oap = NULL;
        struct osc_brw_async_args *aa;
        struct obd_async_page_ops *ops;
        LIST_HEAD(rpc_list);
        ENTRY;

        /* first we find the pages we're allowed to work with */
        list_for_each_safe(pos, tmp, &lop->lop_pending) {
                oap = list_entry(pos, struct osc_async_page, oap_pending_item);
                ops = oap->oap_caller_ops;

                LASSERT(oap->oap_magic == OAP_MAGIC);

                /* in llite being 'ready' equates to the page being locked
                 * until completion unlocks it.  commit_write submits a page
                 * as not ready because its unlock will happen unconditionally
                 * as the call returns.  if we race with commit_write giving
                 * us that page we don't want to create a hole in the page
                 * stream, so we stop and leave the rpc to be fired by
                 * another dirtier or kupdated interval (the not ready page
                 * will still be on the dirty list).  we could call in
                 * at the end of ll_file_write to process the queue again. */
                if (!(oap->oap_async_flags & ASYNC_READY)) {
                        int rc = ops->ap_make_ready(oap->oap_caller_data, cmd);
                        if (rc < 0)
                                CDEBUG(D_INODE, "oap %p page %p returned %d "
                                                "instead of ready\n", oap,
                                                oap->oap_page, rc);
                        switch (rc) {
                        case -EAGAIN:
                                /* llite is telling us that the page is still
                                 * in commit_write and that we should try
                                 * and put it in an rpc again later.  we
                                 * break out of the loop so we don't create
                                 * a hole in the sequence of pages in the rpc
                                 * stream.*/
                                pos = NULL;
                                break;
                        case -EINTR:
                                /* the io isn't needed.. tell the checks
                                 * below to complete the rpc with EINTR */
                                oap->oap_async_flags |= ASYNC_COUNT_STABLE;
                                oap->oap_count = -EINTR;
                                break;
                        case 0:
                                oap->oap_async_flags |= ASYNC_READY;
                                break;
                        default:
                                LASSERTF(0, "oap %p page %p returned %d "
                                            "from make_ready\n", oap,
                                            oap->oap_page, rc);
                                break;
                        }
                }
                /* pos was cleared by the -EAGAIN case above: stop gathering */
                if (pos == NULL)
                        break;

                /* take the page out of our book-keeping */
                list_del_init(&oap->oap_pending_item);
                lop_update_pending(cli, lop, cmd, -1);
                list_del_init(&oap->oap_urgent_item);

                /* ask the caller for the size of the io as the rpc leaves. */
                if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE))
                        oap->oap_count =
                                ops->ap_refresh_count(oap->oap_caller_data,cmd);
                /* a zero or negative count (including the -EINTR set above)
                 * means there is nothing to send for this page; complete it
                 * right away with the count as its status */
                if (oap->oap_count <= 0) {
                        CDEBUG(D_CACHE, "oap %p count %d, completing\n", oap,
                               oap->oap_count);
                        osc_ap_completion(cli, NULL, oap, 0, oap->oap_count);
                        continue;
                }

                /* now put the page back in our accounting */
                list_add_tail(&oap->oap_rpc_item, &rpc_list);
                if (++page_count >= cli->cl_max_pages_per_rpc)
                        break;
        }

        osc_wake_cache_waiters(cli);

        if (page_count == 0)
                RETURN(0);

        /* pages left the pending list, so refresh the object's list
         * membership before the lock is dropped to build the request */
        loi_list_maint(cli, loi);
        spin_unlock(&cli->cl_loi_list_lock);

        request = osc_build_req(cli, &rpc_list, page_count, cmd);
        if (IS_ERR(request)) {
                /* this should happen rarely and is pretty bad, it makes the
                 * pending list not follow the dirty order */
                spin_lock(&cli->cl_loi_list_lock);
                list_for_each_safe(pos, tmp, &rpc_list) {
                        oap = list_entry(pos, struct osc_async_page,
                                         oap_rpc_item);
                        list_del_init(&oap->oap_rpc_item);

                        /* queued sync pages can be torn down while the pages
                         * were between the pending list and the rpc */
                        if (oap->oap_interrupted) {
                                CDEBUG(D_INODE, "oap %p interrupted\n", oap);
                                osc_ap_completion(cli, NULL, oap, 0,
                                                  oap->oap_count);
                                continue;
                        }

                        /* put the page back in the loi/lop lists */
                        list_add_tail(&oap->oap_pending_item,
                                      &lop->lop_pending);
                        lop_update_pending(cli, lop, cmd, 1);
                        if (oap->oap_async_flags & ASYNC_URGENT)
                                list_add(&oap->oap_urgent_item,
                                         &lop->lop_urgent);
                }
                loi_list_maint(cli, loi);
                RETURN(PTR_ERR(request));
        }

        /* move the gathered pages onto the request's own list so the reply
         * interpreter can find them */
        LASSERT(sizeof(*aa) <= sizeof(request->rq_async_args));
        aa = (struct osc_brw_async_args *)&request->rq_async_args;
        INIT_LIST_HEAD(&aa->aa_oaps);
        list_splice(&rpc_list, &aa->aa_oaps);
        INIT_LIST_HEAD(&rpc_list);

#ifdef __KERNEL__
        /* NOTE(review): these tallies read the in-flight counters while the
         * loi list lock is not held -- confirm that is acceptable */
        if (cmd == OBD_BRW_READ) {
                lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
                lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
        } else {
                lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
                lprocfs_oh_tally(&cli->cl_write_rpc_hist,
                                 cli->cl_w_in_flight);
        }
#endif

        spin_lock(&cli->cl_loi_list_lock);

        if (cmd == OBD_BRW_READ)
                cli->cl_r_in_flight++;
        else
                cli->cl_w_in_flight++;
        /* queued sync pages can be torn down while the pages
         * were between the pending list and the rpc */
        list_for_each(pos, &aa->aa_oaps) {
                oap = list_entry(pos, struct osc_async_page, oap_rpc_item);
                if (oap->oap_interrupted) {
                        CDEBUG(D_INODE, "oap %p in req %p interrupted\n",
                                oap, request);
                        ptlrpc_mark_interrupted(request);
                        break;
                }
        }

        CDEBUG(D_INODE, "req %p: %d pages, aa %p.  now %dr/%dw in flight\n",
                        request, page_count, aa, cli->cl_r_in_flight,
                        cli->cl_w_in_flight);

        /* NOTE(review): oap is whatever the loop above stopped on -- the
         * interrupted page, or the last page on the list -- so exactly one
         * page of the rpc carries the request reference */
        oap->oap_request = ptlrpc_request_addref(request);
        request->rq_interpret_reply = brw_interpret_oap;
        ptlrpcd_add_req(request);
        RETURN(1);
}
1522
1523 static int lop_makes_rpc(struct client_obd *cli, struct loi_oap_pages *lop,
1524                          int cmd)
1525 {
1526         int optimal;
1527         ENTRY;
1528
1529         if (lop->lop_num_pending == 0)
1530                 RETURN(0);
1531
1532         /* if we have an invalid import we want to drain the queued pages
1533          * by forcing them through rpcs that immediately fail and complete
1534          * the pages.  recovery relies on this to empty the queued pages
1535          * before canceling the locks and evicting down the llite pages */
1536         if (cli->cl_import == NULL || cli->cl_import->imp_invalid)
1537                 RETURN(1);
1538
1539         /* stream rpcs in queue order as long as as there is an urgent page
1540          * queued.  this is our cheap solution for good batching in the case
1541          * where writepage marks some random page in the middle of the file as
1542          * urgent because of, say, memory pressure */
1543         if (!list_empty(&lop->lop_urgent))
1544                 RETURN(1);
1545
1546         /* fire off rpcs when we have 'optimal' rpcs as tuned for the wire. */
1547         optimal = cli->cl_max_pages_per_rpc;
1548         if (cmd == OBD_BRW_WRITE) {
1549                 /* trigger a write rpc stream as long as there are dirtiers
1550                  * waiting for space.  as they're waiting, they're not going to
1551                  * create more pages to coallesce with what's waiting.. */
1552                 if (!list_empty(&cli->cl_cache_waiters))
1553                         RETURN(1);
1554
1555                 /* *2 to avoid triggering rpcs that would want to include pages
1556                  * that are being queued but which can't be made ready until
1557                  * the queuer finishes with the page. this is a wart for
1558                  * llite::commit_write() */
1559                 optimal += 16;
1560         }
1561         if (lop->lop_num_pending >= optimal)
1562                 RETURN(1);
1563
1564         RETURN(0);
1565 }
1566
/* Make \a item's membership of \a list match \a should_be_on: link it at
 * the tail if it should be on the list but is unlinked, unlink it if it
 * should not be on the list but is linked, and otherwise leave it alone. */
static void on_list(struct list_head *item, struct list_head *list,
                    int should_be_on)
{
        int linked = !list_empty(item);

        if (should_be_on) {
                if (!linked)
                        list_add_tail(item, list);
        } else if (linked) {
                list_del_init(item);
        }
}
1575
1576 /* maintain the loi's cli list membership invariants so that osc_send_oap_rpc
1577  * can find pages to build into rpcs quickly */
1578 static void loi_list_maint(struct client_obd *cli, struct lov_oinfo *loi)
1579 {
1580         on_list(&loi->loi_cli_item, &cli->cl_loi_ready_list,
1581                 lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE) ||
1582                 lop_makes_rpc(cli, &loi->loi_read_lop, OBD_BRW_READ));
1583
1584         on_list(&loi->loi_write_item, &cli->cl_loi_write_list,
1585                 loi->loi_write_lop.lop_num_pending);
1586
1587         on_list(&loi->loi_read_item, &cli->cl_loi_read_list,
1588                 loi->loi_read_lop.lop_num_pending);
1589 }
1590
/* Emit a D_INODE debug message describing the queue state of @LOI, followed
 * by the caller's own format string @STR and its arguments.
 *
 * The fixed prefix prints, in order: whether the object is linked through
 * loi_cli_item, the write queue's pending count and whether its urgent list
 * is non-empty, then the same pair for the read queue.
 *
 * Because "args" follows a comma in the expansion, callers have to pass at
 * least one argument after @STR.
 *
 * The definition deliberately ends without a trailing line continuation so
 * that it never absorbs whatever line happens to follow it. */
#define LOI_DEBUG(LOI, STR, args...)                                     \
        CDEBUG(D_INODE, "loi ready %d wr %d:%d rd %d:%d " STR,           \
               !list_empty(&(LOI)->loi_cli_item),                        \
               (LOI)->loi_write_lop.lop_num_pending,                     \
               !list_empty(&(LOI)->loi_write_lop.lop_urgent),            \
               (LOI)->loi_read_lop.lop_num_pending,                      \
               !list_empty(&(LOI)->loi_read_lop.lop_urgent),             \
               args)
1599
1600 struct lov_oinfo *osc_next_loi(struct client_obd *cli)
1601 {
1602         ENTRY;
1603         /* first return all objects which we already know to have
1604          * pages ready to be stuffed into rpcs */
1605         if (!list_empty(&cli->cl_loi_ready_list))
1606                 RETURN(list_entry(cli->cl_loi_ready_list.next,
1607                                   struct lov_oinfo, loi_cli_item));
1608
1609         /* then if we have cache waiters, return all objects with queued
1610          * writes.  This is especially important when many small files
1611          * have filled up the cache and not been fired into rpcs because
1612          * they don't pass the nr_pending/object threshhold */
1613         if (!list_empty(&cli->cl_cache_waiters) &&
1614             !list_empty(&cli->cl_loi_write_list))
1615                 RETURN(list_entry(cli->cl_loi_write_list.next,
1616                                   struct lov_oinfo, loi_write_item));
1617
1618         /* then return all queued objects when we have an invalid import
1619          * so that they get flushed */
1620         if (cli->cl_import == NULL || cli->cl_import->imp_invalid) {
1621                 if (!list_empty(&cli->cl_loi_write_list))
1622                         RETURN(list_entry(cli->cl_loi_write_list.next,
1623                                           struct lov_oinfo, loi_write_item));
1624                 if (!list_empty(&cli->cl_loi_read_list))
1625                         RETURN(list_entry(cli->cl_loi_read_list.next,
1626                                           struct lov_oinfo, loi_read_item));
1627         }
1628         RETURN(NULL);
1629 }
1630
1631 /* called with the loi list lock held */
1632 static void osc_check_rpcs(struct client_obd *cli)
1633 {
1634         struct lov_oinfo *loi;
1635         int rc = 0, race_counter = 0;
1636         ENTRY;
1637
1638         while ((loi = osc_next_loi(cli)) != NULL) {
1639                 LOI_DEBUG(loi, "%lu in flight\n", rpcs_in_flight(cli));
1640
1641                 if (rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight)
1642                         break;
1643
1644                 /* attempt some read/write balancing by alternating between
1645                  * reads and writes in an object.  The makes_rpc checks here
1646                  * would be redundant if we were getting read/write work items
1647                  * instead of objects.  we don't want send_oap_rpc to drain a
1648                  * partial read pending queue when we're given this object to
1649                  * do io on writes while there are cache waiters */
1650                 if (lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE)) {
1651                         rc = osc_send_oap_rpc(cli, loi, OBD_BRW_WRITE,
1652                                               &loi->loi_write_lop);
1653                         if (rc < 0)
1654                                 break;
1655                         if (rc > 0)
1656                                 race_counter = 0;
1657                         else
1658                                 race_counter++;
1659                 }
1660                 if (lop_makes_rpc(cli, &loi->loi_read_lop, OBD_BRW_READ)) {
1661                         rc = osc_send_oap_rpc(cli, loi, OBD_BRW_READ,
1662                                               &loi->loi_read_lop);
1663                         if (rc < 0)
1664                                 break;
1665                         if (rc > 0)
1666                                 race_counter = 0;
1667                         else
1668                                 race_counter++;
1669                 }
1670
1671                 /* attempt some inter-object balancing by issueing rpcs
1672                  * for each object in turn */
1673                 if (!list_empty(&loi->loi_cli_item))
1674                         list_del_init(&loi->loi_cli_item);
1675                 if (!list_empty(&loi->loi_write_item))
1676                         list_del_init(&loi->loi_write_item);
1677                 if (!list_empty(&loi->loi_read_item))
1678                         list_del_init(&loi->loi_read_item);
1679
1680                 loi_list_maint(cli, loi);
1681
1682                 /* send_oap_rpc fails with 0 when make_ready tells it to
1683                  * back off.  llite's make_ready does this when it tries
1684                  * to lock a page queued for write that is already locked.
1685                  * we want to try sending rpcs from many objects, but we
1686                  * don't want to spin failing with 0.  */
1687                 if (race_counter == 10)
1688                         break;
1689         }
1690         EXIT;
1691 }
1692
1693 /* we're trying to queue a page in the osc so we're subject to the
1694  * 'cl_dirty_max' limit on the number of pages that can be queued in the osc.
1695  * If the osc's queued pages are already at that limit, then we want to sleep
1696  * until there is space in the osc's queue for us.  We also may be waiting for
1697  * write credits from the OST if there are RPCs in flight that may return some
1698  * before we fall back to sync writes.
1699  *
1700  * We need this know our allocation was granted in the presence of signals */
1701 static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw)
1702 {
1703         int rc;
1704         ENTRY;
1705         spin_lock(&cli->cl_loi_list_lock);
1706         rc = list_empty(&ocw->ocw_entry) || rpcs_in_flight(cli) == 0;
1707         spin_unlock(&cli->cl_loi_list_lock);
1708         RETURN(rc);
1709 };
1710
1711 /* Caller must hold loi_list_lock - we drop/regain it if we need to wait for
1712  * grant or cache space. */
1713 static int osc_enter_cache(struct client_obd *cli, struct lov_oinfo *loi,
1714                            struct osc_async_page *oap)
1715 {
1716         struct osc_cache_waiter ocw;
1717         struct l_wait_info lwi = { 0 };
1718         struct timeval start, stop;
1719
1720         CDEBUG(D_CACHE, "dirty: %ld dirty_max: %ld dropped: %lu grant: %lu\n",
1721                cli->cl_dirty, cli->cl_dirty_max, cli->cl_lost_grant,
1722                cli->cl_avail_grant);
1723
1724         if (cli->cl_dirty_max < PAGE_SIZE)
1725                 return(-EDQUOT);
1726
1727         /* Hopefully normal case - cache space and write credits available */
1728         if (cli->cl_dirty + PAGE_SIZE <= cli->cl_dirty_max &&
1729             cli->cl_avail_grant >= PAGE_SIZE) {
1730                 /* account for ourselves */
1731                 osc_consume_write_grant(cli, oap);
1732                 return(0);
1733         }
1734
1735         /* Make sure that there are write rpcs in flight to wait for.  This
1736          * is a little silly as this object may not have any pending but
1737          * other objects sure might. */
1738         if (cli->cl_w_in_flight) {
1739                 list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters);
1740                 init_waitqueue_head(&ocw.ocw_waitq);
1741                 ocw.ocw_oap = oap;
1742                 ocw.ocw_rc = 0;
1743
1744                 loi_list_maint(cli, loi);
1745                 osc_check_rpcs(cli);
1746                 spin_unlock(&cli->cl_loi_list_lock);
1747
1748                 CDEBUG(0, "sleeping for cache space\n");
1749                 do_gettimeofday(&start);
1750                 l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi);
1751                 do_gettimeofday(&stop);
1752                 spin_lock(&cli->cl_loi_list_lock);
1753                 lprocfs_stime_record(&cli->cl_enter_stime, &stop, &start);
1754                 if (!list_empty(&ocw.ocw_entry)) {
1755                         list_del(&ocw.ocw_entry);
1756                         RETURN(-EINTR);
1757                 }
1758                 RETURN(ocw.ocw_rc);
1759         }
1760
1761         RETURN(-EDQUOT);
1762 }
1763
1764 /* the companion to enter_cache, called when an oap is no longer part of the
1765  * dirty accounting.. so writeback completes or truncate happens before writing
1766  * starts.  must be called with the loi lock held. */
1767 static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap,
1768                            int sent)
1769 {
1770         ENTRY;
1771
1772         if (!(oap->oap_brw_flags & OBD_BRW_FROM_GRANT)) {
1773                 EXIT;
1774                 return;
1775         }
1776
1777         oap->oap_brw_flags &= ~OBD_BRW_FROM_GRANT;
1778         cli->cl_dirty -= PAGE_SIZE;
1779         if (!sent) {
1780                 cli->cl_lost_grant += PAGE_SIZE;
1781                 CDEBUG(D_CACHE, "lost grant: %lu avail grant: %lu dirty: %lu\n",
1782                        cli->cl_lost_grant, cli->cl_avail_grant, cli->cl_dirty);
1783         }
1784
1785         EXIT;
1786 }
1787
1788 int osc_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm,
1789                         struct lov_oinfo *loi, struct page *page,
1790                         obd_off offset, struct obd_async_page_ops *ops,
1791                         void *data, void **res)
1792 {
1793         struct osc_async_page *oap;
1794         ENTRY;
1795
1796         OBD_ALLOC(oap, sizeof(*oap));
1797         if (oap == NULL)
1798                 return -ENOMEM;
1799
1800         oap->oap_magic = OAP_MAGIC;
1801         oap->oap_cli = &exp->exp_obd->u.cli;
1802         oap->oap_loi = loi;
1803
1804         oap->oap_caller_ops = ops;
1805         oap->oap_caller_data = data;
1806
1807         oap->oap_page = page;
1808         oap->oap_obj_off = offset;
1809
1810         INIT_LIST_HEAD(&oap->oap_pending_item);
1811         INIT_LIST_HEAD(&oap->oap_urgent_item);
1812         INIT_LIST_HEAD(&oap->oap_rpc_item);
1813
1814         oap->oap_occ.occ_interrupted = osc_occ_interrupted;
1815
1816         CDEBUG(D_CACHE, "oap %p page %p obj off "LPU64"\n", oap, page, offset);
1817         *res = oap;
1818         RETURN(0);
1819 }
1820
1821 struct osc_async_page *oap_from_cookie(void *cookie)
1822 {
1823         struct osc_async_page *oap = cookie;
1824         if (oap->oap_magic != OAP_MAGIC)
1825                 return ERR_PTR(-EINVAL);
1826         return oap;
1827 };
1828
1829 static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm,
1830                               struct lov_oinfo *loi, void *cookie,
1831                               int cmd, obd_off off, int count,
1832                               obd_flags brw_flags, enum async_flags async_flags)
1833 {
1834         struct client_obd *cli = &exp->exp_obd->u.cli;
1835         struct osc_async_page *oap;
1836         struct loi_oap_pages *lop;
1837         int rc;
1838         ENTRY;
1839
1840         oap = oap_from_cookie(cookie);
1841         if (IS_ERR(oap))
1842                 RETURN(PTR_ERR(oap));
1843
1844         if (cli->cl_import == NULL || cli->cl_import->imp_invalid)
1845                 RETURN(-EIO);
1846
1847         if (!list_empty(&oap->oap_pending_item) ||
1848             !list_empty(&oap->oap_urgent_item) ||
1849             !list_empty(&oap->oap_rpc_item))
1850                 RETURN(-EBUSY);
1851
1852         if (loi == NULL)
1853                 loi = &lsm->lsm_oinfo[0];
1854
1855         spin_lock(&cli->cl_loi_list_lock);
1856
1857         oap->oap_cmd = cmd;
1858         oap->oap_async_flags = async_flags;
1859         oap->oap_page_off = off;
1860         oap->oap_count = count;
1861         oap->oap_brw_flags = brw_flags;
1862
1863         if (cmd == OBD_BRW_WRITE) {
1864                 rc = osc_enter_cache(cli, loi, oap);
1865                 if (rc) {
1866                         spin_unlock(&cli->cl_loi_list_lock);
1867                         RETURN(rc);
1868                 }
1869                 lop = &loi->loi_write_lop;
1870         } else {
1871                 lop = &loi->loi_read_lop;
1872         }
1873
1874         if (oap->oap_async_flags & ASYNC_URGENT)
1875                 list_add(&oap->oap_urgent_item, &lop->lop_urgent);
1876         list_add_tail(&oap->oap_pending_item, &lop->lop_pending);
1877         lop_update_pending(cli, lop, cmd, 1);
1878
1879         loi_list_maint(cli, loi);
1880
1881         LOI_DEBUG(loi, "oap %p page %p added for cmd %d\n", oap, oap->oap_page,
1882                   cmd);
1883
1884         osc_check_rpcs(cli);
1885         spin_unlock(&cli->cl_loi_list_lock);
1886
1887         RETURN(0);
1888 }
1889
/* True (1) when @flag is clear in @was and set in @now, i.e. when going from
 * @was to @now turns the flag on; 0 otherwise.
 *
 * aka (~was & now & flag), but this is more clear :)
 *
 * Every argument is parenthesised so that compound expressions such as
 * "A | B" can be passed safely. */
#define SETTING(was, now, flag) (!((was) & (flag)) && ((now) & (flag)))
1892
1893 static int osc_set_async_flags(struct obd_export *exp,
1894                                struct lov_stripe_md *lsm,
1895                                struct lov_oinfo *loi, void *cookie,
1896                                obd_flags async_flags)
1897 {
1898         struct client_obd *cli = &exp->exp_obd->u.cli;
1899         struct loi_oap_pages *lop;
1900         struct osc_async_page *oap;
1901         int rc = 0;
1902         ENTRY;
1903
1904         oap = oap_from_cookie(cookie);
1905         if (IS_ERR(oap))
1906                 RETURN(PTR_ERR(oap));
1907
1908         if (cli->cl_import == NULL || cli->cl_import->imp_invalid)
1909                 RETURN(-EIO);
1910
1911         if (loi == NULL)
1912                 loi = &lsm->lsm_oinfo[0];
1913
1914         if (oap->oap_cmd == OBD_BRW_WRITE) {
1915                 lop = &loi->loi_write_lop;
1916         } else {
1917                 lop = &loi->loi_read_lop;
1918         }
1919
1920         spin_lock(&cli->cl_loi_list_lock);
1921
1922         if (list_empty(&oap->oap_pending_item))
1923                 GOTO(out, rc = -EINVAL);
1924
1925         if ((oap->oap_async_flags & async_flags) == async_flags)
1926                 GOTO(out, rc = 0);
1927
1928         if (SETTING(oap->oap_async_flags, async_flags, ASYNC_READY))
1929                 oap->oap_async_flags |= ASYNC_READY;
1930
1931         if (SETTING(oap->oap_async_flags, async_flags, ASYNC_URGENT)) {
1932                 if (list_empty(&oap->oap_rpc_item)) {
1933                         list_add(&oap->oap_urgent_item, &lop->lop_urgent);
1934                         loi_list_maint(cli, loi);
1935                 }
1936         }
1937
1938         LOI_DEBUG(loi, "oap %p page %p has flags %x\n", oap, oap->oap_page,
1939                         oap->oap_async_flags);
1940 out:
1941         osc_check_rpcs(cli);
1942         spin_unlock(&cli->cl_loi_list_lock);
1943         RETURN(rc);
1944 }
1945
1946 static int osc_queue_group_io(struct obd_export *exp, struct lov_stripe_md *lsm,
1947                              struct lov_oinfo *loi,
1948                              struct obd_io_group *oig, void *cookie,
1949                              int cmd, obd_off off, int count,
1950                              obd_flags brw_flags,
1951                              obd_flags async_flags)
1952 {
1953         struct client_obd *cli = &exp->exp_obd->u.cli;
1954         struct osc_async_page *oap;
1955         struct loi_oap_pages *lop;
1956         ENTRY;
1957
1958         oap = oap_from_cookie(cookie);
1959         if (IS_ERR(oap))
1960                 RETURN(PTR_ERR(oap));
1961
1962         if (cli->cl_import == NULL || cli->cl_import->imp_invalid)
1963                 RETURN(-EIO);
1964
1965         if (!list_empty(&oap->oap_pending_item) ||
1966             !list_empty(&oap->oap_urgent_item) ||
1967             !list_empty(&oap->oap_rpc_item))
1968                 RETURN(-EBUSY);
1969
1970         if (loi == NULL)
1971                 loi = &lsm->lsm_oinfo[0];
1972
1973         spin_lock(&cli->cl_loi_list_lock);
1974
1975         oap->oap_cmd = cmd;
1976         oap->oap_page_off = off;
1977         oap->oap_count = count;
1978         oap->oap_brw_flags = brw_flags;
1979         oap->oap_async_flags = async_flags;
1980
1981         if (cmd == OBD_BRW_WRITE)
1982                 lop = &loi->loi_write_lop;
1983         else
1984                 lop = &loi->loi_read_lop;
1985
1986         list_add_tail(&oap->oap_pending_item, &lop->lop_pending_group);
1987         if (oap->oap_async_flags & ASYNC_GROUP_SYNC) {
1988                 oap->oap_oig = oig;
1989                 oig_add_one(oig, &oap->oap_occ);
1990         }
1991
1992         LOI_DEBUG(loi, "oap %p page %p on group pending\n", oap, oap->oap_page);
1993
1994         spin_unlock(&cli->cl_loi_list_lock);
1995
1996         RETURN(0);
1997 }
1998
1999 static void osc_group_to_pending(struct client_obd *cli, struct lov_oinfo *loi,
2000                                  struct loi_oap_pages *lop, int cmd)
2001 {
2002         struct list_head *pos, *tmp;
2003         struct osc_async_page *oap;
2004
2005         list_for_each_safe(pos, tmp, &lop->lop_pending_group) {
2006                 oap = list_entry(pos, struct osc_async_page, oap_pending_item);
2007                 list_del(&oap->oap_pending_item);
2008                 list_add_tail(&oap->oap_pending_item, &lop->lop_pending);
2009                 list_add(&oap->oap_urgent_item, &lop->lop_urgent);
2010                 lop_update_pending(cli, lop, cmd, 1);
2011         }
2012         loi_list_maint(cli, loi);
2013 }
2014
2015 static int osc_trigger_group_io(struct obd_export *exp,
2016                                 struct lov_stripe_md *lsm,
2017                                 struct lov_oinfo *loi,
2018                                 struct obd_io_group *oig)
2019 {
2020         struct client_obd *cli = &exp->exp_obd->u.cli;
2021         ENTRY;
2022
2023         if (loi == NULL)
2024                 loi = &lsm->lsm_oinfo[0];
2025
2026         spin_lock(&cli->cl_loi_list_lock);
2027
2028         osc_group_to_pending(cli, loi, &loi->loi_write_lop, OBD_BRW_WRITE);
2029         osc_group_to_pending(cli, loi, &loi->loi_read_lop, OBD_BRW_READ);
2030
2031         osc_check_rpcs(cli);
2032         spin_unlock(&cli->cl_loi_list_lock);
2033
2034         RETURN(0);
2035 }
2036
2037 static int osc_teardown_async_page(struct obd_export *exp,
2038                                    struct lov_stripe_md *lsm,
2039                                    struct lov_oinfo *loi, void *cookie)
2040 {
2041         struct client_obd *cli = &exp->exp_obd->u.cli;
2042         struct loi_oap_pages *lop;
2043         struct osc_async_page *oap;
2044         int rc = 0;
2045         ENTRY;
2046
2047         oap = oap_from_cookie(cookie);
2048         if (IS_ERR(oap))
2049                 RETURN(PTR_ERR(oap));
2050
2051         if (loi == NULL)
2052                 loi = &lsm->lsm_oinfo[0];
2053
2054         if (oap->oap_cmd == OBD_BRW_WRITE) {
2055                 lop = &loi->loi_write_lop;
2056         } else {
2057                 lop = &loi->loi_read_lop;
2058         }
2059
2060         spin_lock(&cli->cl_loi_list_lock);
2061
2062         if (!list_empty(&oap->oap_rpc_item))
2063                 GOTO(out, rc = -EBUSY);
2064
2065         osc_exit_cache(cli, oap, 0);
2066         osc_wake_cache_waiters(cli);
2067
2068         if (!list_empty(&oap->oap_urgent_item)) {
2069                 list_del_init(&oap->oap_urgent_item);
2070                 oap->oap_async_flags &= ~ASYNC_URGENT;
2071         }
2072         if (!list_empty(&oap->oap_pending_item)) {
2073                 list_del_init(&oap->oap_pending_item);
2074                 lop_update_pending(cli, lop, oap->oap_cmd, -1);
2075         }
2076         loi_list_maint(cli, loi);
2077
2078         LOI_DEBUG(loi, "oap %p page %p torn down\n", oap, oap->oap_page);
2079 out:
2080         spin_unlock(&cli->cl_loi_list_lock);
2081         if (rc == 0)
2082                 OBD_FREE(oap, sizeof(*oap));
2083         RETURN(rc);
2084 }
2085
2086 #ifdef __KERNEL__
2087 /* Note: caller will lock/unlock, and set uptodate on the pages */
2088 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2089 static int sanosc_brw_read(struct obd_export *exp, struct obdo *oa,
2090                            struct lov_stripe_md *lsm, obd_count page_count,
2091                            struct brw_page *pga)
2092 {
2093         struct ptlrpc_request *request = NULL;
2094         struct ost_body *body;
2095         struct niobuf_remote *nioptr;
2096         struct obd_ioobj *iooptr;
2097         int rc, size[3] = {sizeof(*body)}, mapped = 0;
2098         int swab;
2099         ENTRY;
2100
2101         /* XXX does not handle 'new' brw protocol */
2102
2103         size[1] = sizeof(struct obd_ioobj);
2104         size[2] = page_count * sizeof(*nioptr);
2105
2106         request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OBD_VERSION,
2107                                   OST_SAN_READ, 3, size, NULL);
2108         if (!request)
2109                 RETURN(-ENOMEM);
2110
2111         body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof(*body));
2112         iooptr = lustre_msg_buf(request->rq_reqmsg, 1, sizeof(*iooptr));
2113         nioptr = lustre_msg_buf(request->rq_reqmsg, 2,
2114                                 sizeof(*nioptr) * page_count);
2115
2116         memcpy(&body->oa, oa, sizeof(body->oa));
2117
2118         obdo_to_ioobj(oa, iooptr);
2119         iooptr->ioo_bufcnt = page_count;
2120
2121         for (mapped = 0; mapped < page_count; mapped++, nioptr++) {
2122                 LASSERT(PageLocked(pga[mapped].pg));
2123                 LASSERT(mapped == 0 ||
2124                         pga[mapped].disk_offset > pga[mapped - 1].disk_offset);
2125
2126                 nioptr->offset = pga[mapped].disk_offset;
2127                 nioptr->len    = pga[mapped].count;
2128                 nioptr->flags  = pga[mapped].flag;
2129         }
2130
2131         size[1] = page_count * sizeof(*nioptr);
2132         request->rq_replen = lustre_msg_size(2, size);
2133
2134         rc = ptlrpc_queue_wait(request);
2135         if (rc)
2136                 GOTO(out_req, rc);
2137
2138         body = lustre_swab_repbuf(request, 0, sizeof(*body),
2139                                   lustre_swab_ost_body);
2140         if (body == NULL) {
2141                 CERROR("Can't unpack body\n");
2142                 GOTO(out_req, rc = -EPROTO);
2143         }
2144
2145         memcpy(oa, &body->oa, sizeof(*oa));
2146
2147         swab = lustre_msg_swabbed(request->rq_repmsg);
2148         LASSERT_REPSWAB(request, 1);
2149         nioptr = lustre_msg_buf(request->rq_repmsg, 1, size[1]);
2150         if (!nioptr) {
2151                 /* nioptr missing or short */
2152                 GOTO(out_req, rc = -EPROTO);
2153         }
2154
2155         /* actual read */
2156         for (mapped = 0; mapped < page_count; mapped++, nioptr++) {
2157                 struct page *page = pga[mapped].pg;
2158                 struct buffer_head *bh;
2159                 kdev_t dev;
2160
2161                 if (swab)
2162                         lustre_swab_niobuf_remote (nioptr);
2163
2164                 /* got san device associated */
2165                 LASSERT(exp->exp_obd != NULL);
2166                 dev = exp->exp_obd->u.cli.cl_sandev;
2167
2168                 /* hole */
2169                 if (!nioptr->offset) {
2170                         CDEBUG(D_PAGE, "hole at ino %lu; index %ld\n",
2171                                         page->mapping->host->i_ino,
2172                                         page->index);
2173                         memset(page_address(page), 0, PAGE_SIZE);
2174                         continue;
2175                 }
2176
2177                 if (!page->buffers) {
2178                         create_empty_buffers(page, dev, PAGE_SIZE);
2179                         bh = page->buffers;
2180
2181                         clear_bit(BH_New, &bh->b_state);
2182                         set_bit(BH_Mapped, &bh->b_state);
2183                         bh->b_blocknr = (unsigned long)nioptr->offset;
2184
2185                         clear_bit(BH_Uptodate, &bh->b_state);
2186
2187                         ll_rw_block(READ, 1, &bh);
2188                 } else {
2189                         bh = page->buffers;
2190
2191                         /* if buffer already existed, it must be the
2192                          * one we mapped before, check it */
2193                         LASSERT(!test_bit(BH_New, &bh->b_state));
2194                         LASSERT(test_bit(BH_Mapped, &bh->b_state));
2195                         LASSERT(bh->b_blocknr == (unsigned long)nioptr->offset);
2196
2197                         /* wait it's io completion */
2198                         if (test_bit(BH_Lock, &bh->b_state))
2199                                 wait_on_buffer(bh);
2200
2201                         if (!test_bit(BH_Uptodate, &bh->b_state))
2202                                 ll_rw_block(READ, 1, &bh);
2203                 }
2204
2205
2206                 /* must do syncronous write here */
2207                 wait_on_buffer(bh);
2208                 if (!buffer_uptodate(bh)) {
2209                         /* I/O error */
2210                         rc = -EIO;
2211                         goto out_req;
2212                 }
2213         }
2214
2215 out_req:
2216         ptlrpc_req_finished(request);
2217         RETURN(rc);
2218 }
2219
2220 static int sanosc_brw_write(struct obd_export *exp, struct obdo *oa,
2221                             struct lov_stripe_md *lsm, obd_count page_count,
2222                             struct brw_page *pga)
2223 {
2224         struct ptlrpc_request *request = NULL;
2225         struct ost_body *body;
2226         struct niobuf_remote *nioptr;
2227         struct obd_ioobj *iooptr;
2228         int rc, size[3] = {sizeof(*body)}, mapped = 0;
2229         int swab;
2230         ENTRY;
2231
2232         size[1] = sizeof(struct obd_ioobj);
2233         size[2] = page_count * sizeof(*nioptr);
2234
2235         request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OBD_VERSION,
2236                                   OST_SAN_WRITE, 3, size, NULL);
2237         if (!request)
2238                 RETURN(-ENOMEM);
2239
2240         body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
2241         iooptr = lustre_msg_buf(request->rq_reqmsg, 1, sizeof (*iooptr));
2242         nioptr = lustre_msg_buf(request->rq_reqmsg, 2,
2243                                 sizeof (*nioptr) * page_count);
2244
2245         memcpy(&body->oa, oa, sizeof(body->oa));
2246
2247         obdo_to_ioobj(oa, iooptr);
2248         iooptr->ioo_bufcnt = page_count;
2249
2250         /* pack request */
2251         for (mapped = 0; mapped < page_count; mapped++, nioptr++) {
2252                 LASSERT(PageLocked(pga[mapped].pg));
2253                 LASSERT(mapped == 0 ||
2254                         pga[mapped].disk_offset > pga[mapped - 1].disk_offset);
2255
2256                 nioptr->offset = pga[mapped].disk_offset;
2257                 nioptr->len    = pga[mapped].count;
2258                 nioptr->flags  = pga[mapped].flag;
2259         }
2260
2261         size[1] = page_count * sizeof(*nioptr);
2262         request->rq_replen = lustre_msg_size(2, size);
2263
2264         rc = ptlrpc_queue_wait(request);
2265         if (rc)
2266                 GOTO(out_req, rc);
2267
2268         swab = lustre_msg_swabbed (request->rq_repmsg);
2269         LASSERT_REPSWAB (request, 1);
2270         nioptr = lustre_msg_buf(request->rq_repmsg, 1, size[1]);
2271         if (!nioptr) {
2272                 CERROR("absent/short niobuf array\n");
2273                 GOTO(out_req, rc = -EPROTO);
2274         }
2275
2276         /* actual write */
2277         for (mapped = 0; mapped < page_count; mapped++, nioptr++) {
2278                 struct page *page = pga[mapped].pg;
2279                 struct buffer_head *bh;
2280                 kdev_t dev;
2281
2282                 if (swab)
2283                         lustre_swab_niobuf_remote (nioptr);
2284
2285                 /* got san device associated */
2286                 LASSERT(exp->exp_obd != NULL);
2287                 dev = exp->exp_obd->u.cli.cl_sandev;
2288
2289                 if (!page->buffers) {
2290                         create_empty_buffers(page, dev, PAGE_SIZE);
2291                 } else {
2292                         /* checking */
2293                         LASSERT(!test_bit(BH_New, &page->buffers->b_state));
2294                         LASSERT(test_bit(BH_Mapped, &page->buffers->b_state));
2295                         LASSERT(page->buffers->b_blocknr ==
2296                                 (unsigned long)nioptr->offset);
2297                 }
2298                 bh = page->buffers;
2299
2300                 LASSERT(bh);
2301
2302                 /* if buffer locked, wait it's io completion */
2303                 if (test_bit(BH_Lock, &bh->b_state))
2304                         wait_on_buffer(bh);
2305
2306                 clear_bit(BH_New, &bh->b_state);
2307                 set_bit(BH_Mapped, &bh->b_state);
2308
2309                 /* override the block nr */
2310                 bh->b_blocknr = (unsigned long)nioptr->offset;
2311
2312                 /* we are about to write it, so set it
2313                  * uptodate/dirty
2314                  * page lock should garentee no race condition here */
2315                 set_bit(BH_Uptodate, &bh->b_state);
2316                 set_bit(BH_Dirty, &bh->b_state);
2317
2318                 ll_rw_block(WRITE, 1, &bh);
2319
2320                 /* must do syncronous write here */
2321                 wait_on_buffer(bh);
2322                 if (!buffer_uptodate(bh) || test_bit(BH_Dirty, &bh->b_state)) {
2323                         /* I/O error */
2324                         rc = -EIO;
2325                         goto out_req;
2326                 }
2327         }
2328
2329 out_req:
2330         ptlrpc_req_finished(request);
2331         RETURN(rc);
2332 }
2333
2334 static int sanosc_brw(int cmd, struct obd_export *exp, struct obdo *oa,
2335                       struct lov_stripe_md *lsm, obd_count page_count,
2336                       struct brw_page *pga, struct obd_trans_info *oti)
2337 {
2338         ENTRY;
2339
2340         while (page_count) {
2341                 obd_count pages_per_brw;
2342                 int rc;
2343
2344                 if (page_count > PTLRPC_MAX_BRW_PAGES)
2345                         pages_per_brw = PTLRPC_MAX_BRW_PAGES;
2346                 else
2347                         pages_per_brw = page_count;
2348
2349                 if (cmd & OBD_BRW_WRITE)
2350                         rc = sanosc_brw_write(exp, oa, lsm, pages_per_brw,pga);
2351                 else
2352                         rc = sanosc_brw_read(exp, oa, lsm, pages_per_brw, pga);
2353
2354                 if (rc != 0)
2355                         RETURN(rc);
2356
2357                 page_count -= pages_per_brw;
2358                 pga += pages_per_brw;
2359         }
2360         RETURN(0);
2361 }
2362 #endif
2363 #endif
2364
2365 static void osc_set_data_with_check(struct lustre_handle *lockh, void *data)
2366 {
2367         struct ldlm_lock *lock = ldlm_handle2lock(lockh);
2368
2369         if (lock == NULL) {
2370                 CERROR("lockh %p, data %p - client evicted?\n", lockh, data);
2371                 return;
2372         }
2373
2374         l_lock(&lock->l_resource->lr_namespace->ns_lock);
2375 #ifdef __KERNEL__
2376         if (lock->l_ast_data && lock->l_ast_data != data) {
2377                 struct inode *new_inode = data;
2378                 struct inode *old_inode = lock->l_ast_data;
2379                 LASSERTF(old_inode->i_state & I_FREEING,
2380                          "Found existing inode %p/%lu/%u state %lu in lock: "
2381                          "setting data to %p/%lu/%u\n", old_inode,
2382                          old_inode->i_ino, old_inode->i_generation,
2383                          old_inode->i_state,
2384                          new_inode, new_inode->i_ino, new_inode->i_generation);
2385         }
2386 #endif
2387         lock->l_ast_data = data;
2388         l_unlock(&lock->l_resource->lr_namespace->ns_lock);
2389         LDLM_LOCK_PUT(lock);
2390 }
2391
2392 static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm,
2393                              ldlm_iterator_t replace, void *data)
2394 {
2395         struct ldlm_res_id res_id = { .name = {0} };
2396         struct obd_device *obd = class_exp2obd(exp);
2397
2398         res_id.name[0] = lsm->lsm_object_id;
2399         res_id.name[2] = lsm->lsm_object_gr;
2400         ldlm_change_cbdata(obd->obd_namespace, &res_id, replace, data);
2401         return 0;
2402 }
2403
/*
 * Obtain a DLM lock of @mode on the object described by @lsm.
 *
 * Unless lsm->lsm_oinfo->loi_kms_valid is 0, the local namespace is searched
 * first with ldlm_lock_match(): for a lock of the requested mode, then (for
 * LCK_PR requests) for a LCK_PW lock, then (for LCK_PW requests) for a
 * LCK_PR lock that is converted with ldlm_cli_convert().  When one of these
 * succeeds, @data is attached with osc_set_data_with_check() and ELDLM_OK
 * is returned with the handle in @lockh.
 *
 * Otherwise a new lock is requested with ldlm_cli_enqueue().  When
 * LDLM_FL_HAS_INTENT is set in *@flags the request is prepared here so the
 * reply can carry a struct ldlm_reply plus a struct ost_lvb.
 *
 * Side effects visible to the caller:
 *  - @policy->l_extent is widened in place to page boundaries;
 *  - on success, or on an intent enqueue that ends in ELDLM_LOCK_ABORTED,
 *    lsm->lsm_oinfo->loi_rss and loi_blocks are set from the returned lvb.
 *
 * @lvb_len and @lvb_swabber are not used: the local struct ost_lvb and
 * lustre_swab_ost_lvb are always passed to ldlm_cli_enqueue().
 *
 * Returns ELDLM_OK for a matched lock, -ENOMEM if the intent request cannot
 * be allocated, a non-EDEADLOCK failure code from ldlm_cli_convert(), or
 * the result of ldlm_cli_enqueue() (replaced by a non-zero
 * lock_policy_res1 from the reply when an intent enqueue was aborted).
 */
static int osc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
                       __u32 type, ldlm_policy_data_t *policy, __u32 mode,
                       int *flags, void *bl_cb, void *cp_cb, void *gl_cb,
                       void *data, __u32 lvb_len, void *lvb_swabber,
                       struct lustre_handle *lockh)
{
        struct obd_device *obd = exp->exp_obd;
        struct ldlm_res_id res_id = { .name = {0} };
        struct ost_lvb lvb;
        struct ldlm_reply *rep;
        struct ptlrpc_request *req = NULL;
        int rc;
        ENTRY;

        /* resource name: object id in slot 0, object group in slot 2 */
        res_id.name[0] = lsm->lsm_object_id;
        res_id.name[2] = lsm->lsm_object_gr;

        /* Filesystem lock extents are extended to page boundaries so that
         * dealing with the page cache is a little smoother.  Note that this
         * modifies the caller's policy in place. */
        policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK;
        policy->l_extent.end |= ~PAGE_MASK;

        /* with loi_kms_valid unset, skip every local match and go straight
         * to a fresh enqueue */
        if (lsm->lsm_oinfo->loi_kms_valid == 0)
                goto no_match;

        /* Next, search for already existing extent locks that will cover us */
        rc = ldlm_lock_match(obd->obd_namespace, 0, &res_id, type, policy, mode,
                             lockh);
        if (rc == 1) {
                osc_set_data_with_check(lockh, data);
                if (*flags & LDLM_FL_HAS_INTENT) {
                        /* I would like to be able to ASSERT here that rss <=
                         * kms, but I can't, for reasons which are explained in
                         * lov_enqueue() */
                }
                /* We already have a lock, and it's referenced */
                RETURN(ELDLM_OK);
        }

        /* If we're trying to read, we also search for an existing PW lock.  The
         * VFS and page cache already protect us locally, so lots of readers/
         * writers can share a single PW lock.
         *
         * For a write request an existing PR lock is converted below with
         * ldlm_cli_convert(); if that returns EDEADLOCK we fall through and
         * enqueue a new lock instead.
         *
         * At some point we should cancel the read lock instead of making them
         * send us a blocking callback, but there are problems with canceling
         * locks out from other users right now, too. */

        if (mode == LCK_PR) {
                rc = ldlm_lock_match(obd->obd_namespace, 0, &res_id, type,
                                     policy, LCK_PW, lockh);
                if (rc == 1) {
                        /* FIXME: This is not incredibly elegant, but it might
                         * be more elegant than adding another parameter to
                         * lock_match.  I want a second opinion. */
                        /* swap the LCK_PW reference taken by the match for
                         * the LCK_PR reference the caller asked for */
                        ldlm_lock_addref(lockh, LCK_PR);
                        ldlm_lock_decref(lockh, LCK_PW);
                        osc_set_data_with_check(lockh, data);
                        RETURN(ELDLM_OK);
                }
        }
        if (mode == LCK_PW) {
                rc = ldlm_lock_match(obd->obd_namespace, 0, &res_id, type,
                                     policy, LCK_PR, lockh);
                if (rc == 1) {
                        rc = ldlm_cli_convert(lockh, mode, flags);
                        if (!rc) {
                                /* Update readers/writers accounting */
                                ldlm_lock_addref(lockh, LCK_PW);
                                ldlm_lock_decref(lockh, LCK_PR);
                                osc_set_data_with_check(lockh, data);
                                RETURN(ELDLM_OK);
                        }
                        /* If the conversion failed, we need to drop refcount
                           on matched lock before we get new one */
                        /* XXX Won't it save us some efforts if we cancel PR
                           lock here? We are going to take PW lock anyway and it
                           will invalidate PR lock */
                        ldlm_lock_decref(lockh, LCK_PR);
                        /* only EDEADLOCK falls through to a fresh enqueue;
                         * any other conversion result is returned as is */
                        if (rc != EDEADLOCK) {
                                RETURN(rc);
                        }
                }
        }

 no_match:
        if (*flags & LDLM_FL_HAS_INTENT) {
                /* request buffers: an empty first buffer followed by the
                 * ldlm_request */
                int size[2] = {0, sizeof(struct ldlm_request)};

                req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION,
                                      LDLM_ENQUEUE, 2, size, NULL);
                if (req == NULL)
                        RETURN(-ENOMEM);

                /* reply buffers: the ldlm_reply followed by the lvb */
                size[0] = sizeof(*rep);
                size[1] = sizeof(lvb);
                req->rq_replen = lustre_msg_size(2, size);
        }
        /* req is NULL here unless the intent request was prepared above */
        rc = ldlm_cli_enqueue(exp, req, obd->obd_namespace, res_id, type,
                              policy, mode, flags, bl_cb, cp_cb, gl_cb, data,
                              &lvb, sizeof(lvb), lustre_swab_ost_lvb, lockh);
        if (req != NULL) {
                if (rc == ELDLM_LOCK_ABORTED) {
                        /* swabbed by ldlm_cli_enqueue() */
                        LASSERT_REPSWABBED(req, 0);
                        rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*rep));
                        LASSERT(rep != NULL);
                        /* a non-zero lock_policy_res1 in the reply replaces
                         * ELDLM_LOCK_ABORTED as the result */
                        if (rep->lock_policy_res1)
                                rc = rep->lock_policy_res1;
                }
                ptlrpc_req_finished(req);
        }

        /* record the size/blocks from the lvb on success, and also when an
         * intent enqueue still ends in ELDLM_LOCK_ABORTED */
        if ((*flags & LDLM_FL_HAS_INTENT && rc == ELDLM_LOCK_ABORTED) || !rc) {
                CDEBUG(D_INODE, "received kms == "LPU64", blocks == "LPU64"\n",
                       lvb.lvb_size, lvb.lvb_blocks);
                lsm->lsm_oinfo->loi_rss = lvb.lvb_size;
                lsm->lsm_oinfo->loi_blocks = lvb.lvb_blocks;
        }

        RETURN(rc);
}
2529
2530 static int osc_match(struct obd_export *exp, struct lov_stripe_md *lsm,
2531                      __u32 type, ldlm_policy_data_t *policy, __u32 mode,
2532                      int *flags, void *data, struct lustre_handle *lockh)
2533 {
2534         struct ldlm_res_id res_id = { .name = {0} };
2535         struct obd_device *obd = exp->exp_obd;
2536         int rc;
2537         ENTRY;
2538
2539         res_id.name[0] = lsm->lsm_object_id;
2540         res_id.name[2] = lsm->lsm_object_gr;
2541
2542         OBD_FAIL_RETURN(OBD_FAIL_OSC_MATCH, -EIO);
2543
2544         /* Filesystem lock extents are extended to page boundaries so that
2545          * dealing with the page cache is a little smoother */
2546         policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK;
2547         policy->l_extent.end |= ~PAGE_MASK;
2548
2549         /* Next, search for already existing extent locks that will cover us */
2550         rc = ldlm_lock_match(obd->obd_namespace, *flags, &res_id, type,
2551                              policy, mode, lockh);
2552         if (rc) {
2553                // if (!(*flags & LDLM_FL_TEST_LOCK))
2554                         osc_set_data_with_check(lockh, data);
2555                 RETURN(rc);
2556         }
2557         /* If we're trying to read, we also search for an existing PW lock.  The
2558          * VFS and page cache already protect us locally, so lots of readers/
2559          * writers can share a single PW lock. */
2560         if (mode == LCK_PR) {
2561                 rc = ldlm_lock_match(obd->obd_namespace, *flags, &res_id, type,
2562                                      policy, LCK_PW, lockh);
2563                 if (rc == 1 && !(*flags & LDLM_FL_TEST_LOCK)) {
2564                         /* FIXME: This is not incredibly elegant, but it might
2565                          * be more elegant than adding another parameter to
2566                          * lock_match.  I want a second opinion. */
2567                         osc_set_data_with_check(lockh, data);
2568                         ldlm_lock_addref(lockh, LCK_PR);
2569                         ldlm_lock_decref(lockh, LCK_PW);
2570                 }
2571         }
2572         RETURN(rc);
2573 }
2574
2575 static int osc_cancel(struct obd_export *exp, struct lov_stripe_md *md,
2576                       __u32 mode, struct lustre_handle *lockh)
2577 {
2578         ENTRY;
2579
2580         if (mode == LCK_GROUP)
2581                 ldlm_lock_decref_and_cancel(lockh, mode);
2582         else
2583                 ldlm_lock_decref(lockh, mode);
2584
2585         RETURN(0);
2586 }
2587
2588 static int osc_cancel_unused(struct obd_export *exp,
2589                              struct lov_stripe_md *lsm, int flags, void *opaque)
2590 {
2591         struct obd_device *obd = class_exp2obd(exp);
2592         struct ldlm_res_id res_id = { .name = {0} }, *resp = NULL;
2593
2594         if (lsm != NULL) {
2595                 res_id.name[0] = lsm->lsm_object_id;
2596                 res_id.name[2] = lsm->lsm_object_gr;
2597                 resp = &res_id;
2598         }
2599
2600         return ldlm_cli_cancel_unused(obd->obd_namespace, resp, flags, opaque);
2601 }
2602
2603 static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs,
2604                       unsigned long max_age)
2605 {
2606         struct obd_statfs *msfs;
2607         struct ptlrpc_request *request;
2608         int rc, size = sizeof(*osfs);
2609         ENTRY;
2610
2611         /* We could possibly pass max_age in the request (as an absolute
2612          * timestamp or a "seconds.usec ago") so the target can avoid doing
2613          * extra calls into the filesystem if that isn't necessary (e.g.
2614          * during mount that would help a bit).  Having relative timestamps
2615          * is not so great if request processing is slow, while absolute
2616          * timestamps are not ideal because they need time synchronization. */
2617         request = ptlrpc_prep_req(obd->u.cli.cl_import, LUSTRE_OBD_VERSION,
2618                                   OST_STATFS, 0, NULL, NULL);
2619         if (!request)
2620                 RETURN(-ENOMEM);
2621
2622         request->rq_replen = lustre_msg_size(1, &size);
2623         request->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249
2624
2625         rc = ptlrpc_queue_wait(request);
2626         if (rc)
2627                 GOTO(out, rc);
2628
2629         msfs = lustre_swab_repbuf(request, 0, sizeof(*msfs),
2630                                   lustre_swab_obd_statfs);
2631         if (msfs == NULL) {
2632                 CERROR("Can't unpack obd_statfs\n");
2633                 GOTO(out, rc = -EPROTO);
2634         }
2635
2636         memcpy(osfs, msfs, sizeof(*osfs));
2637
2638         EXIT;
2639  out:
2640         ptlrpc_req_finished(request);
2641         return rc;
2642 }
2643
2644 /* Retrieve object striping information.
2645  *
2646  * @lmmu is a pointer to an in-core struct with lmm_ost_count indicating
2647  * the maximum number of OST indices which will fit in the user buffer.
2648  * lmm_magic must be LOV_MAGIC (we only use 1 slot here).
2649  */
2650 static int osc_getstripe(struct lov_stripe_md *lsm, struct lov_user_md *lump)
2651 {
2652         struct lov_user_md lum, *lumk;
2653         int rc, lum_size;
2654         ENTRY;
2655
2656         if (!lsm)
2657                 RETURN(-ENODATA);
2658
2659         rc = copy_from_user(&lum, lump, sizeof(lum));
2660         if (rc)
2661                 RETURN(-EFAULT);
2662
2663         if (lum.lmm_magic != LOV_USER_MAGIC)
2664                 RETURN(-EINVAL);
2665
2666         if (lum.lmm_stripe_count > 0) {
2667                 lum_size = sizeof(lum) + sizeof(lum.lmm_objects[0]);
2668                 OBD_ALLOC(lumk, lum_size);
2669                 if (!lumk)
2670                         RETURN(-ENOMEM);
2671
2672                 lumk->lmm_objects[0].l_object_id = lsm->lsm_object_id;
2673                 lumk->lmm_objects[0].l_object_gr = lsm->lsm_object_gr;
2674         } else {
2675                 lum_size = sizeof(lum);
2676                 lumk = &lum;
2677         }
2678
2679         lumk->lmm_object_id = lsm->lsm_object_id;
2680         lumk->lmm_object_gr = lsm->lsm_object_gr;
2681         lumk->lmm_stripe_count = 1;
2682
2683         if (copy_to_user(lump, lumk, lum_size))
2684                 rc = -EFAULT;
2685
2686         if (lumk != &lum)
2687                 OBD_FREE(lumk, lum_size);
2688
2689         RETURN(rc);
2690 }
2691
2692 static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
2693                          void *karg, void *uarg)
2694 {
2695         struct obd_device *obd = exp->exp_obd;
2696         struct obd_ioctl_data *data = karg;
2697         int err = 0;
2698         ENTRY;
2699
2700 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2701         MOD_INC_USE_COUNT;
2702 #else
2703         if (!try_module_get(THIS_MODULE)) {
2704                 CERROR("Can't get module. Is it alive?");
2705                 return -EINVAL;
2706         }
2707 #endif
2708         switch (cmd) {
2709         case OBD_IOC_LOV_GET_CONFIG: {
2710                 char *buf;
2711                 struct lov_desc *desc;
2712                 struct obd_uuid uuid;
2713
2714                 buf = NULL;
2715                 len = 0;
2716                 if (obd_ioctl_getdata(&buf, &len, (void *)uarg))
2717                         GOTO(out, err = -EINVAL);
2718
2719                 data = (struct obd_ioctl_data *)buf;
2720
2721                 if (sizeof(*desc) > data->ioc_inllen1) {
2722                         OBD_FREE(buf, len);
2723                         GOTO(out, err = -EINVAL);
2724                 }
2725
2726                 if (data->ioc_inllen2 < sizeof(uuid)) {
2727                         OBD_FREE(buf, len);
2728                         GOTO(out, err = -EINVAL);
2729                 }
2730
2731                 if (data->ioc_inllen3 < sizeof(__u32)) {
2732                         OBD_FREE(buf, len);
2733                         GOTO(out, err = -EINVAL);
2734                 }
2735
2736                 desc = (struct lov_desc *)data->ioc_inlbuf1;
2737                 desc->ld_tgt_count = 1;
2738                 desc->ld_active_tgt_count = 1;
2739                 desc->ld_default_stripe_count = 1;
2740                 desc->ld_default_stripe_size = 0;
2741                 desc->ld_default_stripe_offset = 0;
2742                 desc->ld_pattern = 0;
2743                 memcpy(&desc->ld_uuid, &obd->obd_uuid, sizeof(uuid));
2744                 memcpy(data->ioc_inlbuf2, &obd->obd_uuid, sizeof(uuid));
2745                 *((__u32 *)data->ioc_inlbuf3) = 1;
2746
2747                 err = copy_to_user((void *)uarg, buf, len);
2748                 if (err)
2749                         err = -EFAULT;
2750                 obd_ioctl_freedata(buf, len);
2751                 GOTO(out, err);
2752         }
2753         case LL_IOC_LOV_SETSTRIPE:
2754                 err = obd_alloc_memmd(exp, karg);
2755                 if (err > 0)
2756                         err = 0;
2757                 GOTO(out, err);
2758         case LL_IOC_LOV_GETSTRIPE:
2759                 err = osc_getstripe(karg, uarg);
2760                 GOTO(out, err);
2761         case OBD_IOC_CLIENT_RECOVER:
2762                 err = ptlrpc_recover_import(obd->u.cli.cl_import,
2763                                             data->ioc_inlbuf1);
2764                 if (err > 0)
2765                         err = 0;
2766                 GOTO(out, err);
2767         case IOC_OSC_SET_ACTIVE:
2768                 err = ptlrpc_set_import_active(obd->u.cli.cl_import,
2769                                                data->ioc_offset);
2770                 GOTO(out, err);
2771         default:
2772                 CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n", cmd, current->comm);
2773                 GOTO(out, err = -ENOTTY);
2774         }
2775 out:
2776 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2777         MOD_DEC_USE_COUNT;
2778 #else
2779         module_put(THIS_MODULE);
2780 #endif
2781         return err;
2782 }
2783
2784 static int osc_get_info(struct obd_export *exp, obd_count keylen,
2785                         void *key, __u32 *vallen, void *val)
2786 {
2787         ENTRY;
2788         if (!vallen || !val)
2789                 RETURN(-EFAULT);
2790
2791         if (keylen > strlen("lock_to_stripe") &&
2792             strcmp(key, "lock_to_stripe") == 0) {
2793                 __u32 *stripe = val;
2794                 *vallen = sizeof(*stripe);
2795                 *stripe = 0;
2796                 RETURN(0);
2797         } else if (keylen >= strlen("last_id") && strcmp(key, "last_id") == 0) {
2798                 struct ptlrpc_request *req;
2799                 obd_id *reply;
2800                 char *bufs[1] = {key};
2801                 int rc;
2802                 req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OBD_VERSION,
2803                                       OST_GET_INFO, 1, &keylen, bufs);
2804                 if (req == NULL)
2805                         RETURN(-ENOMEM);
2806
2807                 req->rq_replen = lustre_msg_size(1, vallen);
2808                 rc = ptlrpc_queue_wait(req);
2809                 if (rc)
2810                         GOTO(out, rc);
2811
2812                 reply = lustre_swab_repbuf(req, 0, sizeof(*reply),
2813                                            lustre_swab_ost_last_id);
2814                 if (reply == NULL) {
2815                         CERROR("Can't unpack OST last ID\n");
2816                         GOTO(out, rc = -EPROTO);
2817                 }
2818                 *((obd_id *)val) = *reply;
2819         out:
2820                 ptlrpc_req_finished(req);
2821                 RETURN(rc);
2822         }
2823         RETURN(-EPROTO);
2824 }
2825
2826 static int osc_set_info(struct obd_export *exp, obd_count keylen,
2827                         void *key, obd_count vallen, void *val)
2828 {
2829         struct obd_device  *obd = exp->exp_obd;
2830         struct obd_import *imp = class_exp2cliimp(exp);
2831         struct llog_ctxt *ctxt;
2832         int rc = 0;
2833         ENTRY;
2834
2835         if (keylen == strlen("next_id") &&
2836             memcmp(key, "next_id", strlen("next_id")) == 0) {
2837                 if (vallen != sizeof(obd_id))
2838                         RETURN(-EINVAL);
2839                 obd->u.cli.cl_oscc.oscc_next_id = *((obd_id*)val) + 1;
2840                 CDEBUG(D_HA, "%s: set oscc_next_id = "LPU64"\n",
2841                        exp->exp_obd->obd_name,
2842                        obd->u.cli.cl_oscc.oscc_next_id);
2843
2844                 RETURN(0);
2845         }
2846
2847         if (keylen == strlen("growth_count") &&
2848             memcmp(key, "growth_count", strlen("growth_count")) == 0) {
2849                 if (vallen != sizeof(int))
2850                         RETURN(-EINVAL);
2851                 obd->u.cli.cl_oscc.oscc_max_grow_count = *((int*)val);
2852                 RETURN(0);
2853         }
2854
2855         if (keylen == strlen("unlinked") &&
2856             memcmp(key, "unlinked", keylen) == 0) {
2857                 struct osc_creator *oscc = &obd->u.cli.cl_oscc;
2858                 spin_lock(&oscc->oscc_lock);
2859                 oscc->oscc_flags &= ~OSCC_FLAG_NOSPC;
2860                 spin_unlock(&oscc->oscc_lock);
2861                 RETURN(0);
2862         }
2863         if (keylen == strlen("unrecovery") &&
2864             memcmp(key, "unrecovery", keylen) == 0) {
2865                 struct osc_creator *oscc = &obd->u.cli.cl_oscc;
2866                 spin_lock(&oscc->oscc_lock);
2867                 oscc->oscc_flags &= ~OSCC_FLAG_RECOVERING;
2868                 spin_unlock(&oscc->oscc_lock);
2869                 RETURN(0);
2870         }
2871         if (keylen == strlen("initial_recov") &&
2872             memcmp(key, "initial_recov", strlen("initial_recov")) == 0) {
2873                 struct obd_import *imp = exp->exp_obd->u.cli.cl_import;
2874                 if (vallen != sizeof(int))
2875                         RETURN(-EINVAL);
2876                 imp->imp_initial_recov = *(int *)val;
2877                 CDEBUG(D_HA, "%s: set imp_no_init_recov = %d\n",
2878                        exp->exp_obd->obd_name,
2879                        imp->imp_initial_recov);
2880                 RETURN(0);
2881         }
2882
2883         if (keylen < strlen("mds_conn") ||
2884             memcmp(key, "mds_conn", strlen("mds_conn")) != 0)
2885                 RETURN(-EINVAL);
2886
2887         ctxt = llog_get_context(&exp->exp_obd->obd_llogs, LLOG_UNLINK_ORIG_CTXT);
2888         if (ctxt) {
2889                 if (rc == 0)
2890                         rc = llog_initiator_connect(ctxt);
2891                 else
2892                         CERROR("cannot establish the connect for ctxt %p: %d\n",
2893                                ctxt, rc);
2894         }
2895
2896         imp->imp_server_timeout = 1;
2897         CDEBUG(D_HA, "pinging OST %s\n", imp->imp_target_uuid.uuid);
2898         imp->imp_pingable = 1;
2899
2900         RETURN(rc);
2901 }
2902
2903
2904 static struct llog_operations osc_size_repl_logops = {
2905         lop_cancel: llog_obd_repl_cancel
2906 };
2907
2908 static struct llog_operations osc_unlink_orig_logops;
2909 static int osc_llog_init(struct obd_device *obd, struct obd_llogs *llogs,
2910                          struct obd_device *tgt, int count,
2911                          struct llog_catid *catid)
2912 {
2913         int rc;
2914         ENTRY;
2915
2916         osc_unlink_orig_logops = llog_lvfs_ops;
2917         osc_unlink_orig_logops.lop_setup = llog_obd_origin_setup;
2918         osc_unlink_orig_logops.lop_cleanup = llog_catalog_cleanup;
2919         osc_unlink_orig_logops.lop_add = llog_catalog_add;
2920         osc_unlink_orig_logops.lop_connect = llog_origin_connect;
2921
2922         rc = obd_llog_setup(obd, llogs, LLOG_UNLINK_ORIG_CTXT, tgt, count,
2923                             &catid->lci_logid, &osc_unlink_orig_logops);
2924         if (rc)
2925                 RETURN(rc);
2926
2927         rc = obd_llog_setup(obd, llogs, LLOG_SIZE_REPL_CTXT, tgt, count, NULL,
2928                             &osc_size_repl_logops);
2929         RETURN(rc);
2930 }
2931
2932 static int osc_llog_finish(struct obd_device *obd,
2933                            struct obd_llogs *llogs, int count)
2934 {
2935         int rc;
2936         ENTRY;
2937
2938         rc = obd_llog_cleanup(llog_get_context(llogs, LLOG_UNLINK_ORIG_CTXT));
2939         if (rc)
2940                 RETURN(rc);
2941
2942         rc = obd_llog_cleanup(llog_get_context(llogs, LLOG_SIZE_REPL_CTXT));
2943         RETURN(rc);
2944 }
2945
2946
2947 static int osc_connect(struct lustre_handle *exph,
2948                        struct obd_device *obd, struct obd_uuid *cluuid,
2949                        unsigned long connect_flags)
2950 {
2951         int rc;
2952         ENTRY;
2953         rc = client_connect_import(exph, obd, cluuid, connect_flags);
2954         RETURN(rc);
2955 }
2956
2957 static int osc_disconnect(struct obd_export *exp, unsigned long flags)
2958 {
2959         struct obd_device *obd = class_exp2obd(exp);
2960         struct llog_ctxt *ctxt;
2961         int rc;
2962         ENTRY;
2963
2964         ctxt = llog_get_context(&obd->obd_llogs, LLOG_SIZE_REPL_CTXT);
2965         if (obd->u.cli.cl_conn_count == 1)
2966                 /* flush any remaining cancel messages out to the target */
2967                 llog_sync(ctxt, exp);
2968
2969         rc = client_disconnect_export(exp, flags);
2970         RETURN(rc);
2971 }
2972
2973 static int osc_import_event(struct obd_device *obd,
2974                             struct obd_import *imp,
2975                             enum obd_import_event event)
2976 {
2977         struct client_obd *cli;
2978         int rc = 0;
2979
2980         LASSERT(imp->imp_obd == obd);
2981
2982         switch (event) {
2983         case IMP_EVENT_DISCON: {
2984                 /* Only do this on the MDS OSC's */
2985                 if (imp->imp_server_timeout) {
2986                         struct osc_creator *oscc = &obd->u.cli.cl_oscc;
2987
2988                         spin_lock(&oscc->oscc_lock);
2989                         oscc->oscc_flags |= OSCC_FLAG_RECOVERING;
2990                         spin_unlock(&oscc->oscc_lock);
2991                 }
2992                 break;
2993         }
2994         case IMP_EVENT_INACTIVE: {
2995                 if (obd->obd_observer)
2996                         rc = obd_notify(obd->obd_observer, obd, 0, 0);
2997                 break;
2998         }
2999         case IMP_EVENT_INVALIDATE: {
3000                 struct ldlm_namespace *ns = obd->obd_namespace;
3001
3002                 /* Reset grants */
3003                 cli = &obd->u.cli;
3004                 spin_lock(&cli->cl_loi_list_lock);
3005                 cli->cl_avail_grant = 0;
3006                 cli->cl_lost_grant = 0;
3007                 /* all pages go to failing rpcs due to the invalid import */
3008                 osc_check_rpcs(cli);
3009                 spin_unlock(&cli->cl_loi_list_lock);
3010
3011                 ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
3012
3013                 break;
3014         }
3015         case IMP_EVENT_ACTIVE: {
3016                 /* Only do this on the MDS OSC's */
3017                 if (imp->imp_server_timeout) {
3018                         struct osc_creator *oscc = &obd->u.cli.cl_oscc;
3019
3020                         spin_lock(&oscc->oscc_lock);
3021                         oscc->oscc_flags &= ~OSCC_FLAG_NOSPC;
3022                         spin_unlock(&oscc->oscc_lock);
3023                 }
3024
3025                 if (obd->obd_observer)
3026                         rc = obd_notify(obd->obd_observer, obd, 1, 0);
3027                 break;
3028         }
3029         default:
3030                 CERROR("Unknown import event %d\n", event);
3031                 LBUG();
3032         }
3033         RETURN(rc);
3034 }
3035
3036 static int osc_attach(struct obd_device *dev, obd_count len, void *data)
3037 {
3038         struct lprocfs_static_vars lvars;
3039         int rc;
3040         ENTRY;
3041
3042         lprocfs_init_vars(osc,&lvars);
3043         rc = lprocfs_obd_attach(dev, lvars.obd_vars);
3044         if (rc < 0)
3045                 RETURN(rc);
3046
3047         rc = lproc_osc_attach_seqstat(dev);
3048         if (rc < 0) {
3049                 lprocfs_obd_detach(dev);
3050                 RETURN(rc);
3051         }
3052
3053         ptlrpc_lprocfs_register_obd(dev);
3054         RETURN(0);
3055 }
3056
/*
 * Undo osc_attach(): unregister the ptlrpc lprocfs entries first, then
 * detach the device's lprocfs entries and return that result.
 */
static int osc_detach(struct obd_device *dev)
{
        int rc;

        ptlrpc_lprocfs_unregister_obd(dev);
        rc = lprocfs_obd_detach(dev);

        return rc;
}
3062
3063 static int osc_setup(struct obd_device *obd, obd_count len, void *buf)
3064 {
3065         int rc;
3066         ENTRY;
3067         rc = ptlrpcd_addref();
3068         if (rc)
3069                 RETURN(rc);
3070
3071         rc = client_obd_setup(obd, len, buf);
3072         if (rc)
3073                 ptlrpcd_decref();
3074         else
3075                 oscc_init(obd);
3076
3077         RETURN(rc);
3078 }
3079
3080 static int osc_cleanup(struct obd_device *obd, int flags)
3081 {
3082         int rc;
3083
3084         rc = ldlm_cli_cancel_unused(obd->obd_namespace, NULL,
3085                                     LDLM_FL_CONFIG_CHANGE, NULL);
3086         if (rc)
3087                 RETURN(rc);
3088
3089         rc = client_obd_cleanup(obd, flags);
3090         ptlrpcd_decref();
3091         RETURN(rc);
3092 }
3093
3094 struct obd_ops osc_obd_ops = {
3095         .o_owner                = THIS_MODULE,
3096         .o_attach               = osc_attach,
3097         .o_detach               = osc_detach,
3098         .o_setup                = osc_setup,
3099         .o_cleanup              = osc_cleanup,
3100         .o_add_conn             = client_import_add_conn,
3101         .o_del_conn             = client_import_del_conn,
3102         .o_connect              = osc_connect,
3103         .o_disconnect           = osc_disconnect,
3104         .o_statfs               = osc_statfs,
3105         .o_packmd               = osc_packmd,
3106         .o_unpackmd             = osc_unpackmd,
3107         .o_create               = osc_create,
3108         .o_destroy              = osc_destroy,
3109         .o_getattr              = osc_getattr,
3110         .o_getattr_async        = osc_getattr_async,
3111         .o_setattr              = osc_setattr,
3112         .o_brw                  = osc_brw,
3113         .o_brw_async            = osc_brw_async,
3114         .o_prep_async_page      = osc_prep_async_page,
3115         .o_queue_async_io       = osc_queue_async_io,
3116         .o_set_async_flags      = osc_set_async_flags,
3117         .o_queue_group_io       = osc_queue_group_io,
3118         .o_trigger_group_io     = osc_trigger_group_io,
3119         .o_teardown_async_page  = osc_teardown_async_page,
3120         .o_punch                = osc_punch,
3121         .o_sync                 = osc_sync,
3122         .o_enqueue              = osc_enqueue,
3123         .o_match                = osc_match,
3124         .o_change_cbdata        = osc_change_cbdata,
3125         .o_cancel               = osc_cancel,
3126         .o_cancel_unused        = osc_cancel_unused,
3127         .o_iocontrol            = osc_iocontrol,
3128         .o_get_info             = osc_get_info,
3129         .o_set_info             = osc_set_info,
3130         .o_import_event         = osc_import_event,
3131         .o_llog_init            = osc_llog_init,
3132         .o_llog_finish          = osc_llog_finish,
3133 };
3134
3135 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
3136 struct obd_ops sanosc_obd_ops = {
3137         .o_owner                = THIS_MODULE,
3138         .o_attach               = osc_attach,
3139         .o_detach               = osc_detach,
3140         .o_cleanup              = client_obd_cleanup,
3141         .o_add_conn             = client_import_add_conn,
3142         .o_del_conn             = client_import_del_conn,
3143         .o_connect              = osc_connect,
3144         .o_disconnect           = client_disconnect_export,
3145         .o_statfs               = osc_statfs,
3146         .o_packmd               = osc_packmd,
3147         .o_unpackmd             = osc_unpackmd,
3148         .o_create               = osc_real_create,
3149         .o_destroy              = osc_destroy,
3150         .o_getattr              = osc_getattr,
3151         .o_getattr_async        = osc_getattr_async,
3152         .o_setattr              = osc_setattr,
3153         .o_setup                = client_sanobd_setup,
3154         .o_brw                  = sanosc_brw,
3155         .o_punch                = osc_punch,
3156         .o_sync                 = osc_sync,
3157         .o_enqueue              = osc_enqueue,
3158         .o_match                = osc_match,
3159         .o_change_cbdata        = osc_change_cbdata,
3160         .o_cancel               = osc_cancel,
3161         .o_cancel_unused        = osc_cancel_unused,
3162         .o_iocontrol            = osc_iocontrol,
3163         .o_import_event         = osc_import_event,
3164         .o_llog_init            = osc_llog_init,
3165         .o_llog_finish          = osc_llog_finish,
3166 };
3167 #endif
3168
3169 int __init osc_init(void)
3170 {
3171         struct lprocfs_static_vars lvars;
3172 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
3173         struct lprocfs_static_vars sanlvars;
3174 #endif
3175         int rc;
3176         ENTRY;
3177
3178         lprocfs_init_vars(osc, &lvars);
3179 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
3180         lprocfs_init_vars(osc, &sanlvars);
3181 #endif
3182
3183         rc = class_register_type(&osc_obd_ops, NULL, lvars.module_vars,
3184                                  LUSTRE_OSC_NAME);
3185         if (rc)
3186                 RETURN(rc);
3187
3188 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
3189         rc = class_register_type(&sanosc_obd_ops, NULL, sanlvars.module_vars,
3190                                  LUSTRE_SANOSC_NAME);
3191         if (rc)
3192                 class_unregister_type(LUSTRE_OSC_NAME);
3193 #endif
3194
3195         RETURN(rc);
3196 }
3197
#if defined(__KERNEL__)
/*
 * Module unload hook, installed through module_exit() below.
 *
 * Unregisters the SAN OSC type first (compiled in only for kernels older
 * than 2.5.0, the same condition under which osc_init() registers it) and
 * then the OSC type.  The __exit annotation is kept commented out, as in
 * the existing code.
 */
static void /*__exit*/ osc_exit(void)
{
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
        class_unregister_type(LUSTRE_SANOSC_NAME);
#endif
        class_unregister_type(LUSTRE_OSC_NAME);
}

/* Module metadata. */
MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)");
MODULE_LICENSE("GPL");

/* Module load/unload entry points. */
module_init(osc_init);
module_exit(osc_exit);
#endif /* __KERNEL__ */