Whamcloud - gitweb
- merge 0.7rc1 from b_devel to HEAD (20030612 merge point)
[fs/lustre-release.git] / lustre / ost / ost_handler.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
5  *   Author: Peter J. Braam <braam@clusterfs.com>
6  *   Author: Phil Schwan <phil@clusterfs.com>
7  *
8  *   This file is part of Lustre, http://www.lustre.org.
9  *
10  *   Lustre is free software; you can redistribute it and/or
11  *   modify it under the terms of version 2 of the GNU General Public
12  *   License as published by the Free Software Foundation.
13  *
14  *   Lustre is distributed in the hope that it will be useful,
15  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
16  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  *   GNU General Public License for more details.
18  *
19  *   You should have received a copy of the GNU General Public License
20  *   along with Lustre; if not, write to the Free Software
21  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22  *
23  *  Storage Target Handling functions
24  *  Lustre Object Server Module (OST)
25  *
26  *  This server is single threaded at present (but can easily be multi
27  *  threaded). For testing and management it is treated as an
28  *  obd_device, although it does not export a full OBD method table
29  *  (the requests are coming in over the wire, so object target
30  *  modules do not have a full method table.)
31  */
32
33 #define EXPORT_SYMTAB
34 #define DEBUG_SUBSYSTEM S_OST
35
36 #include <linux/module.h>
37 #include <linux/obd_ost.h>
38 #include <linux/lustre_net.h>
39 #include <linux/lustre_dlm.h>
40 #include <linux/lustre_export.h>
41 #include <linux/init.h>
42 #include <linux/lprocfs_status.h>
43
44 inline void oti_to_request(struct obd_trans_info *oti,
45                            struct ptlrpc_request *req)
46 {
47         int i;
48         struct oti_req_ack_lock *ack_lock;
49
50         if(oti == NULL)
51                 return;
52
53         if (req->rq_repmsg)
54                 req->rq_repmsg->transno = oti->oti_transno;
55
56         /* XXX 4 == entries in oti_ack_locks??? */
57         for (ack_lock = oti->oti_ack_locks, i = 0; i < 4; i++, ack_lock++) {
58                 if (!ack_lock->mode)
59                         break;
60                 memcpy(&req->rq_ack_locks[i].lock, &ack_lock->lock,
61                        sizeof(req->rq_ack_locks[i].lock));
62                 req->rq_ack_locks[i].mode = ack_lock->mode;
63         }
64         EXIT;
65 }
66
67 static int ost_destroy(struct ptlrpc_request *req, struct obd_trans_info *oti)
68 {
69         struct lustre_handle *conn = &req->rq_reqmsg->handle;
70         struct ost_body *body;
71         int rc, size = sizeof(*body);
72         ENTRY;
73
74         body = lustre_swab_reqbuf (req, 0, sizeof (*body),
75                                    lustre_swab_ost_body);
76         if (body == NULL)
77                 RETURN (-EFAULT);
78
79         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
80         if (rc)
81                 RETURN(rc);
82
83         req->rq_status = obd_destroy(conn, &body->oa, NULL, oti);
84         RETURN(0);
85 }
86
87 static int ost_getattr(struct ptlrpc_request *req)
88 {
89         struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
90         struct ost_body *body, *repbody;
91         int rc, size = sizeof(*body);
92         ENTRY;
93
94         body = lustre_swab_reqbuf (req, 0, sizeof (*body),
95                                    lustre_swab_ost_body);
96         if (body == NULL)
97                 RETURN (-EFAULT);
98
99         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
100         if (rc)
101                 RETURN(rc);
102
103         repbody = lustre_msg_buf (req->rq_repmsg, 0, sizeof (*repbody));
104         memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
105         req->rq_status = obd_getattr(conn, &repbody->oa, NULL);
106         RETURN(0);
107 }
108
109 static int ost_statfs(struct ptlrpc_request *req)
110 {
111         struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
112         struct obd_statfs *osfs;
113         int rc, size = sizeof(*osfs);
114         ENTRY;
115
116         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
117         if (rc)
118                 RETURN(rc);
119
120         osfs = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*osfs));
121         memset(osfs, 0, size);
122
123         req->rq_status = obd_statfs(conn, osfs);
124         if (req->rq_status != 0)
125                 CERROR("ost: statfs failed: rc %d\n", req->rq_status);
126
127         RETURN(0);
128 }
129
130 static int ost_syncfs(struct ptlrpc_request *req)
131 {
132         struct obd_statfs *osfs;
133         int rc, size = sizeof(*osfs);
134         ENTRY;
135
136         rc = lustre_pack_msg(0, &size, NULL, &req->rq_replen, &req->rq_repmsg);
137         if (rc)
138                 RETURN(rc);
139
140         rc = obd_syncfs(req->rq_export);
141         if (rc) {
142                 CERROR("ost: syncfs failed: rc %d\n", rc);
143                 req->rq_status = rc;
144                 RETURN(rc);
145         }
146
147         RETURN(0);
148 }
149
150 static int ost_open(struct ptlrpc_request *req, struct obd_trans_info *oti)
151 {
152         struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
153         struct ost_body *body, *repbody;
154         int rc, size = sizeof(*repbody);
155         ENTRY;
156
157         body = lustre_swab_reqbuf (req, 0, sizeof (*body),
158                                    lustre_swab_ost_body);
159         if (body == NULL)
160                 return (-EFAULT);
161
162         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
163         if (rc)
164                 RETURN(rc);
165
166         repbody = lustre_msg_buf (req->rq_repmsg, 0, sizeof (*repbody));
167         memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
168         req->rq_status = obd_open(conn, &repbody->oa, NULL, oti, NULL);
169         RETURN(0);
170 }
171
172 static int ost_close(struct ptlrpc_request *req, struct obd_trans_info *oti)
173 {
174         struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
175         struct ost_body *body, *repbody;
176         int rc, size = sizeof(*repbody);
177         ENTRY;
178
179         body = lustre_swab_reqbuf (req, 0, sizeof (*body),
180                                    lustre_swab_ost_body);
181         if (body == NULL)
182                 RETURN (-EFAULT);
183
184         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
185         if (rc)
186                 RETURN(rc);
187
188         repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*repbody));
189         memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
190         req->rq_status = obd_close(conn, &repbody->oa, NULL, oti);
191         RETURN(0);
192 }
193
194 static int ost_create(struct ptlrpc_request *req, struct obd_trans_info *oti)
195 {
196         struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
197         struct ost_body *body, *repbody;
198         int rc, size = sizeof(*repbody);
199         ENTRY;
200
201         body = lustre_swab_reqbuf (req, 0, sizeof (*body),
202                                    lustre_swab_ost_body);
203         if (body == NULL)
204                 RETURN (-EFAULT);
205
206         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
207         if (rc)
208                 RETURN(rc);
209
210         repbody = lustre_msg_buf (req->rq_repmsg, 0, sizeof (*repbody));
211         memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
212         req->rq_status = obd_create(conn, &repbody->oa, NULL, oti);
213         RETURN(0);
214 }
215
216 static int ost_punch(struct ptlrpc_request *req, struct obd_trans_info *oti)
217 {
218         struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
219         struct ost_body *body, *repbody;
220         int rc, size = sizeof(*repbody);
221         ENTRY;
222
223         body = lustre_swab_reqbuf (req, 0, sizeof (*body),
224                                    lustre_swab_ost_body);
225         if (body == NULL)
226                 RETURN (-EFAULT);
227
228         if ((body->oa.o_valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) !=
229             (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS))
230                 RETURN(-EINVAL);
231
232         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
233         if (rc)
234                 RETURN(rc);
235
236         repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*repbody));
237         memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
238         req->rq_status = obd_punch(conn, &repbody->oa, NULL, repbody->oa.o_size,
239                                    repbody->oa.o_blocks, oti);
240         RETURN(0);
241 }
242
243 static int ost_setattr(struct ptlrpc_request *req, struct obd_trans_info *oti)
244 {
245         struct lustre_handle *conn = &req->rq_reqmsg->handle;
246         struct ost_body *body, *repbody;
247         int rc, size = sizeof(*repbody);
248         ENTRY;
249
250         body = lustre_swab_reqbuf (req, 0, sizeof (*body),
251                                    lustre_swab_ost_body);
252         if (body == NULL)
253                 RETURN (-EFAULT);
254
255         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
256         if (rc)
257                 RETURN(rc);
258
259         repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*repbody));
260         memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
261
262         req->rq_status = obd_setattr(conn, &repbody->oa, NULL, oti);
263         RETURN(0);
264 }
265
266 static int ost_bulk_timeout(void *data)
267 {
268         ENTRY;
269         /* We don't fail the connection here, because having the export
270          * killed makes the (vital) call to commitrw very sad.
271          */
272         RETURN(1);
273 }
274
275 static int get_per_page_niobufs (struct obd_ioobj *ioo, int nioo,
276                                  struct niobuf_remote *rnb, int nrnb,
277                                  struct niobuf_remote **pp_rnbp)
278 {
279         /* Copy a remote niobuf, splitting it into page-sized chunks
280          * and setting ioo[i].ioo_bufcnt accordingly */
281         struct niobuf_remote *pp_rnb;
282         int   i;
283         int   j;
284         int   page;
285         int   rnbidx = 0;
286         int   npages = 0;
287
288         /* first count and check the number of pages required */
289         for (i = 0; i < nioo; i++)
290                 for (j = 0; j < ioo->ioo_bufcnt; j++, rnbidx++) {
291                         obd_off offset = rnb[rnbidx].offset;
292                         obd_off p0 = offset >> PAGE_SHIFT;
293                         obd_off pn = (offset + rnb[rnbidx].len - 1)>>PAGE_SHIFT;
294
295                         LASSERT (rnbidx < nrnb);
296
297                         npages += (pn + 1 - p0);
298
299                         if (rnb[rnbidx].len == 0) {
300                                 CERROR("zero len BRW: obj %d objid "LPX64
301                                        " buf %u\n", i, ioo[i].ioo_id, j);
302                                 return (-EINVAL);
303                         }
304                         if (j > 0 &&
305                             rnb[rnbidx].offset <= rnb[rnbidx-1].offset) {
306                                 CERROR("unordered BRW: obj %d objid "LPX64
307                                        " buf %u offset "LPX64" <= "LPX64"\n",
308                                        i, ioo[i].ioo_id, j, rnb[rnbidx].offset,
309                                        rnb[rnbidx].offset);
310                                 return (-EINVAL);
311                         }
312                 }
313
314         LASSERT (rnbidx == nrnb);
315
316         if (npages == nrnb) {       /* all niobufs are for single pages */
317                 *pp_rnbp = rnb;
318                 return (npages);
319         }
320
321         OBD_ALLOC (pp_rnb, sizeof (*pp_rnb) * npages);
322         if (pp_rnb == NULL)
323                 return (-ENOMEM);
324
325         /* now do the actual split */
326         page = rnbidx = 0;
327         for (i = 0; i < nioo; i++) {
328                 int  obj_pages = 0;
329
330                 for (j = 0; j < ioo[i].ioo_bufcnt; j++, rnbidx++) {
331                         obd_off off = rnb[rnbidx].offset;
332                         int     nob = rnb[rnbidx].len;
333
334                         LASSERT (rnbidx < nrnb);
335                         do {
336                                 obd_off  poff = off & (PAGE_SIZE - 1);
337                                 int      pnob = (poff + nob > PAGE_SIZE) ?
338                                                 PAGE_SIZE - poff : nob;
339
340                                 LASSERT (page < npages);
341                                 pp_rnb[page].len = pnob;
342                                 pp_rnb[page].offset = off;
343                                 pp_rnb[page].flags = rnb->flags;
344
345                                 CDEBUG (D_PAGE, "   obj %d id "LPX64
346                                         "page %d(%d) "LPX64" for %d\n",
347                                         i, ioo[i].ioo_id, obj_pages, page,
348                                         pp_rnb[page].offset, pp_rnb[page].len);
349                                 page++;
350                                 obj_pages++;
351
352                                 off += pnob;
353                                 nob -= pnob;
354                         } while (nob > 0);
355                         LASSERT (nob == 0);
356                 }
357                 ioo[i].ioo_bufcnt = obj_pages;
358         }
359         LASSERT (page == npages);
360
361         *pp_rnbp = pp_rnb;
362         return (npages);
363 }
364
365 static void free_per_page_niobufs (int npages, struct niobuf_remote *pp_rnb,
366                                    struct niobuf_remote *rnb)
367 {
368         if (pp_rnb == rnb)                      /* didn't allocate above */
369                 return;
370
371         OBD_FREE (pp_rnb, sizeof (*pp_rnb) * npages);
372 }
373
374 #if CHECKSUM_BULK
375 __u64 ost_checksum_bulk (struct ptlrpc_bulk_desc *desc)
376 {
377         __u64             cksum = 0;
378         struct list_head *tmp;
379         char             *ptr;
380
381         list_for_each (tmp, &desc->bd_page_list) {
382                 struct ptlrpc_bulk_page *bp;
383
384                 bp = list_entry (tmp, struct ptlrpc_bulk_page, bp_link);
385                 ptr = kmap (bp->bp_page);
386                 ost_checksum (&cksum, ptr + bp->bp_pageoffset, bp->bp_buflen);
387                 kunmap (bp->bp_page);
388         }
389 }
390 #endif
391
392 static int ost_brw_read(struct ptlrpc_request *req)
393 {
394         struct ptlrpc_bulk_desc *desc;
395         struct niobuf_remote    *remote_nb;
396         struct niobuf_remote    *pp_rnb;
397         struct niobuf_local     *local_nb;
398         struct obd_ioobj        *ioo;
399         struct ost_body         *body;
400         struct l_wait_info       lwi;
401         void                    *desc_priv = NULL;
402         int                      size[1] = { sizeof(*body) };
403         int                      comms_error = 0;
404         int                      niocount;
405         int                      npages;
406         int                      nob = 0;
407         int                      rc;
408         int                      i;
409         ENTRY;
410
411         if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK))
412                 GOTO(out, rc = -EIO);
413
414         body = lustre_swab_reqbuf (req, 0, sizeof (*body),
415                                    lustre_swab_ost_body);
416         if (body == NULL) {
417                 CERROR ("Missing/short ost_body\n");
418                 GOTO (out, rc = -EFAULT);
419         }
420
421         ioo = lustre_swab_reqbuf (req, 1, sizeof (*ioo),
422                                   lustre_swab_obd_ioobj);
423         if (ioo == NULL) {
424                 CERROR ("Missing/short ioobj\n");
425                 GOTO (out, rc = -EFAULT);
426         }
427
428         niocount = ioo->ioo_bufcnt;
429         remote_nb = lustre_swab_reqbuf(req, 2, niocount * sizeof (*remote_nb),
430                                        lustre_swab_niobuf_remote);
431         if (remote_nb == NULL) {
432                 CERROR ("Missing/short niobuf\n");
433                 GOTO (out, rc = -EFAULT);
434         }
435         if (lustre_msg_swabbed (req->rq_reqmsg)) { /* swab remaining niobufs */
436                 for (i = 1; i < niocount; i++)
437                         lustre_swab_niobuf_remote (&remote_nb[i]);
438         }
439
440         rc = lustre_pack_msg(1, size, NULL, &req->rq_replen, &req->rq_repmsg);
441         if (rc)
442                 GOTO(out, rc);
443
444         /* CAVEAT EMPTOR this sets ioo->ioo_bufcnt to # pages */
445         npages = get_per_page_niobufs (ioo, 1, remote_nb, niocount, &pp_rnb);
446         if (npages < 0)
447                 GOTO(out, rc = npages);
448
449         OBD_ALLOC(local_nb, sizeof(*local_nb) * npages);
450         if (local_nb == NULL)
451                 GOTO(out_pp_rnb, rc = -ENOMEM);
452
453         desc = ptlrpc_prep_bulk_exp (req, BULK_PUT_SOURCE, OST_BULK_PORTAL);
454         if (desc == NULL)
455                 GOTO(out_local, rc = -ENOMEM);
456
457         rc = obd_preprw(OBD_BRW_READ, req->rq_export, 1, ioo, npages,
458                         pp_rnb, local_nb, &desc_priv, NULL);
459         if (rc != 0)
460                 GOTO(out_bulk, rc);
461
462         nob = 0;
463         for (i = 0; i < npages; i++) {
464                 int page_rc = local_nb[i].rc;
465
466                 if (page_rc < 0) {              /* error */
467                         rc = page_rc;
468                         break;
469                 }
470
471                 LASSERT (page_rc <= pp_rnb[i].len);
472                 nob += page_rc;
473                 if (page_rc != 0) {             /* some data! */
474                         LASSERT (local_nb[i].page != NULL);
475                         rc = ptlrpc_prep_bulk_page(desc, local_nb[i].page,
476                                                    pp_rnb[i].offset& ~PAGE_MASK,
477                                                    page_rc);
478                         if (rc != 0)
479                                 break;
480                 }
481
482                 if (page_rc != pp_rnb[i].len) { /* short read */
483                         /* All subsequent pages should be 0 */
484                         while (++i < npages)
485                                 LASSERT (local_nb[i].rc == 0);
486                         break;
487                 }
488         }
489
490         if (rc == 0) {
491                 rc = ptlrpc_bulk_put(desc);
492                 if (rc == 0) {
493                         lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout,
494                                           desc);
495                         rc = l_wait_event(desc->bd_waitq,
496                                           ptlrpc_bulk_complete(desc), &lwi);
497                         if (rc) {
498                                 LASSERT(rc == -ETIMEDOUT);
499                                 CERROR ("timeout waiting for bulk PUT\n");
500                                 ptlrpc_abort_bulk (desc);
501                         }
502                 }
503                 comms_error = rc != 0;
504         }
505
506         /* Must commit after prep above in all cases */
507         rc = obd_commitrw(OBD_BRW_READ, req->rq_export, 1, ioo, npages,
508                           local_nb, desc_priv, NULL);
509
510 #if CHECKSUM_BULK
511         if (rc == 0) {
512                 body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
513                 body->oa.o_rdev = ost_checksum_bulk (desc);
514                 body->oa.o_valid |= OBD_MD_FLCKSUM;
515         }
516 #endif
517
518  out_bulk:
519         ptlrpc_free_bulk (desc);
520  out_local:
521         OBD_FREE(local_nb, sizeof(*local_nb) * npages);
522  out_pp_rnb:
523         free_per_page_niobufs (npages, pp_rnb, remote_nb);
524  out:
525         LASSERT (rc <= 0);
526         if (rc == 0) {
527                 req->rq_status = nob;
528                 ptlrpc_reply(req);
529         } else if (!comms_error) {
530                 /* only reply if comms OK */
531                 req->rq_status = rc;
532                 ptlrpc_error(req);
533         } else {
534                 if (req->rq_repmsg != NULL) {
535                         /* reply out callback would free */
536                         OBD_FREE (req->rq_repmsg, req->rq_replen);
537                 }
538                 CERROR("bulk IO comms error: evicting %s@%s nid "LPU64"\n",
539                        req->rq_export->exp_client_uuid.uuid,
540                        req->rq_connection->c_remote_uuid.uuid,
541                        req->rq_connection->c_peer.peer_nid);
542                 ptlrpc_fail_export(req->rq_export);
543         }
544
545         RETURN(rc);
546 }
547
548 static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
549 {
550         struct ptlrpc_bulk_desc *desc;
551         struct niobuf_remote    *remote_nb;
552         struct niobuf_remote    *pp_rnb;
553         struct niobuf_local     *local_nb;
554         struct obd_ioobj        *ioo;
555         struct ost_body         *body;
556         struct l_wait_info       lwi;
557         void                    *desc_priv = NULL;
558         __u32                   *rcs;
559         int                      size[2] = { sizeof (*body) };
560         int                      objcount, niocount, npages;
561         int                      comms_error = 0;
562         int                      rc, rc2, swab, i, j;
563         ENTRY;
564
565         if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK))
566                 GOTO(out, rc = -EIO);
567
568         swab = lustre_msg_swabbed (req->rq_reqmsg);
569         body = lustre_swab_reqbuf (req, 0, sizeof (*body),
570                                    lustre_swab_ost_body);
571         if (body == NULL) {
572                 CERROR ("Missing/short ost_body\n");
573                 GOTO(out, rc = -EFAULT);
574         }
575
576         LASSERT_REQSWAB (req, 1);
577         objcount = req->rq_reqmsg->buflens[1] / sizeof(*ioo);
578         if (objcount == 0) {
579                 CERROR ("Missing/short ioobj\n");
580                 GOTO (out, rc = -EFAULT);
581         }
582         ioo = lustre_msg_buf (req->rq_reqmsg, 1, objcount * sizeof (*ioo));
583         LASSERT (ioo != NULL);
584         for (niocount = i = 0; i < objcount; i++) {
585                 if (swab)
586                         lustre_swab_obd_ioobj (&ioo[i]);
587                 if (ioo[i].ioo_bufcnt == 0) {
588                         CERROR ("ioo[%d] has zero bufcnt\n", i);
589                         GOTO (out, rc = -EFAULT);
590                 }
591                 niocount += ioo[i].ioo_bufcnt;
592         }
593
594         remote_nb = lustre_swab_reqbuf(req, 2, niocount * sizeof (*remote_nb),
595                                        lustre_swab_niobuf_remote);
596         if (remote_nb == NULL) {
597                 CERROR ("Missing/short niobuf\n");
598                 GOTO(out, rc = -EFAULT);
599         }
600         if (swab) {                             /* swab the remaining niobufs */
601                 for (i = 1; i < niocount; i++)
602                         lustre_swab_niobuf_remote (&remote_nb[i]);
603         }
604
605         size[1] = niocount * sizeof (*rcs);
606         rc = lustre_pack_msg(2, size, NULL, &req->rq_replen,
607                              &req->rq_repmsg);
608         if (rc != 0)
609                 GOTO (out, rc);
610         rcs = lustre_msg_buf (req->rq_repmsg, 1, niocount * sizeof (*rcs));
611
612         /* CAVEAT EMPTOR this sets ioo->ioo_bufcnt to # pages */
613         npages = get_per_page_niobufs(ioo, objcount,remote_nb,niocount,&pp_rnb);
614         if (npages < 0)
615                 GOTO (out, rc = npages);
616
617         OBD_ALLOC(local_nb, sizeof(*local_nb) * npages);
618         if (local_nb == NULL)
619                 GOTO(out_pp_rnb, rc = -ENOMEM);
620
621         desc = ptlrpc_prep_bulk_exp (req, BULK_GET_SINK, OST_BULK_PORTAL);
622         if (desc == NULL)
623                 GOTO(out_local, rc = -ENOMEM);
624
625         rc = obd_preprw(OBD_BRW_WRITE, req->rq_export, objcount, ioo,
626                         npages, pp_rnb, local_nb, &desc_priv, oti);
627         if (rc != 0)
628                 GOTO (out_bulk, rc);
629
630         /* NB Having prepped, we must commit... */
631
632         for (i = 0; i < npages; i++) {
633                 rc = ptlrpc_prep_bulk_page(desc, local_nb[i].page,
634                                            pp_rnb[i].offset & (PAGE_SIZE - 1),
635                                            pp_rnb[i].len);
636                 if (rc != 0)
637                         break;
638         }
639
640         if (rc == 0) {
641                 rc = ptlrpc_bulk_get(desc);
642                 if (rc == 0) {
643                         lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout,
644                                           desc);
645                         rc = l_wait_event(desc->bd_waitq,
646                                           ptlrpc_bulk_complete(desc), &lwi);
647                         if (rc) {
648                                 LASSERT(rc == -ETIMEDOUT);
649                                 CERROR ("timeout waiting for bulk GET\n");
650                                 ptlrpc_abort_bulk (desc);
651                         }
652                 }
653                 comms_error = rc != 0;
654         }
655
656 #if CHECKSUM_BULK
657         if (rc == 0 && (body->oa.o_valid & OBD_MD_FLCKSUM) != 0) {
658                 static int cksum_counter;
659                 __u64 client_cksum = body->oa.o_rdev;
660                 __u64 cksum = ost_checksum_bulk (desc);
661
662                 if (client_cksum != cksum) {
663                         CERROR("Bad checksum: client "LPX64", server "LPX64
664                                ", client NID "LPX64"\n", client_cksum, cksum,
665                                req->rq_connection->c_peer.peer_nid);
666                         cksum_counter = 1;
667                 } else {
668                         cksum_counter++;
669                         if ((cksum_counter & (-cksum_counter)) == cksum_counter)
670                                 CERROR("Checksum %d from "LPX64": "LPX64" OK\n",
671                                         cksum_counter,
672                                         req->rq_connection->c_peer.peer_nid,
673                                         cksum);
674                 }
675         }
676 #endif
677         /* Must commit after prep above in all cases */
678         rc2 = obd_commitrw(OBD_BRW_WRITE, req->rq_export, objcount, ioo,
679                            npages, local_nb, desc_priv, oti);
680
681         if (rc == 0) {
682                 /* set per-requested niobuf return codes */
683                 for (i = j = 0; i < niocount; i++) {
684                         int nob = remote_nb[i].len;
685
686                         rcs[i] = 0;
687                         do {
688                                 LASSERT (j < npages);
689                                 if (local_nb[j].rc < 0)
690                                         rcs[i] = local_nb[j].rc;
691                                 nob -= pp_rnb[j].len;
692                                 j++;
693                         } while (nob > 0);
694                         LASSERT (nob == 0);
695                 }
696                 LASSERT (j == npages);
697         }
698         if (rc == 0)
699                 rc = rc2;
700
701  out_bulk:
702         ptlrpc_free_bulk (desc);
703  out_local:
704         OBD_FREE(local_nb, sizeof(*local_nb) * npages);
705  out_pp_rnb:
706         free_per_page_niobufs (npages, pp_rnb, remote_nb);
707  out:
708         if (rc == 0) {
709                 oti_to_request(oti, req);
710                 rc = ptlrpc_reply(req);
711         } else if (!comms_error) {
712                 /* Only reply if there was no comms problem with bulk */
713                 req->rq_status = rc;
714                 ptlrpc_error(req);
715         } else {
716                 if (req->rq_repmsg != NULL) {
717                         /* reply out callback would free */
718                         OBD_FREE (req->rq_repmsg, req->rq_replen);
719                 }
720                 CERROR("bulk IO comms error: evicting %s@%s nid "LPU64"\n",
721                        req->rq_export->exp_client_uuid.uuid,
722                        req->rq_connection->c_remote_uuid.uuid,
723                        req->rq_connection->c_peer.peer_nid);
724                 ptlrpc_fail_export(req->rq_export);
725         }
726         RETURN(rc);
727 }
728
729 static int ost_san_brw(struct ptlrpc_request *req, int cmd)
730 {
731         struct lustre_handle *conn = &req->rq_reqmsg->handle;
732         struct niobuf_remote *remote_nb, *res_nb;
733         struct obd_ioobj *ioo;
734         struct ost_body *body;
735         int rc, i, j, objcount, niocount, size[2] = {sizeof(*body)};
736         int n;
737         int swab;
738         ENTRY;
739
740         /* XXX not set to use latest protocol */
741
742         swab = lustre_msg_swabbed (req->rq_reqmsg);
743         body = lustre_swab_reqbuf (req, 0, sizeof (*body),
744                                    lustre_swab_ost_body);
745         if (body == NULL) {
746                 CERROR ("Missing/short ost_body\n");
747                 GOTO (out, rc = -EFAULT);
748         }
749
750         ioo = lustre_swab_reqbuf(req, 1, sizeof (*ioo),
751                                  lustre_swab_obd_ioobj);
752         if (ioo == NULL) {
753                 CERROR ("Missing/short ioobj\n");
754                 GOTO (out, rc = -EFAULT);
755         }
756         objcount = req->rq_reqmsg->buflens[1] / sizeof(*ioo);
757         niocount = ioo[0].ioo_bufcnt;
758         for (i = 1; i < objcount; i++) {
759                 if (swab)
760                         lustre_swab_obd_ioobj (&ioo[i]);
761                 niocount += ioo[i].ioo_bufcnt;
762         }
763
764         remote_nb = lustre_swab_reqbuf(req, 2, niocount * sizeof (*remote_nb),
765                                        lustre_swab_niobuf_remote);
766         if (remote_nb == NULL) {
767                 CERROR ("Missing/short niobuf\n");
768                 GOTO (out, rc = -EFAULT);
769         }
770         if (swab) {                             /* swab the remaining niobufs */
771                 for (i = 1; i < niocount; i++)
772                         lustre_swab_niobuf_remote (&remote_nb[i]);
773         }
774
775         for (i = n = 0; i < objcount; i++) {
776                 for (j = 0; j < ioo[i].ioo_bufcnt; j++, n++) {
777                         if (remote_nb[n].len == 0) {
778                                 CERROR("zero len BRW: objid "LPX64" buf %u\n",
779                                        ioo[i].ioo_id, j);
780                                 GOTO(out, rc = -EINVAL);
781                         }
782                         if (j && remote_nb[n].offset <= remote_nb[n-1].offset) {
783                                 CERROR("unordered BRW: objid "LPX64
784                                        " buf %u offset "LPX64" <= "LPX64"\n",
785                                        ioo[i].ioo_id, j, remote_nb[n].offset,
786                                        remote_nb[n-1].offset);
787                                 GOTO(out, rc = -EINVAL);
788                         }
789                 }
790         }
791
792         size[1] = niocount * sizeof(*remote_nb);
793         rc = lustre_pack_msg(2, size, NULL, &req->rq_replen, &req->rq_repmsg);
794         if (rc)
795                 GOTO(out, rc);
796
797         req->rq_status = obd_san_preprw(cmd, conn, objcount, ioo,
798                                         niocount, remote_nb);
799
800         if (req->rq_status)
801                 GOTO (out, rc = 0);
802
803         res_nb = lustre_msg_buf(req->rq_repmsg, 1, size[1]);
804         memcpy (res_nb, remote_nb, size[1]);
805         rc = 0;
806 out:
807         if (rc) {
808                 OBD_FREE(req->rq_repmsg, req->rq_replen);
809                 req->rq_repmsg = NULL;
810                 req->rq_status = rc;
811                 ptlrpc_error(req);
812         } else
813                 ptlrpc_reply(req);
814
815         return rc;
816 }
817
818 static int filter_recovery_request(struct ptlrpc_request *req,
819                                    struct obd_device *obd, int *process)
820 {
821         switch (req->rq_reqmsg->opc) {
822         case OST_CONNECT: /* This will never get here, but for completeness. */
823         case OST_DISCONNECT:
824                *process = 1;
825                RETURN(0);
826
827         case OBD_PING:
828         case OST_CLOSE:
829         case OST_CREATE:
830         case OST_DESTROY:
831         case OST_OPEN:
832         case OST_PUNCH:
833         case OST_SETATTR: 
834         case OST_SYNCFS:
835         case OST_WRITE:
836         case LDLM_ENQUEUE:
837                 *process = target_queue_recovery_request(req, obd);
838                 RETURN(0);
839
840         default:
841                 DEBUG_REQ(D_ERROR, req, "not permitted during recovery");
842                 *process = 0;
843                 /* XXX what should we set rq_status to here? */
844                 req->rq_status = -EAGAIN;
845                 RETURN(ptlrpc_error(req));
846         }
847 }
848
849
850
851 static int ost_handle(struct ptlrpc_request *req)
852 {
853         struct obd_trans_info trans_info = { 0, }, *oti = &trans_info;
854         int should_process, fail = OBD_FAIL_OST_ALL_REPLY_NET, rc = 0;
855         ENTRY;
856
857         /* XXX identical to MDS */
858         if (req->rq_reqmsg->opc != OST_CONNECT) {
859                 struct obd_device *obd;
860                 int abort_recovery, recovering;
861
862                 if (req->rq_export == NULL) {
863                         CERROR("lustre_ost: operation %d on unconnected OST\n",
864                                req->rq_reqmsg->opc);
865                         req->rq_status = -ENOTCONN;
866                         GOTO(out, rc = -ENOTCONN);
867                 }
868
869                 obd = req->rq_export->exp_obd;
870
871                 /* Check for aborted recovery. */
872                 spin_lock_bh(&obd->obd_processing_task_lock);
873                 abort_recovery = obd->obd_abort_recovery;
874                 recovering = obd->obd_recovering;
875                 spin_unlock_bh(&obd->obd_processing_task_lock);
876                 if (abort_recovery) {
877                         target_abort_recovery(obd);
878                 } else if (recovering) {
879                         rc = filter_recovery_request(req, obd, &should_process);
880                         if (rc || !should_process)
881                                 RETURN(rc);
882                 }
883         } 
884
885         if (strcmp(req->rq_obd->obd_type->typ_name, "ost") != 0)
886                 GOTO(out, rc = -EINVAL);
887
888         switch (req->rq_reqmsg->opc) {
889         case OST_CONNECT:
890                 CDEBUG(D_INODE, "connect\n");
891                 OBD_FAIL_RETURN(OBD_FAIL_OST_CONNECT_NET, 0);
892                 rc = target_handle_connect(req, ost_handle);
893                 break;
894         case OST_DISCONNECT:
895                 CDEBUG(D_INODE, "disconnect\n");
896                 OBD_FAIL_RETURN(OBD_FAIL_OST_DISCONNECT_NET, 0);
897                 rc = target_handle_disconnect(req);
898                 break;
899         case OST_CREATE:
900                 CDEBUG(D_INODE, "create\n");
901                 OBD_FAIL_RETURN(OBD_FAIL_OST_CREATE_NET, 0);
902                 rc = ost_create(req, oti);
903                 break;
904         case OST_DESTROY:
905                 CDEBUG(D_INODE, "destroy\n");
906                 OBD_FAIL_RETURN(OBD_FAIL_OST_DESTROY_NET, 0);
907                 rc = ost_destroy(req, oti);
908                 break;
909         case OST_GETATTR:
910                 CDEBUG(D_INODE, "getattr\n");
911                 OBD_FAIL_RETURN(OBD_FAIL_OST_GETATTR_NET, 0);
912                 rc = ost_getattr(req);
913                 break;
914         case OST_SETATTR:
915                 CDEBUG(D_INODE, "setattr\n");
916                 OBD_FAIL_RETURN(OBD_FAIL_OST_SETATTR_NET, 0);
917                 rc = ost_setattr(req, oti);
918                 break;
919         case OST_OPEN:
920                 CDEBUG(D_INODE, "open\n");
921                 OBD_FAIL_RETURN(OBD_FAIL_OST_OPEN_NET, 0);
922                 rc = ost_open(req, oti);
923                 break;
924         case OST_CLOSE:
925                 CDEBUG(D_INODE, "close\n");
926                 OBD_FAIL_RETURN(OBD_FAIL_OST_CLOSE_NET, 0);
927                 rc = ost_close(req, oti);
928                 break;
929         case OST_WRITE:
930                 CDEBUG(D_INODE, "write\n");
931                 OBD_FAIL_RETURN(OBD_FAIL_OST_BRW_NET, 0);
932                 rc = ost_brw_write(req, oti);
933                 /* ost_brw sends its own replies */
934                 RETURN(rc);
935         case OST_READ:
936                 CDEBUG(D_INODE, "read\n");
937                 OBD_FAIL_RETURN(OBD_FAIL_OST_BRW_NET, 0);
938                 rc = ost_brw_read(req);
939                 /* ost_brw sends its own replies */
940                 RETURN(rc);
941         case OST_SAN_READ:
942                 CDEBUG(D_INODE, "san read\n");
943                 OBD_FAIL_RETURN(OBD_FAIL_OST_BRW_NET, 0);
944                 rc = ost_san_brw(req, OBD_BRW_READ);
945                 /* ost_san_brw sends its own replies */
946                 RETURN(rc);
947         case OST_SAN_WRITE:
948                 CDEBUG(D_INODE, "san write\n");
949                 OBD_FAIL_RETURN(OBD_FAIL_OST_BRW_NET, 0);
950                 rc = ost_san_brw(req, OBD_BRW_WRITE);
951                 /* ost_san_brw sends its own replies */
952                 RETURN(rc);
953         case OST_PUNCH:
954                 CDEBUG(D_INODE, "punch\n");
955                 OBD_FAIL_RETURN(OBD_FAIL_OST_PUNCH_NET, 0);
956                 rc = ost_punch(req, oti);
957                 break;
958         case OST_STATFS:
959                 CDEBUG(D_INODE, "statfs\n");
960                 OBD_FAIL_RETURN(OBD_FAIL_OST_STATFS_NET, 0);
961                 rc = ost_statfs(req);
962                 break;
963         case OST_SYNCFS:
964                 CDEBUG(D_INODE, "sync\n");
965                 OBD_FAIL_RETURN(OBD_FAIL_OST_SYNCFS_NET, 0);
966                 rc = ost_syncfs(req);
967                 break;
968         case OBD_PING:
969                 DEBUG_REQ(D_INODE, req, "ping");
970                 rc = target_handle_ping(req);
971                 break;
972         case LDLM_ENQUEUE:
973                 CDEBUG(D_INODE, "enqueue\n");
974                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_ENQUEUE, 0);
975                 rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast,
976                                          ldlm_server_blocking_ast);
977                 fail = OBD_FAIL_OST_LDLM_REPLY_NET;
978                 break;
979         case LDLM_CONVERT:
980                 CDEBUG(D_INODE, "convert\n");
981                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_CONVERT, 0);
982                 rc = ldlm_handle_convert(req);
983                 break;
984         case LDLM_CANCEL:
985                 CDEBUG(D_INODE, "cancel\n");
986                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_CANCEL, 0);
987                 rc = ldlm_handle_cancel(req);
988                 break;
989         case LDLM_BL_CALLBACK:
990         case LDLM_CP_CALLBACK:
991                 CDEBUG(D_INODE, "callback\n");
992                 CERROR("callbacks should not happen on OST\n");
993                 /* fall through */
994         default:
995                 CERROR("Unexpected opcode %d\n", req->rq_reqmsg->opc);
996                 req->rq_status = -ENOTSUPP;
997                 rc = ptlrpc_error(req);
998                 RETURN(rc);
999         }
1000
1001         EXIT;
1002         /* If we're DISCONNECTing, the export_data is already freed */
1003         if (!rc && req->rq_reqmsg->opc != OST_DISCONNECT) {
1004                 struct obd_device *obd  = req->rq_export->exp_obd;
1005                 if (!obd->obd_no_transno) {
1006                         req->rq_repmsg->last_committed =
1007                                 obd->obd_last_committed;
1008                 } else {
1009                         DEBUG_REQ(D_IOCTL, req,
1010                                   "not sending last_committed update");
1011                 }
1012                 CDEBUG(D_INFO, "last_committed "LPU64", xid "LPX64"\n",
1013                        obd->obd_last_committed, req->rq_xid);
1014         }
1015
1016 out:
1017         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) {
1018                 struct obd_device *obd = req->rq_export->exp_obd;
1019
1020                 if (obd && obd->obd_recovering) {
1021                         DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply");
1022                         return target_queue_final_reply(req, rc);
1023                 }
1024                 /* Lost a race with recovery; let the error path DTRT. */
1025                 rc = req->rq_status = -ENOTCONN;
1026         }
1027
1028         if (!rc)
1029                 oti_to_request(oti, req);
1030
1031         target_send_reply(req, rc, fail);
1032         return 0;
1033 }
1034
1035 static int ost_setup(struct obd_device *obddev, obd_count len, void *buf)
1036 {
1037         struct ost_obd *ost = &obddev->u.ost;
1038         int err;
1039         int i;
1040         ENTRY;
1041
1042         ost->ost_service = ptlrpc_init_svc(OST_NEVENTS, OST_NBUFS,
1043                                            OST_BUFSIZE, OST_MAXREQSIZE,
1044                                            OST_REQUEST_PORTAL, OSC_REPLY_PORTAL,
1045                                            ost_handle, "ost", obddev);
1046         if (!ost->ost_service) {
1047                 CERROR("failed to start service\n");
1048                 GOTO(error_disc, err = -ENOMEM);
1049         }
1050
1051         for (i = 0; i < OST_NUM_THREADS; i++) {
1052                 char name[32];
1053                 sprintf(name, "ll_ost_%02d", i);
1054                 err = ptlrpc_start_thread(obddev, ost->ost_service, name);
1055                 if (err) {
1056                         CERROR("error starting thread #%d: rc %d\n", i, err);
1057                         GOTO(error_disc, err = -EINVAL);
1058                 }
1059         }
1060
1061         RETURN(0);
1062
1063 error_disc:
1064         RETURN(err);
1065 }
1066
1067 static int ost_cleanup(struct obd_device *obddev, int force, int failover)
1068 {
1069         struct ost_obd *ost = &obddev->u.ost;
1070         int err = 0;
1071         ENTRY;
1072
1073         if (obddev->obd_recovering)
1074                 target_cancel_recovery_timer(obddev);
1075
1076         ptlrpc_stop_all_threads(ost->ost_service);
1077         ptlrpc_unregister_service(ost->ost_service);
1078
1079         RETURN(err);
1080 }
1081
1082 int ost_attach(struct obd_device *dev, obd_count len, void *data)
1083 {
1084         struct lprocfs_static_vars lvars;
1085
1086         lprocfs_init_vars(&lvars);
1087         return lprocfs_obd_attach(dev, lvars.obd_vars);
1088 }
1089
/*
 * Detach hook: counterpart of ost_attach().  Returns whatever
 * lprocfs_obd_detach() returns for dev.
 */
int ost_detach(struct obd_device *dev)
{
        int rc = lprocfs_obd_detach(dev);

        return rc;
}
1094
1095 /* I don't think this function is ever used, since nothing 
1096  * connects directly to this module.
1097  */
1098 static int ost_connect(struct lustre_handle *conn,
1099                        struct obd_device *obd, struct obd_uuid *cluuid)
1100 {
1101         struct obd_export *exp;
1102         int rc;
1103         ENTRY;
1104
1105         if (!conn || !obd || !cluuid)
1106                 RETURN(-EINVAL);
1107
1108         rc = class_connect(conn, obd, cluuid);
1109         if (rc)
1110                 RETURN(rc);
1111         exp = class_conn2export(conn);
1112         LASSERT(exp);
1113         class_export_put(exp);
1114
1115         RETURN(0);
1116 }
1117
1118 /* use obd ops to offer management infrastructure */
1119 static struct obd_ops ost_obd_ops = {
1120         o_owner:        THIS_MODULE,
1121         o_attach:       ost_attach,
1122         o_detach:       ost_detach,
1123         o_setup:        ost_setup,
1124         o_cleanup:      ost_cleanup,
1125         o_connect:      ost_connect,
1126 };
1127
1128 static int __init ost_init(void)
1129 {
1130         struct lprocfs_static_vars lvars;
1131         ENTRY;
1132
1133         lprocfs_init_vars(&lvars);
1134         RETURN(class_register_type(&ost_obd_ops, lvars.module_vars,
1135                                    LUSTRE_OST_NAME));
1136 }
1137
1138 static void __exit ost_exit(void)
1139 {
1140         class_unregister_type(LUSTRE_OST_NAME);
1141 }
1142
1143 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1144 MODULE_DESCRIPTION("Lustre Object Storage Target (OST) v0.01");
1145 MODULE_LICENSE("GPL");
1146
1147 module_init(ost_init);
1148 module_exit(ost_exit);