Whamcloud - gitweb
Branch: HEAD
[fs/lustre-release.git] / lustre / mds / handler.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  lustre/mds/handler.c
5  *  Lustre Metadata Server (mds) request handler
6  *
7  *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
8  *   Author: Peter Braam <braam@clusterfs.com>
9  *   Author: Andreas Dilger <adilger@clusterfs.com>
10  *   Author: Phil Schwan <phil@clusterfs.com>
11  *   Author: Mike Shaver <shaver@clusterfs.com>
12  *
13  *   This file is part of Lustre, http://www.lustre.org.
14  *
15  *   Lustre is free software; you can redistribute it and/or
16  *   modify it under the terms of version 2 of the GNU General Public
17  *   License as published by the Free Software Foundation.
18  *
19  *   Lustre is distributed in the hope that it will be useful,
20  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
21  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22  *   GNU General Public License for more details.
23  *
24  *   You should have received a copy of the GNU General Public License
25  *   along with Lustre; if not, write to the Free Software
26  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27  */
28
29 #ifndef EXPORT_SYMTAB
30 # define EXPORT_SYMTAB
31 #endif
32 #define DEBUG_SUBSYSTEM S_MDS
33
34 #include <linux/module.h>
35 #include <linux/lustre_mds.h>
36 #include <linux/lustre_dlm.h>
37 #include <linux/init.h>
38 #include <linux/obd_class.h>
39 #include <linux/random.h>
40 #include <linux/fs.h>
41 #include <linux/jbd.h>
42 #include <linux/namei.h>
43 #include <linux/ext3_fs.h>
44 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
45 # include <linux/smp_lock.h>
46 # include <linux/buffer_head.h>
47 # include <linux/workqueue.h>
48 # include <linux/mount.h>
49 #else
50 # include <linux/locks.h>
51 #endif
52 #include <linux/obd_lov.h>
53 #include <linux/obd_ost.h>
54 #include <linux/lustre_mds.h>
55 #include <linux/lustre_fsfilt.h>
56 #include <linux/lprocfs_status.h>
57 #include <linux/lustre_commit_confd.h>
58 #include <linux/lustre_acl.h>
59 #include <linux/lustre_gs.h>
60 #include "mds_internal.h"
61 #include <linux/lustre_sec.h>
62
63 static int mds_intent_policy(struct ldlm_namespace *ns,
64                              struct ldlm_lock **lockp, void *req_cookie,
65                              ldlm_mode_t mode, int flags, void *data);
66 static int mds_postsetup(struct obd_device *obd);
67 static int mds_cleanup(struct obd_device *obd, int flags);
68
69
70 /* Assumes caller has already pushed into the kernel filesystem context */
71 static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
72                         loff_t offset, int count)
73 {
74         struct ptlrpc_bulk_desc *desc;
75         struct l_wait_info lwi;
76         struct page **pages;
77         int rc = 0, npages, i, tmpcount, tmpsize = 0;
78         ENTRY;
79
80         LASSERT((offset & (PAGE_SIZE - 1)) == 0); /* I'm dubious about this */
81
82         npages = (count + PAGE_SIZE - 1) >> PAGE_SHIFT;
83         OBD_ALLOC(pages, sizeof(*pages) * npages);
84         if (!pages)
85                 GOTO(out, rc = -ENOMEM);
86
87         desc = ptlrpc_prep_bulk_exp(req, npages, BULK_PUT_SOURCE,
88                                     MDS_BULK_PORTAL);
89         if (desc == NULL)
90                 GOTO(out_free, rc = -ENOMEM);
91
92         for (i = 0, tmpcount = count; i < npages; i++, tmpcount -= tmpsize) {
93                 tmpsize = tmpcount > PAGE_SIZE ? PAGE_SIZE : tmpcount;
94
95                 pages[i] = alloc_pages(GFP_KERNEL, 0);
96                 if (pages[i] == NULL)
97                         GOTO(cleanup_buf, rc = -ENOMEM);
98
99                 ptlrpc_prep_bulk_page(desc, pages[i], 0, tmpsize);
100         }
101
102         for (i = 0, tmpcount = count; i < npages; i++, tmpcount -= tmpsize) {
103                 tmpsize = tmpcount > PAGE_SIZE ? PAGE_SIZE : tmpcount;
104                 CDEBUG(D_EXT2, "reading %u@%llu from dir %lu (size %llu)\n",
105                        tmpsize, offset, file->f_dentry->d_inode->i_ino,
106                        file->f_dentry->d_inode->i_size);
107
108                 rc = fsfilt_readpage(req->rq_export->exp_obd, file,
109                                      kmap(pages[i]), tmpsize, &offset);
110                 kunmap(pages[i]);
111
112                 if (rc != tmpsize)
113                         GOTO(cleanup_buf, rc = -EIO);
114         }
115
116         LASSERT(desc->bd_nob == count);
117
118         rc = ptlrpc_start_bulk_transfer(desc);
119         if (rc)
120                 GOTO(cleanup_buf, rc);
121
122         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) {
123                 CERROR("obd_fail_loc=%x, fail operation rc=%d\n",
124                        OBD_FAIL_MDS_SENDPAGE, rc = -EIO);
125                 GOTO(abort_bulk, rc);
126         }
127
128         lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL);
129         rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), &lwi);
130         LASSERT (rc == 0 || rc == -ETIMEDOUT);
131
132         if (rc == 0) {
133                 if (desc->bd_success &&
134                     desc->bd_nob_transferred == count)
135                         GOTO(cleanup_buf, rc);
136
137                 rc = -ETIMEDOUT; /* XXX should this be a different errno? */
138         }
139
140         DEBUG_REQ(D_ERROR, req, "bulk failed: %s %d(%d), evicting %s@%s\n",
141                   (rc == -ETIMEDOUT) ? "timeout" : "network error",
142                   desc->bd_nob_transferred, count,
143                   req->rq_export->exp_client_uuid.uuid,
144                   req->rq_export->exp_connection->c_remote_uuid.uuid);
145
146         ptlrpc_fail_export(req->rq_export);
147
148         EXIT;
149  abort_bulk:
150         ptlrpc_abort_bulk (desc);
151  cleanup_buf:
152         for (i = 0; i < npages; i++)
153                 if (pages[i])
154                         __free_pages(pages[i], 0);
155
156         ptlrpc_free_bulk(desc);
157  out_free:
158         OBD_FREE(pages, sizeof(*pages) * npages);
159  out:
160         return rc;
161 }
162
163 extern char *ldlm_lockname[];
164
165 int mds_lock_mode_for_dir(struct obd_device *obd,
166                           struct dentry *dentry, int mode)
167 {
168         int ret_mode = 0, split;
169
170         /* any dir access needs couple locks:
171          * 1) on part of dir we gonna lookup/modify in
172          * 2) on a whole dir to protect it from concurrent splitting
173          *    and to flush client's cache for readdir()
174          * so, for a given mode and dentry this routine decides what
175          * lock mode to use for lock #2:
176          * 1) if caller's gonna lookup in dir then we need to protect
177          *    dir from being splitted only - LCK_CR
178          * 2) if caller's gonna modify dir then we need to protect
179          *    dir from being splitted and to flush cache - LCK_CW
180          * 3) if caller's gonna modify dir and that dir seems ready
181          *    for splitting then we need to protect it from any
182          *    type of access (lookup/modify/split) - LCK_EX -bzzz */
183
184         split = mds_splitting_expected(obd, dentry);
185         
186         /*
187          * it is important to check here only for MDS_NO_SPLITTABLE. The reason
188          * is that MDS_NO_SPLITTABLE means dir is not splittable in principle
189          * and another thread will not split it on the quiet. But if we have
190          * MDS_NO_SPLIT_EXPECTED, this means, that dir may be splitted anytime,
191          * but not now (for current thread) and we should consider that it can
192          * happen soon and go that branch which can yield LCK_EX to protect from
193          * possible splitting.
194          */
195         if (split == MDS_NO_SPLITTABLE) {
196                 /*
197                  * this inode won't be splitted. so we need not to protect from
198                  * just flush client's cache on modification.
199                  */
200                 if (mode == LCK_PW)
201                         ret_mode = LCK_CW;
202                 else
203                         ret_mode = 0;
204         } else {
205                 if (mode == LCK_EX) {
206                         ret_mode = LCK_EX;
207                 } else if (mode == LCK_PR) {
208                         ret_mode = LCK_CR;
209                 } else if (mode == LCK_PW) {
210                         /*
211                          * caller gonna modify directory. We use concurrent
212                          * write lock here to retract client's cache for
213                          * readdir.
214                          */
215                         if (split == MDS_EXPECT_SPLIT) {
216                                 /*
217                                  * splitting possible. serialize any access the
218                                  * idea is that first one seen dir is splittable
219                                  * is given exclusive lock and split
220                                  * directory. caller passes lock mode to
221                                  * mds_try_to_split_dir() and splitting would be
222                                  * done with exclusive lock only -bzzz.
223                                  */
224                                 CDEBUG(D_OTHER, "%s: gonna split %lu/%lu\n",
225                                        obd->obd_name,
226                                        (unsigned long)dentry->d_inode->i_ino,
227                                        (unsigned long)dentry->d_inode->i_generation);
228                                 ret_mode = LCK_EX;
229                         } else {
230                                 ret_mode = LCK_CW;
231                         }
232                 }
233         }
234
235         return ret_mode;        
236 }
237
238 /* only valid locked dentries or errors should be returned */
239 struct dentry *mds_id2locked_dentry(struct obd_device *obd, struct lustre_id *id,
240                                     struct vfsmount **mnt, int lock_mode,
241                                     struct lustre_handle *lockh, int *mode,
242                                     char *name, int namelen, __u64 lockpart)
243 {
244         struct dentry *de = mds_id2dentry(obd, id, mnt), *retval = de;
245         ldlm_policy_data_t policy = { .l_inodebits = { lockpart } };
246         struct ldlm_res_id res_id = { .name = {0} };
247         int flags = LDLM_FL_ATOMIC_CB, rc;
248         ENTRY;
249
250         if (IS_ERR(de))
251                 RETURN(de);
252
253         lockh[1].cookie = 0;
254         res_id.name[0] = id_fid(id);
255         res_id.name[1] = id_group(id);
256         
257 #ifdef S_PDIROPS
258         if (name && IS_PDIROPS(de->d_inode)) {
259                 ldlm_policy_data_t cpolicy =
260                         { .l_inodebits = { MDS_INODELOCK_UPDATE } };
261                 LASSERT(mode != NULL);
262                 *mode = mds_lock_mode_for_dir(obd, de, lock_mode);
263                 if (*mode) {
264                         rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace,
265                                               res_id, LDLM_IBITS,
266                                               &cpolicy, *mode, &flags,
267                                               mds_blocking_ast,
268                                               ldlm_completion_ast, NULL, NULL,
269                                               NULL, 0, NULL, lockh + 1);
270                         if (rc != ELDLM_OK) {
271                                 l_dput(de);
272                                 RETURN(ERR_PTR(-ENOLCK));
273                         }
274                 }
275                 flags = LDLM_FL_ATOMIC_CB;
276
277                 res_id.name[2] = full_name_hash((unsigned char *)name, namelen);
278
279                 CDEBUG(D_INFO, "take lock on "DLID4":"LPX64"\n",
280                        OLID4(id), res_id.name[2]);
281         }
282 #else
283 #warning "No PDIROPS support in the kernel"
284 #endif
285         rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, res_id,
286                               LDLM_IBITS, &policy, lock_mode, &flags,
287                               mds_blocking_ast, ldlm_completion_ast,
288                               NULL, NULL, NULL, 0, NULL, lockh);
289         if (rc != ELDLM_OK) {
290                 l_dput(de);
291                 retval = ERR_PTR(-EIO); /* XXX translate ldlm code */
292 #ifdef S_PDIROPS
293                 if (lockh[1].cookie)
294                         ldlm_lock_decref(lockh + 1, *mode);
295 #endif
296         } else if (de->d_inode && de->d_inode->i_nlink == 0) {
297                 /* as sometimes we lookup inode by ino/generation through
298                    iopen mechanism, it's possible to find already unlinked
299                    inode with nlink == 0. let's interpretate the case as
300                    ENOENT -bzzz */
301                 CWARN("found already unlinked inode %lu/%u\n",
302                       de->d_inode->i_ino, de->d_inode->i_generation);
303                 l_dput(de);
304                 retval = ERR_PTR(-ENOENT);
305                 ldlm_lock_decref(lockh, lock_mode);
306 #ifdef S_PDIROPS
307                 if (lockh[1].cookie)
308                         ldlm_lock_decref(lockh + 1, *mode);
309 #endif
310         }
311
312         RETURN(retval);
313 }
314
315 #ifndef DCACHE_DISCONNECTED
316 #define DCACHE_DISCONNECTED DCACHE_NFSD_DISCONNECTED
317 #endif
318
319 /* Look up an entry by inode number. This function ONLY returns valid dget'd
320  * dentries with an initialized inode or errors */
321 struct dentry *mds_id2dentry(struct obd_device *obd, struct lustre_id *id,
322                              struct vfsmount **mnt)
323 {
324         unsigned long ino = (unsigned long)id_ino(id);
325         __u32 generation = (__u32)id_gen(id);
326         struct mds_obd *mds = &obd->u.mds;
327         struct dentry *result;
328         struct inode *inode;
329         char idname[32];
330
331         if (ino == 0)
332                 RETURN(ERR_PTR(-ESTALE));
333
334         snprintf(idname, sizeof(idname), "0x%lx", ino);
335
336         CDEBUG(D_DENTRY, "--> mds_id2dentry: ino/gen %lu/%u, sb %p\n",
337                ino, generation, mds->mds_sb);
338
339         /* under ext3 this is neither supposed to return bad inodes nor NULL
340            inodes. */
341         result = ll_lookup_one_len(idname, mds->mds_id_de, 
342                                    strlen(idname));
343         if (IS_ERR(result))
344                 RETURN(result);
345
346         inode = result->d_inode;
347         if (!inode)
348                 RETURN(ERR_PTR(-ENOENT));
349
350         if (is_bad_inode(inode)) {
351                 CERROR("bad inode returned %lu/%u\n",
352                        inode->i_ino, inode->i_generation);
353                 dput(result);
354                 RETURN(ERR_PTR(-ENOENT));
355         }
356
357         /* here we disabled generation check, as root inode i_generation
358          * of cache mds and real mds are different. */
359         if (inode->i_ino != id_ino(&mds->mds_rootid) && generation &&
360             inode->i_generation != generation) {
361                 /* we didn't find the right inode.. */
362                 if (id_group(id) != mds->mds_num) {
363                         CERROR("bad inode %lu found, link: %lu, ct: %d, generation "
364                                "%u != %u, mds %u != %u, request to wrong MDS?\n",
365                                inode->i_ino, (unsigned long)inode->i_nlink,
366                                atomic_read(&inode->i_count), inode->i_generation,
367                                generation, mds->mds_num, (unsigned)id_group(id));
368                 } else {
369                         CERROR("bad inode %lu found, link: %lu, ct: %d, generation "
370                                "%u != %u, inode is recreated while request handled?\n",
371                                inode->i_ino, (unsigned long)inode->i_nlink,
372                                atomic_read(&inode->i_count), inode->i_generation,
373                                generation);
374                 }
375                 dput(result);
376                 RETURN(ERR_PTR(-ENOENT));
377         }
378
379         if (mnt) {
380                 *mnt = mds->mds_vfsmnt;
381                 mntget(*mnt);
382         }
383
384         RETURN(result);
385 }
386
387 static
388 int mds_req_add_idmapping(struct ptlrpc_request *req,
389                           struct mds_export_data *med)
390 {
391         struct mds_req_sec_desc *rsd;
392         struct lustre_sec_desc  *lsd;
393         int rc;
394
395         if (!med->med_remote)
396                 return 0;
397
398         /* maybe we should do it more completely: invalidate the gss ctxt? */
399         if (req->rq_mapped_uid == MDS_IDMAP_NOTFOUND) {
400                 CWARN("didn't find mapped uid\n");
401                 return -EPERM;
402         }
403
404         rsd = lustre_swab_mds_secdesc(req, MDS_REQ_SECDESC_OFF);
405         if (!rsd) {
406                 CERROR("Can't unpack security desc\n");
407                 return -EPROTO;
408         }
409
410         lsd = mds_get_lsd(req->rq_mapped_uid);
411         if (!lsd) {
412                 CERROR("can't get LSD(%u), no mapping added\n",
413                        req->rq_mapped_uid);
414                 return -EPERM;
415         }
416
417         rc = mds_idmap_add(med->med_idmap, rsd->rsd_uid, lsd->lsd_uid,
418                            rsd->rsd_gid, lsd->lsd_gid);
419         mds_put_lsd(lsd);
420         return rc;
421 }
422
423 static
424 int mds_req_del_idmapping(struct ptlrpc_request *req,
425                           struct mds_export_data *med)
426 {
427         struct mds_req_sec_desc *rsd;
428         struct lustre_sec_desc  *lsd;
429         int rc;
430
431         if (!med->med_remote)
432                 return 0;
433
434         rsd = lustre_swab_mds_secdesc(req, MDS_REQ_SECDESC_OFF);
435         if (!rsd) {
436                 CERROR("Can't unpack security desc\n");
437                 return -EPROTO;
438         }
439
440         LASSERT(req->rq_mapped_uid != -1);
441         lsd = mds_get_lsd(req->rq_mapped_uid);
442         if (!lsd) {
443                 CERROR("can't get LSD(%u), no idmapping deleted\n",
444                        req->rq_mapped_uid);
445                 return -EPERM;
446         }
447
448         rc = mds_idmap_del(med->med_idmap, rsd->rsd_uid, lsd->lsd_uid,
449                            rsd->rsd_gid, lsd->lsd_gid);
450         mds_put_lsd(lsd);
451         return rc;
452 }
453
454 static int mds_init_export_data(struct ptlrpc_request *req,
455                                 struct mds_export_data *med)
456 {
457         struct obd_connect_data *data, *reply;
458         int ask_remote, ask_local;
459         ENTRY;
460
461         data = lustre_msg_buf(req->rq_reqmsg, 5, sizeof(*data));
462         reply = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*data));
463         LASSERT(data && reply);
464
465         if (med->med_initialized) {
466                 CDEBUG(D_SEC, "med already initialized, reconnect?\n");
467                 goto reply;
468         }
469
470         ask_remote = data->ocd_connect_flags & OBD_CONNECT_REMOTE;
471         ask_local = data->ocd_connect_flags & OBD_CONNECT_LOCAL;
472
473         /* currently the policy is simple: satisfy client as possible
474          * as we can.
475          */
476         if (req->rq_auth_uid == -1) {
477                 if (ask_remote)
478                         CWARN("null sec is used, force to be local\n");
479                 med->med_remote = 0;
480         } else {
481                 if (ask_remote) {
482                         if (!req->rq_remote_realm)
483                                 CWARN("local realm asked to be remote\n");
484                         med->med_remote = 1;
485                 } else if (ask_local) {
486                         if (req->rq_remote_realm)
487                                 CWARN("remote realm asked to be local\n");
488                         med->med_remote = 0;
489                 } else
490                         med->med_remote = (req->rq_remote_realm != 0);
491         }
492
493         med->med_nllu = data->ocd_nllu[0];
494         med->med_nllg = data->ocd_nllu[1];
495
496         med->med_initialized = 1;
497 reply:
498         reply->ocd_connect_flags &= ~(OBD_CONNECT_REMOTE | OBD_CONNECT_LOCAL);
499         if (med->med_remote) {
500                 if (!med->med_idmap)
501                         med->med_idmap = mds_idmap_alloc();
502
503                 if (!med->med_idmap)
504                         CERROR("Failed to alloc idmap, following request from "
505                                "this client will be refused\n");
506
507                 reply->ocd_connect_flags |= OBD_CONNECT_REMOTE;
508                 CDEBUG(D_SEC, "set client as remote\n");
509         } else {
510                 reply->ocd_connect_flags |= OBD_CONNECT_LOCAL;
511                 CDEBUG(D_SEC, "set client as local\n");
512         }
513
514         RETURN(0);
515 }
516
517 static void mds_free_export_data(struct mds_export_data *med)
518 {
519         if (!med->med_idmap)
520                 return;
521
522         LASSERT(med->med_remote);
523         mds_idmap_free(med->med_idmap);
524         med->med_idmap = NULL;
525 }
526
527 /* Establish a connection to the MDS.
528  *
529  * This will set up an export structure for the client to hold state data about
530  * that client, like open files, the last operation number it did on the server,
531  * etc.
532  */
533 static int mds_connect(struct lustre_handle *conn, struct obd_device *obd,
534                        struct obd_uuid *cluuid, struct obd_connect_data *data,
535                        unsigned long flags)
536 {
537         struct mds_export_data *med;
538         struct mds_client_data *mcd;
539         struct obd_export *exp;
540         int rc;
541         ENTRY;
542
543         if (!conn || !obd || !cluuid)
544                 RETURN(-EINVAL);
545
546         /* XXX There is a small race between checking the list and adding a new
547          * connection for the same UUID, but the real threat (list corruption
548          * when multiple different clients connect) is solved.
549          *
550          * There is a second race between adding the export to the list, and
551          * filling in the client data below.  Hence skipping the case of NULL
552          * mcd above.  We should already be controlling multiple connects at the
553          * client, and we can't hold the spinlock over memory allocations
554          * without risk of deadlocking.
555          */
556         rc = class_connect(conn, obd, cluuid);
557         if (rc)
558                 RETURN(rc);
559         exp = class_conn2export(conn);
560         
561         LASSERT(exp != NULL);
562         med = &exp->exp_mds_data;
563
564         OBD_ALLOC(mcd, sizeof(*mcd));
565         if (!mcd) {
566                 CERROR("%s: out of memory for client data.\n",
567                         obd->obd_name);
568                 GOTO(out, rc = -ENOMEM);
569         }
570
571         memcpy(mcd->mcd_uuid, cluuid, sizeof(mcd->mcd_uuid));
572         med->med_mcd = mcd;
573
574         rc = mds_client_add(obd, &obd->u.mds, med, -1);
575         if (rc)
576                 GOTO(out, rc);
577        
578         EXIT;
579 out:
580         if (rc) {
581                 if (mcd)
582                         OBD_FREE(mcd, sizeof(*mcd));
583                 class_disconnect(exp, 0);
584         } else {
585                 class_export_put(exp);
586         }
587         return rc;
588 }
589
590 static int mds_connect_post(struct obd_export *exp, unsigned initial,
591                             unsigned long flags)
592 {
593         struct obd_device *obd = exp->exp_obd;
594         struct mds_obd *mds = &obd->u.mds;
595         struct mds_export_data *med;
596         struct mds_client_data *mcd;
597         int rc = 0;
598         ENTRY;
599
600         med = &exp->exp_mds_data;
601         mcd = med->med_mcd;
602
603         if (initial) {
604                 /* some one reconnect initially, we have to reset
605                  * data existing export can have. bug 6102 */
606                 if (mcd->mcd_last_xid != 0)
607                         CDEBUG(D_HA, "initial reconnect to existing export\n");
608                 mcd->mcd_last_transno = 0;
609                 mcd->mcd_last_xid = 0;
610                 mcd->mcd_last_close_xid = 0;
611                 mcd->mcd_last_result = 0;
612                 mcd->mcd_last_data = 0;
613         }
614
615         if (!(flags & OBD_OPT_MDS_CONNECTION)) {
616                 if (!(exp->exp_flags & OBD_OPT_REAL_CLIENT)) {
617                         atomic_inc(&mds->mds_real_clients);
618                         CDEBUG(D_OTHER,"%s: peer from %s is real client (%d)\n",
619                                obd->obd_name, exp->exp_client_uuid.uuid,
620                                atomic_read(&mds->mds_real_clients));
621                         exp->exp_flags |= OBD_OPT_REAL_CLIENT;
622                 }
623                 if (mds->mds_md_name)
624                         rc = mds_md_connect(obd, mds->mds_md_name);
625         }
626         RETURN(rc);
627 }
628
629 static int mds_init_export(struct obd_export *exp)
630 {
631         struct mds_export_data *med = &exp->exp_mds_data;
632
633         INIT_LIST_HEAD(&med->med_open_head);
634         spin_lock_init(&med->med_open_lock);
635         return 0;
636 }
637
638 static int mds_destroy_export(struct obd_export *export)
639 {
640         struct obd_device *obd = export->exp_obd;
641         struct mds_export_data *med = &export->exp_mds_data;
642         struct lvfs_run_ctxt saved;
643         int rc = 0;
644         ENTRY;
645
646         mds_free_export_data(med);
647         target_destroy_export(export);
648
649         if (obd_uuid_equals(&export->exp_client_uuid, &obd->obd_uuid))
650                 GOTO(out, 0);
651
652         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
653
654         /* Close any open files (which may also cause orphan unlinking). */
655         spin_lock(&med->med_open_lock);
656         while (!list_empty(&med->med_open_head)) {
657                 struct list_head *tmp = med->med_open_head.next;
658                 struct mds_file_data *mfd =
659                         list_entry(tmp, struct mds_file_data, mfd_list);
660                 struct lustre_id sid;
661                 
662                 BDEVNAME_DECLARE_STORAGE(btmp);
663
664                 /* bug 1579: fix force-closing for 2.5 */
665                 struct dentry *dentry = mfd->mfd_dentry;
666
667                 list_del(&mfd->mfd_list);
668                 spin_unlock(&med->med_open_lock);
669
670                 down(&dentry->d_inode->i_sem);
671                 rc = mds_read_inode_sid(obd, dentry->d_inode, &sid);
672                 up(&dentry->d_inode->i_sem);
673                 if (rc) {
674                         CERROR("Can't read inode self id, inode %lu, "
675                                "rc %d\n", dentry->d_inode->i_ino, rc);
676                         memset(&sid, 0, sizeof(sid));
677                 }
678
679                 /* If you change this message, be sure to update
680                  * replay_single:test_46 */
681                 CERROR("force closing client file handle for %.*s (%s:"
682                        DLID4")\n", dentry->d_name.len, dentry->d_name.name,
683                        ll_bdevname(dentry->d_inode->i_sb, btmp),
684                        OLID4(&sid));
685                 
686                 /* child inode->i_alloc_sem protects orphan_dec_test and
687                  * is_orphan race, mds_mfd_close drops it */
688                 DOWN_WRITE_I_ALLOC_SEM(dentry->d_inode);
689                 rc = mds_mfd_close(NULL, 0, obd, mfd,
690                                    !(export->exp_flags & OBD_OPT_FAILOVER));
691                 if (rc)
692                         CDEBUG(D_INODE, "Error closing file: %d\n", rc);
693                 spin_lock(&med->med_open_lock);
694         }
695         spin_unlock(&med->med_open_lock);
696         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
697
698         EXIT;
699 out:
700         mds_client_free(export, !(export->exp_flags & OBD_OPT_FAILOVER));
701         return rc;
702 }
703
704 static int mds_disconnect(struct obd_export *exp, unsigned long flags)
705 {
706         unsigned long irqflags;
707         struct obd_device *obd;
708         struct mds_obd *mds;
709         int rc;
710         ENTRY;
711
712         LASSERT(exp != NULL);
713         obd = class_exp2obd(exp);
714         if (obd == NULL) {
715                 CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n",
716                        exp->exp_handle.h_cookie);
717                 RETURN(-EINVAL);
718         }
719         mds = &obd->u.mds;
720
721         /*
722          * suppress any inter-mds requests durring disconnecting lmv if this is
723          * detected --force mode. This is needed to avoid endless recovery.
724          */
725         if (atomic_read(&mds->mds_real_clients) > 0 &&
726             !(exp->exp_flags & OBD_OPT_REAL_CLIENT))
727                 flags |= OBD_OPT_FORCE;
728                                                                                               
729         if (!(exp->exp_flags & OBD_OPT_REAL_CLIENT)
730             && !atomic_read(&mds->mds_real_clients)) {
731                 /* there was no client at all */
732                 mds_md_disconnect(obd, flags);
733         }
734
735         if ((exp->exp_flags & OBD_OPT_REAL_CLIENT)
736             && atomic_dec_and_test(&mds->mds_real_clients)) {
737                 /* time to drop LMV connections */
738                 CDEBUG(D_OTHER, "%s: last real client %s disconnected.  "
739                        "Disconnnect from LMV now\n",
740                        obd->obd_name, exp->exp_client_uuid.uuid);
741                 mds_md_disconnect(obd, flags);
742         }
743
744         spin_lock_irqsave(&exp->exp_lock, irqflags);
745         exp->exp_flags = flags;
746         spin_unlock_irqrestore(&exp->exp_lock, irqflags);
747
748         /* disconnect early so that clients can't keep using export */
749         rc = class_disconnect(exp, flags);
750         ldlm_cancel_locks_for_export(exp);
751
752         /* complete all outstanding replies */
753         spin_lock_irqsave(&exp->exp_lock, irqflags);
754         while (!list_empty(&exp->exp_outstanding_replies)) {
755                 struct ptlrpc_reply_state *rs =
756                         list_entry(exp->exp_outstanding_replies.next,
757                                    struct ptlrpc_reply_state, rs_exp_list);
758                 struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service;
759
760                 spin_lock(&svc->srv_lock);
761                 list_del_init(&rs->rs_exp_list);
762                 ptlrpc_schedule_difficult_reply(rs);
763                 spin_unlock(&svc->srv_lock);
764         }
765         spin_unlock_irqrestore(&exp->exp_lock, irqflags);
766         RETURN(rc);
767 }
768
769 static int mds_getstatus(struct ptlrpc_request *req)
770 {
771         struct mds_obd *mds = mds_req2mds(req);
772         struct mds_body *body;
773         int rc, size;
774         ENTRY;
775
776         size = sizeof(*body);
777         
778         rc = lustre_pack_reply(req, 1, &size, NULL);
779         if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_GETSTATUS_PACK)) {
780                 CERROR("mds: out of memory for message: size=%d\n", size);
781                 req->rq_status = -ENOMEM;       /* superfluous? */
782                 RETURN(-ENOMEM);
783         }
784
785         body = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*body));
786         body->valid |= OBD_MD_FID;
787         
788         memcpy(&body->id1, &mds->mds_rootid, sizeof(body->id1));
789
790         /*
791          * the last_committed and last_xid fields are filled in for all replies
792          * already - no need to do so here also.
793          */
794         RETURN(0);
795 }
796
797 int mds_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
798                      void *data, int flag)
799 {
800         int do_ast;
801         ENTRY;
802
803         if (flag == LDLM_CB_CANCELING) {
804                 /* Don't need to do anything here. */
805                 RETURN(0);
806         }
807
808         /* XXX layering violation!  -phil */
809         lock_res_and_lock(lock);
810         
811         /*
812          * get this: if mds_blocking_ast is racing with mds_intent_policy, such
813          * that mds_blocking_ast is called just before l_i_p takes the ns_lock,
814          * then by the time we get the lock, we might not be the correct
815          * blocking function anymore.  So check, and return early, if so.
816          */
817         if (lock->l_blocking_ast != mds_blocking_ast) {
818                 unlock_res_and_lock(lock);
819                 RETURN(0);
820         }
821
822         lock->l_flags |= LDLM_FL_CBPENDING;
823         do_ast = (!lock->l_readers && !lock->l_writers);
824         unlock_res_and_lock(lock);
825
826         if (do_ast) {
827                 struct lustre_handle lockh;
828                 int rc;
829
830                 LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel");
831                 ldlm_lock2handle(lock, &lockh);
832                 rc = ldlm_cli_cancel(&lockh);
833                 if (rc < 0)
834                         CERROR("ldlm_cli_cancel: %d\n", rc);
835         } else {
836                 LDLM_DEBUG(lock, "Lock still has references, will be "
837                            "cancelled later");
838         }
839         RETURN(0);
840 }
841
842 static int mds_convert_md(struct obd_device *obd, struct inode *inode,
843                           void *md, int size, int mea)
844 {
845         int rc = size;
846         
847         if (S_ISREG(inode->i_mode)) {
848                 rc = mds_convert_lov_ea(obd, inode, md, size);
849         } else if (S_ISDIR(inode->i_mode)) {
850                 if (mea) {
851                         rc = mds_convert_mea_ea(obd, inode, md, size);
852                 } else {
853                         rc = mds_convert_lov_ea(obd, inode, md, size);
854                 }
855                 if (rc == -EINVAL) {
856                         CERROR("Invalid EA format (nor LOV or MEA) "
857                                "is detected. Inode %lu/%u\n",
858                                inode->i_ino, inode->i_generation);
859                 }
860         }
861         return rc;
862 }
863
864 int mds_get_md(struct obd_device *obd, struct inode *inode,
865                void *md, int *size, int lock, int mea)
866 {
867         int lmm_size;
868         int rc = 0;
869         ENTRY;
870
871         if (lock)
872                 down(&inode->i_sem);
873
874         rc = fsfilt_get_md(obd, inode, md, *size,
875                            (mea ? EA_MEA : EA_LOV));
876         if (rc < 0) {
877                 CERROR("Error %d reading eadata for ino %lu\n",
878                        rc, inode->i_ino);
879         } else if (rc > 0) {
880                 lmm_size = rc;
881                 rc = mds_convert_md(obd, inode, md,
882                                     lmm_size, mea);
883                 if (rc == 0) {
884                         *size = lmm_size;
885                         rc = lmm_size;
886                 } else if (rc > 0) {
887                         *size = rc;
888                 }
889         }
890         if (lock)
891                 up(&inode->i_sem);
892
893         RETURN(rc);
894 }
895
896 /* Call with lock=1 if you want mds_pack_md to take the i_sem.
897  * Call with lock=0 if the caller has already taken the i_sem. */
898 int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, int offset,
899                 struct mds_body *body, struct inode *inode, int lock, int mea)
900 {
901         struct mds_obd *mds = &obd->u.mds;
902         int rc, lmm_size;
903         void *lmm;
904         ENTRY;
905
906         lmm = lustre_msg_buf(msg, offset, 0);
907         if (lmm == NULL) {
908                 /* Some problem with getting eadata when I sized the reply
909                  * buffer... */
910                 CDEBUG(D_INFO, "no space reserved for inode %lu MD\n",
911                        inode->i_ino);
912                 RETURN(0);
913         }
914         lmm_size = msg->buflens[offset];
915
916         /* I don't really like this, but it is a sanity check on the client
917          * MD request.  However, if the client doesn't know how much space
918          * to reserve for the MD, it shouldn't be bad to have too much space.
919          */
920         if (lmm_size > mds->mds_max_mdsize) {
921                 CWARN("Reading MD for inode %lu of %d bytes > max %d\n",
922                        inode->i_ino, lmm_size, mds->mds_max_mdsize);
923                 // RETURN(-EINVAL);
924         }
925
926         rc = mds_get_md(obd, inode, lmm, &lmm_size, lock, mea);
927         if (rc > 0) {
928                 body->valid |= S_ISDIR(inode->i_mode) ?
929                         OBD_MD_FLDIREA : OBD_MD_FLEASIZE;
930                 
931                 if (mea)
932                         body->valid |= OBD_MD_MEA;
933                 
934                 body->eadatasize = lmm_size;
935                 rc = 0;
936         }
937
938         RETURN(rc);
939 }
940
941 int mds_pack_link(struct dentry *dentry, struct ptlrpc_request *req,
942                   struct mds_body *repbody, int reply_off)
943 {
944         struct inode *inode = dentry->d_inode;
945         char *symname;
946         int len, rc;
947         ENTRY;
948
949         symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1,0);
950         LASSERT(symname != NULL);
951         len = req->rq_repmsg->buflens[reply_off + 1];
952         
953         rc = inode->i_op->readlink(dentry, symname, len);
954         if (rc < 0) {
955                 CERROR("readlink failed: %d\n", rc);
956         } else if (rc != len - 1) {
957                 CERROR ("Unexpected readlink rc %d: expecting %d\n",
958                         rc, len - 1);
959                 rc = -EINVAL;
960         } else {
961                 CDEBUG(D_INODE, "read symlink dest %s\n", symname);
962                 repbody->valid |= OBD_MD_LINKNAME;
963                 repbody->eadatasize = rc + 1;
964                 symname[rc] = 0;        /* NULL terminate */
965                 rc = 0;
966         }
967
968         RETURN(rc);
969 }
970
971 int mds_pack_xattr(struct dentry *dentry, struct ptlrpc_request *req,
972                    struct mds_body *repbody, int req_off, int reply_off)
973 {
974         struct inode *inode = dentry->d_inode;
975         char *ea_name;
976         void *value = NULL;
977         int len, rc;
978         ENTRY;
979
980         ea_name = lustre_msg_string(req->rq_reqmsg, req_off + 1, 0);
981         len = req->rq_repmsg->buflens[reply_off + 1];
982         if (len != 0)
983                 value = lustre_msg_buf(req->rq_repmsg, reply_off + 1, len);
984
985         rc = -EOPNOTSUPP;
986
987         if (!strcmp(ea_name, XATTR_NAME_LUSTRE_ACL)) {
988                 struct rmtacl_upcall_desc desc;
989
990                 if (len != LUSTRE_ACL_SIZE_MAX || !value) {
991                         CERROR("no reply buffer prepared\n");
992                         RETURN(-EFAULT);
993                 }
994
995                 memset(&desc, 0, sizeof(desc));
996                 desc.get = 1;
997                 desc.cmd = lustre_msg_string(req->rq_reqmsg, req_off + 2, 0);
998                 desc.cmdlen =  req->rq_reqmsg->buflens[req_off + 2];
999                 desc.res = (char *) value;
1000                 desc.reslen = LUSTRE_ACL_SIZE_MAX;
1001
1002                 mds_do_remote_acl_upcall(&desc);
1003
1004                 if (desc.upcall_status)
1005                         RETURN(desc.upcall_status);
1006
1007                 if (desc.reslen > LUSTRE_ACL_SIZE_MAX) {
1008                         CERROR("downcall claim reslen %u\n", desc.reslen);
1009                         RETURN(-EINVAL);
1010                 }
1011                 /* like remote setfacl, steal "flags" in mds_body as the
1012                  * exececution status
1013                  */
1014                 repbody->flags = desc.status;
1015                 repbody->valid |= OBD_MD_FLXATTR;
1016                 repbody->eadatasize = desc.reslen;
1017
1018                 RETURN(0);
1019         }
1020
1021         if (inode->i_op && inode->i_op->getxattr)
1022                 rc = inode->i_op->getxattr(dentry, ea_name, value, len);
1023
1024         if (rc < 0) {
1025                 if (rc != -ENODATA && rc != -EOPNOTSUPP)
1026                         CERROR("getxattr failed: %d", rc);
1027         } else {
1028                 repbody->valid |= OBD_MD_FLXATTR;
1029                 repbody->eadatasize = rc;
1030                 rc = 0;
1031         }
1032
1033         RETURN(rc);
1034 }
1035
1036 int mds_pack_xattr_list(struct dentry *dentry, struct ptlrpc_request *req,
1037                         struct mds_body *repbody, int reply_off)
1038 {
1039         struct inode *inode = dentry->d_inode;        
1040         void *value = NULL;
1041         int len, rc;
1042         ENTRY;
1043
1044         len = req->rq_repmsg->buflens[reply_off + 1];
1045         if (len != 0)
1046                 value = lustre_msg_buf(req->rq_repmsg, reply_off + 1, len);
1047
1048         rc = -EOPNOTSUPP;
1049         if (inode->i_op && inode->i_op->getxattr) 
1050                 rc = inode->i_op->listxattr(dentry, value, len);
1051
1052         if (rc < 0) {
1053                 CERROR("listxattr failed: %d", rc);
1054         } else {
1055                 repbody->valid |= OBD_MD_FLXATTRLIST;
1056                 repbody->eadatasize = rc;
1057                 rc = 0;
1058         }
1059         RETURN(rc);
1060 }
1061
1062 static
1063 int mds_pack_posix_acl(struct lustre_msg *repmsg, int *offset,
1064                        struct mds_body *body, struct inode *inode)
1065 {
1066         struct dentry de = { .d_inode = inode };
1067         __u32 buflen, *sizep;
1068         void *buf;
1069         int size, pack_off = *offset;
1070         ENTRY;
1071
1072         sizep = lustre_msg_buf(repmsg, pack_off++, 4);
1073         if (!sizep) {
1074                 CERROR("can't locate returned acl size buf\n");
1075                 RETURN(-EPROTO);
1076         }
1077         
1078         if (!inode->i_op->getxattr)
1079                 RETURN(0);
1080
1081         buflen = repmsg->buflens[pack_off];
1082         buf = lustre_msg_buf(repmsg, pack_off++, buflen);
1083
1084         size = inode->i_op->getxattr(&de, XATTR_NAME_ACL_ACCESS, buf, buflen);
1085         if (size == -ENODATA || size == -EOPNOTSUPP)
1086                 RETURN(0);
1087         if (size < 0)
1088                 RETURN(size);
1089         LASSERT(size);
1090         body->valid |= OBD_MD_FLACL;
1091
1092         *sizep = cpu_to_le32(size);
1093         
1094         *offset = pack_off;
1095         RETURN(0);
1096 }
1097
1098 int mds_pack_remote_perm(struct ptlrpc_request *req, int *reply_off,
1099                          struct mds_body *body, struct inode *inode)
1100 {
1101         struct lustre_sec_desc *lsd;
1102         struct mds_remote_perm *perm;
1103         int pack_off = *reply_off;
1104         __u32 lsd_perms;
1105
1106         LASSERT(inode->i_op);
1107         LASSERT(inode->i_op->permission);
1108         LASSERT(req->rq_export->exp_mds_data.med_remote);
1109
1110         perm = (struct mds_remote_perm *)
1111                        lustre_msg_buf(req->rq_repmsg, pack_off++, sizeof(perm));
1112         if (!perm)
1113                 return -EINVAL;
1114
1115         memset(perm, 0, sizeof(*perm));
1116
1117         /* obtain authenticated uid/gid and LSD permissions, which
1118          * might be different from current process context, from LSD
1119          */
1120         lsd = mds_get_lsd(current->uid);
1121         if (!lsd) {
1122                 CWARN("can't LSD of uid %u\n", current->uid);
1123                 RETURN(-EPERM);
1124         }
1125
1126         perm->mrp_auth_uid = lsd->lsd_uid;
1127         perm->mrp_auth_gid = lsd->lsd_gid;
1128
1129         lsd_perms = mds_lsd_get_perms(lsd, 1, 0, req->rq_peer.peer_id.nid);
1130         if (lsd_perms & LSD_PERM_SETUID)
1131                 perm->mrp_allow_setuid = 1;
1132         if (lsd_perms & LSD_PERM_SETGID)
1133                 perm->mrp_allow_setgid = 1;
1134
1135         mds_put_lsd(lsd);
1136
1137         /* permission bits of current user
1138          * XXX this is low efficient, could we do it in one blow?
1139          */
1140         if (inode->i_op->permission(inode, MAY_EXEC, NULL) == 0)
1141                 perm->mrp_perm |= MAY_EXEC;
1142         if (inode->i_op->permission(inode, MAY_WRITE, NULL) == 0)
1143                 perm->mrp_perm |= MAY_WRITE;
1144         if (inode->i_op->permission(inode, MAY_READ, NULL) == 0)
1145                 perm->mrp_perm |= MAY_READ;
1146
1147         body->valid |= (OBD_MD_FLACL | OBD_MD_FLRMTACL);
1148         
1149         *reply_off = pack_off;
1150
1151         RETURN(0);
1152 }
1153
1154 int mds_pack_acl(struct ptlrpc_request *req, int *reply_off,
1155                  struct mds_body *body, struct inode *inode)
1156 {
1157         int rc;
1158
1159         if (!req->rq_export->exp_mds_data.med_remote)
1160                 rc = mds_pack_posix_acl(req->rq_repmsg, reply_off, body, inode);
1161         else
1162                 rc = mds_pack_remote_perm(req, reply_off, body, inode);
1163
1164         return rc;
1165 }
1166
1167 static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry,
1168                                 struct ptlrpc_request *req, int req_off,
1169                                 struct mds_body *reqbody, int reply_off,
1170                                 struct mds_req_sec_desc *rsd)
1171 {
1172         struct mds_export_data *med = &req->rq_export->u.eu_mds_data;
1173         struct inode *inode = dentry->d_inode;
1174         struct mds_body *body;
1175         int rc = 0, offset = 0;
1176         ENTRY;
1177
1178         if (inode == NULL && !(dentry->d_flags & DCACHE_CROSS_REF))
1179                 RETURN(-ENOENT);
1180
1181         body = lustre_msg_buf(req->rq_repmsg, reply_off, sizeof(*body));
1182         LASSERT(body != NULL);                 /* caller prepped reply */
1183
1184         if (dentry->d_flags & DCACHE_CROSS_REF) {
1185                 mds_pack_dentry2body(obd, body, dentry,
1186                                      (reqbody->valid & OBD_MD_FID) ? 1 : 0);
1187                 CDEBUG(D_OTHER, "cross reference: "DLID4"\n",
1188                        OLID4(&body->id1));
1189                 RETURN(0);
1190         }
1191         
1192         mds_pack_inode2body(obd, body, inode,
1193                             (reqbody->valid & OBD_MD_FID) ? 1 : 0);
1194
1195         if ((S_ISREG(inode->i_mode) && (reqbody->valid & OBD_MD_FLEASIZE)) ||
1196             (S_ISDIR(inode->i_mode) && (reqbody->valid & OBD_MD_FLDIREA))) {
1197             
1198                 /* guessing what kind og attribute do we need. */
1199                 int is_mea = (S_ISDIR(inode->i_mode) && 
1200                     (reqbody->valid & OBD_MD_MEA) != 0);
1201                 
1202                 rc = mds_pack_md(obd, req->rq_repmsg, reply_off + 1, 
1203                                  body, inode, 1, is_mea);
1204
1205                 /* if we have LOV EA data, the OST holds size, atime, mtime. */
1206                 if (!(body->valid & OBD_MD_FLEASIZE) &&
1207                     !(body->valid & OBD_MD_FLDIREA))
1208                         body->valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
1209                                         OBD_MD_FLATIME | OBD_MD_FLMTIME);
1210         } else if (S_ISLNK(inode->i_mode) &&
1211                    (reqbody->valid & OBD_MD_LINKNAME) != 0) {
1212                 rc = mds_pack_link(dentry, req, body, reply_off);
1213         } else if (reqbody->valid & OBD_MD_FLXATTR) {
1214                 rc = mds_pack_xattr(dentry, req, body, req_off, reply_off);
1215         } else if (reqbody->valid & OBD_MD_FLXATTRLIST) {
1216                 rc = mds_pack_xattr_list(dentry, req, body, reply_off);
1217         }
1218         
1219         offset = reply_off + ((reqbody->valid & OBD_MD_FLEASIZE) ? 2 : 1);
1220         if (reqbody->valid & OBD_MD_FLACL) {
1221                 rc = mds_pack_acl(req, &offset, body, inode);
1222         }                
1223
1224         if (reqbody->valid & OBD_MD_FLKEY) {
1225                 rc = mds_pack_gskey(obd, req->rq_repmsg, &offset, 
1226                                     body, inode);
1227         }                
1228
1229         if (rc == 0)
1230                 mds_body_do_reverse_map(med, body);
1231
1232         RETURN(rc);
1233 }
1234
1235 static int mds_getattr_pack_msg_cf(struct ptlrpc_request *req,
1236                                    struct dentry *dentry,
1237                                    int offset)
1238 {
1239         int rc = 0, size[1] = {sizeof(struct mds_body)};
1240         ENTRY;
1241
1242         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK)) {
1243                 CERROR("failed MDS_GETATTR_PACK test\n");
1244                 req->rq_status = -ENOMEM;
1245                 RETURN(-ENOMEM);
1246         }
1247
1248         rc = lustre_pack_reply(req, 1, size, NULL);
1249         if (rc) {
1250                 CERROR("lustre_pack_reply failed: rc %d\n", rc);
1251                 GOTO(out, req->rq_status = rc);
1252         }
1253
1254         EXIT;
1255 out:
1256         return rc;
1257 }
1258
1259 static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct dentry *de,
1260                                 int offset)
1261 {
1262         struct inode *inode = de->d_inode;
1263         struct mds_obd *mds = mds_req2mds(req);
1264         struct mds_body *body;
1265         int rc = 0, size[4] = {sizeof(*body)}, bufcount = 1;
1266         ENTRY;
1267
1268         body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*body));
1269         LASSERT(body != NULL);                 /* checked by caller */
1270         LASSERT_REQSWABBED(req, offset);       /* swabbed by caller */
1271
1272         if ((S_ISREG(inode->i_mode) && (body->valid & OBD_MD_FLEASIZE)) ||
1273             (S_ISDIR(inode->i_mode) && (body->valid & OBD_MD_FLDIREA))) {
1274                 int rc;
1275                 
1276                 down(&inode->i_sem);
1277                 rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0,
1278                                    ((body->valid & OBD_MD_MEA) ? EA_MEA : EA_LOV));
1279                 up(&inode->i_sem);
1280                 if (rc < 0) {
1281                         if (rc != -ENODATA && rc != -EOPNOTSUPP)
1282                                 CERROR("error getting inode %lu MD: rc = %d\n",
1283                                        inode->i_ino, rc);
1284                         size[bufcount] = 0;
1285                 } else if (rc > mds->mds_max_mdsize) {
1286                         size[bufcount] = 0;
1287                         CERROR("MD size %d larger than maximum possible %u\n",
1288                                rc, mds->mds_max_mdsize);
1289                 } else {
1290                         size[bufcount] = rc;
1291                 }
1292                 bufcount++;
1293         } else if (S_ISLNK(inode->i_mode) && (body->valid & OBD_MD_LINKNAME)) {
1294                 if (inode->i_size + 1 != body->eadatasize)
1295                         CERROR("symlink size: %Lu, reply space: %d\n",
1296                                inode->i_size + 1, body->eadatasize);
1297                 size[bufcount] = min_t(int, inode->i_size+1, body->eadatasize);
1298                 bufcount++;
1299                 CDEBUG(D_INODE, "symlink size: %Lu, reply space: %d\n",
1300                        inode->i_size + 1, body->eadatasize);
1301         } else if ((body->valid & OBD_MD_FLXATTR)) {
1302                 char *ea_name = lustre_msg_string(req->rq_reqmsg, 
1303                                                   offset + 1, 0);
1304                 rc = -EOPNOTSUPP;
1305
1306                 if (!strcmp(ea_name, XATTR_NAME_LUSTRE_ACL)) {
1307                         size[bufcount] = LUSTRE_ACL_SIZE_MAX;
1308                 } else {
1309                         if (inode->i_op && inode->i_op->getxattr)
1310                                 rc = inode->i_op->getxattr(de, ea_name,
1311                                                            NULL, 0);
1312
1313                         if (rc < 0) {
1314                                 if (rc != -ENODATA && rc != -EOPNOTSUPP)
1315                                         CERROR("error get inode %lu EA: %d\n",
1316                                                inode->i_ino, rc);
1317                                 size[bufcount] = 0;
1318                         } else {
1319                                 size[bufcount] = min_t(int,
1320                                                        body->eadatasize, rc);
1321                         }
1322                 }
1323                 bufcount++;
1324         } else if (body->valid & OBD_MD_FLXATTRLIST) {
1325                 rc = -EOPNOTSUPP;
1326                 if (inode->i_op && inode->i_op->getxattr) 
1327                         rc = inode->i_op->listxattr(de, NULL, 0);
1328
1329                 if (rc < 0) {
1330                         if (rc != -ENODATA && rc != -EOPNOTSUPP)
1331                                 CERROR("error getting inode %lu EA: rc = %d\n",
1332                                        inode->i_ino, rc);
1333                         size[bufcount] = 0;
1334                 } else {
1335                         size[bufcount] = min_t(int, body->eadatasize, rc);
1336                 }
1337                 bufcount++;
1338         }
1339         
1340         /* may co-exist with OBD_MD_FLEASIZE */
1341         if (body->valid & OBD_MD_FLACL) {
1342                 if (req->rq_export->exp_mds_data.med_remote) {
1343                         size[bufcount++] = sizeof(struct mds_remote_perm);
1344                 } else {
1345                         size[bufcount++] = sizeof(int);
1346                         size[bufcount++] = xattr_acl_size(LL_ACL_MAX_ENTRIES);
1347                 }
1348         }
1349
1350         if (body->valid & OBD_MD_FLKEY) {
1351                 size[bufcount++] = sizeof(int);
1352                 size[bufcount++] = sizeof(struct crypto_key);
1353         }
1354
1355         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK)) {
1356                 CERROR("failed MDS_GETATTR_PACK test\n");
1357                 req->rq_status = -ENOMEM;
1358                 GOTO(out, rc = -ENOMEM);
1359         }
1360
1361         rc = lustre_pack_reply(req, bufcount, size, NULL);
1362         if (rc) {
1363                 CERROR("out of memory\n");
1364                 GOTO(out, req->rq_status = rc);
1365         }
1366
1367         EXIT;
1368  out:
1369         return rc;
1370 }
1371
1372 int mds_check_mds_num(struct obd_device *obd, struct inode *inode,
1373                       char *name, int namelen)
1374 {
1375         struct mea *mea = NULL;
1376         int mea_size, rc = 0;
1377         ENTRY;
1378         
1379         rc = mds_md_get_attr(obd, inode, &mea, &mea_size);
1380         if (rc)
1381                 RETURN(rc);
1382         if (mea != NULL) {
1383                 /*
1384                  * dir is already splitted, check if requested filename should
1385                  * live at this MDS or at another one.
1386                  */
1387                 int i = mea_name2idx(mea, name, namelen - 1);
1388                 if (mea->mea_master != id_group(&mea->mea_ids[i])) {
1389                         CDEBUG(D_OTHER,
1390                                "inapropriate MDS(%d) for %s. should be "
1391                                "%lu(%d)\n", mea->mea_master, name, 
1392                                (unsigned long)id_group(&mea->mea_ids[i]), i);
1393                         rc = -ERESTART;
1394                 }
1395         }
1396
1397         if (mea)
1398                 OBD_FREE(mea, mea_size);
1399         RETURN(rc);
1400 }
1401
1402 int mds_getattr_size(struct obd_device *obd, struct dentry *dentry,
1403                      struct ptlrpc_request *req, struct mds_body *body)
1404 {
1405         struct inode *inode = dentry->d_inode;
1406         ENTRY;
1407
1408         LASSERT(body != NULL);
1409
1410         if (dentry->d_inode == NULL || !S_ISREG(inode->i_mode))
1411                 RETURN(0);
1412         
1413         if (obd->obd_recovering) {
1414                 CDEBUG(D_INODE, "size for "DLID4" is unknown yet (recovering)\n",
1415                        OLID4(&body->id1));
1416                 RETURN(0);
1417         }
1418
1419         if (atomic_read(&inode->i_writecount)) {
1420                 /* some one has opened the file for write.
1421                  * mds doesn't know actual size */
1422                 CDEBUG(D_INODE, "MDS doesn't know actual size for "DLID4"\n",
1423                        OLID4(&body->id1));
1424                 RETURN(0);
1425         }
1426         CDEBUG(D_INODE, "MDS returns "LPD64"/"LPD64" for"DLID4"\n",
1427                body->size, body->blocks, OLID4(&body->id1));
1428         body->valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
1429         RETURN(0);
1430 }
1431
1432 static int mds_getattr_lock(struct ptlrpc_request *req, int offset,
1433                             struct lustre_handle *child_lockh, int child_part)
1434 {
1435         struct obd_device *obd = req->rq_export->exp_obd;
1436         struct mds_obd *mds = &obd->u.mds;
1437         struct ldlm_reply *rep = NULL;
1438         struct lvfs_run_ctxt saved;
1439         struct mds_req_sec_desc *rsd;
1440         struct mds_body *body;
1441         struct dentry *dparent = NULL, *dchild = NULL;
1442         struct lvfs_ucred uc = {NULL, NULL,};
1443         struct lustre_handle parent_lockh[2] = {{0}, {0}};
1444         unsigned int namesize;
1445         int rc = 0, cleanup_phase = 0, resent_req = 0, update_mode, reply_offset;
1446         char *name = NULL;
1447         ENTRY;
1448
1449         LASSERT(!strcmp(obd->obd_type->typ_name, OBD_MDS_DEVICENAME));
1450         MD_COUNTER_INCREMENT(obd, getattr_lock);
1451
1452         rsd = lustre_swab_mds_secdesc(req, MDS_REQ_SECDESC_OFF);
1453         if (!rsd) {
1454                 CERROR("Can't unpack security desc\n");
1455                 RETURN(-EFAULT);
1456         }
1457
1458         /* swab now, before anyone looks inside the request. */
1459         body = lustre_swab_reqbuf(req, offset, sizeof(*body),
1460                                   lustre_swab_mds_body);
1461         if (body == NULL) {
1462                 CERROR("Can't swab mds_body\n");
1463                 GOTO(cleanup, rc = -EFAULT);
1464         }
1465
1466         LASSERT_REQSWAB(req, offset + 1);
1467         name = lustre_msg_string(req->rq_reqmsg, offset + 1, 0);
1468         if (name == NULL) {
1469                 CERROR("Can't unpack name\n");
1470                 GOTO(cleanup, rc = -EFAULT);
1471         }
1472         namesize = req->rq_reqmsg->buflens[offset + 1];
1473
1474         /* namesize less than 2 means we have empty name, probably came from
1475            revalidate by cfid, so no point in having name to be set */
1476         if (namesize <= 1)
1477                 name = NULL;
1478
1479         LASSERT (offset == 1 || offset == 3);
1480         if (offset == 3) {
1481                 rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*rep));
1482                 reply_offset = 1;
1483         } else {
1484                 reply_offset = 0;
1485         }
1486
1487         rc = mds_init_ucred(&uc, req, rsd);
1488         if (rc) {
1489                 GOTO(cleanup, rc);
1490         }
1491
1492         push_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1493         cleanup_phase = 1; /* kernel context */
1494         intent_set_disposition(rep, DISP_LOOKUP_EXECD);
1495
1496         LASSERT(namesize > 0);
1497         if (child_lockh->cookie != 0) {
1498                 LASSERT(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT);
1499                 resent_req = 1;
1500         }
1501 #if HAVE_LOOKUP_RAW
1502         if (body->valid == OBD_MD_FLID) {
1503                 struct mds_body *mds_reply;
1504                 int size = sizeof(*mds_reply);
1505                 struct inode *dir;
1506                 ino_t inum;
1507
1508                 dparent = mds_id2dentry(obd, &body->id1, NULL);
1509                 if (IS_ERR(dparent)) {
1510                         rc = PTR_ERR(dparent);
1511                         GOTO(cleanup, rc);
1512                 }
1513                 /*
1514                  * the user requested ONLY the inode number, so do a raw lookup.
1515                  */
1516                 rc = lustre_pack_reply(req, 1, &size, NULL);
1517                 if (rc) {
1518                         CERROR("out of memory\n");
1519                         l_dput(dparent);
1520                         GOTO(cleanup, rc);
1521                 }
1522                 dir  = dparent->d_inode;
1523                 LASSERT(dir->i_op->lookup_raw != NULL);
1524                 rc = dir->i_op->lookup_raw(dir, name, namesize - 1, &inum);
1525                 l_dput(dparent);
1526                 mds_reply = lustre_msg_buf(req->rq_repmsg, 0,
1527                                            sizeof(*mds_reply));
1528
1529                 id_ino(&mds_reply->id1) = inum;
1530                 mds_reply->valid = OBD_MD_FLID;
1531                 GOTO(cleanup, rc);
1532         }
1533 #endif
1534         if (resent_req == 0) {
1535                 LASSERT(id_fid(&body->id1) != 0);
1536                 if (name) {
1537                         rc = mds_get_parent_child_locked(obd, mds, &body->id1,
1538                                                          parent_lockh, &dparent,
1539                                                          LCK_PR, 
1540                                                          MDS_INODELOCK_UPDATE,
1541                                                          &update_mode, 
1542                                                          name, namesize,
1543                                                          child_lockh, &dchild, 
1544                                                          LCK_PR, child_part);
1545                         if (rc)
1546                                 GOTO(cleanup, rc);
1547                 
1548                         cleanup_phase = 2; /* dchild, dparent, locks */
1549                         
1550                         /*
1551                          * let's make sure this name should leave on this mds
1552                          * node.
1553                          */
1554                         rc = mds_check_mds_num(obd, dparent->d_inode, name, namesize);
1555                         if (rc)
1556                                 GOTO(cleanup, rc);
1557                 } else {
1558                         /* we have no dentry here, drop LOOKUP bit */
1559                         /* FIXME: we need MDS_INODELOCK_LOOKUP or not. */
1560                         child_part &= ~MDS_INODELOCK_LOOKUP;
1561                         CDEBUG(D_OTHER, "%s: retrieve attrs for "DLID4"\n",
1562                                obd->obd_name, OLID4(&body->id1));
1563
1564                         dchild = mds_id2locked_dentry(obd, &body->id1, NULL,
1565                                                       LCK_PR, parent_lockh,
1566                                                       &update_mode,
1567                                                       NULL, 0, 
1568                                                       MDS_INODELOCK_UPDATE);
1569                         if (IS_ERR(dchild)) {
1570                                 CERROR("can't find inode with id "DLID4", err = %d\n", 
1571                                        OLID4(&body->id1), (int)PTR_ERR(dchild));
1572                                 GOTO(cleanup, rc = PTR_ERR(dchild));
1573                         }
1574                         memcpy(child_lockh, parent_lockh, sizeof(parent_lockh[0]));
1575                 }
1576         } else {
1577                 struct ldlm_lock *granted_lock;
1578
1579                 DEBUG_REQ(D_DLMTRACE, req, "resent, not enqueuing new locks");
1580                 granted_lock = ldlm_handle2lock(child_lockh);
1581
1582                 LASSERTF(granted_lock != NULL, LPU64"/%lu lockh "LPX64"\n",
1583                          id_fid(&body->id1), (unsigned long)id_group(&body->id1),
1584                          child_lockh->cookie);
1585
1586                 if (name) {
1587                         /* usual named request */
1588                         dparent = mds_id2dentry(obd, &body->id1, NULL);
1589                         LASSERT(!IS_ERR(dparent));
1590                         dchild = ll_lookup_one_len(name, dparent, namesize - 1);
1591                         LASSERT(!IS_ERR(dchild));
1592                 } else {
1593                         /* client wants to get attr. by id */
1594                         dchild = mds_id2dentry(obd, &body->id1, NULL);
1595                         LASSERT(!IS_ERR(dchild));
1596                 }
1597                 LDLM_LOCK_PUT(granted_lock);
1598         }
1599
1600         cleanup_phase = 2; /* dchild, dparent, locks */
1601
1602         if (!DENTRY_VALID(dchild)) {
1603                 intent_set_disposition(rep, DISP_LOOKUP_NEG);
1604                 /*
1605                  * in the intent case, the policy clears this error: the
1606                  * disposition is enough.
1607                  */
1608                 rc = -ENOENT;
1609                 GOTO(cleanup, rc);
1610         } else {
1611                 intent_set_disposition(rep, DISP_LOOKUP_POS);
1612         }
1613
1614         if (req->rq_repmsg == NULL) {
1615                 if (dchild->d_flags & DCACHE_CROSS_REF)
1616                         rc = mds_getattr_pack_msg_cf(req, dchild, offset);
1617                 else
1618                         rc = mds_getattr_pack_msg(req, dchild, offset);
1619                 if (rc != 0) {
1620                         CERROR ("mds_getattr_pack_msg: %d\n", rc);
1621                         GOTO (cleanup, rc);
1622                 }
1623         }
1624
1625         rc = mds_getattr_internal(obd, dchild, req, offset, body,
1626                                   reply_offset, rsd);
1627         if (rc)
1628                 GOTO(cleanup, rc); /* returns the lock to the client */
1629
1630         /* probably MDS knows actual size? */
1631         body = lustre_msg_buf(req->rq_repmsg, reply_offset, sizeof(*body));
1632         LASSERT(body != NULL);
1633         mds_getattr_size(obd, dchild, req, body);
1634
1635         GOTO(cleanup, rc);
1636
1637  cleanup:
1638         switch (cleanup_phase) {
1639         case 2:
1640                 if (resent_req == 0) {
1641                         if (rc && DENTRY_VALID(dchild))
1642                                 ldlm_lock_decref(child_lockh, LCK_PR);
1643                         if (name)
1644                                 ldlm_lock_decref(parent_lockh, LCK_PR);
1645 #ifdef S_PDIROPS
1646                         if (parent_lockh[1].cookie != 0)
1647                                 ldlm_lock_decref(parent_lockh + 1, update_mode);
1648 #endif
1649                         if (dparent)
1650                                 l_dput(dparent);
1651                 }
1652                 l_dput(dchild);
1653         case 1:
1654                 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1655         default:
1656                 mds_exit_ucred(&uc);
1657         }
1658         return rc;
1659 }
1660
1661 static int mds_getattr(struct ptlrpc_request *req, int offset)
1662 {
1663         struct obd_device *obd = req->rq_export->exp_obd;
1664         struct lvfs_run_ctxt saved;
1665         struct dentry *de;
1666         struct mds_req_sec_desc *rsd;
1667         struct mds_body *body;
1668         struct lvfs_ucred uc = {NULL, NULL,};
1669         int rc = 0;
1670         ENTRY;
1671
1672         MD_COUNTER_INCREMENT(obd, getattr);
1673
1674         rsd = lustre_swab_mds_secdesc(req, MDS_REQ_SECDESC_OFF);
1675         if (!rsd) {
1676                 CERROR("Can't unpack security desc\n");
1677                 RETURN(-EFAULT);
1678         }
1679
1680         body = lustre_swab_reqbuf(req, offset, sizeof(*body),
1681                                   lustre_swab_mds_body);
1682         if (body == NULL) {
1683                 CERROR ("Can't unpack body\n");
1684                 RETURN (-EFAULT);
1685         }
1686
1687         rc = mds_init_ucred(&uc, req, rsd);
1688         if (rc) {
1689                 mds_exit_ucred(&uc);
1690                 RETURN(rc);
1691         }
1692
1693         push_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1694         de = mds_id2dentry(obd, &body->id1, NULL);
1695         if (IS_ERR(de)) {
1696                 rc = req->rq_status = PTR_ERR(de);
1697                 GOTO(out_pop, rc);
1698         }
1699
1700         rc = mds_getattr_pack_msg(req, de, offset);
1701         if (rc != 0) {
1702                 CERROR("mds_getattr_pack_msg: %d\n", rc);
1703                 GOTO(out_pop, rc);
1704         }
1705
1706         req->rq_status = mds_getattr_internal(obd, de, req, offset, body,
1707                                               0, rsd);
1708         l_dput(de);
1709
1710         EXIT;
1711 out_pop:
1712         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1713         mds_exit_ucred(&uc);
1714         return rc;
1715 }
1716 static int mds_access_check(struct ptlrpc_request *req, int offset)
1717 {
1718         struct obd_device *obd = req->rq_export->exp_obd;
1719         struct lvfs_run_ctxt saved;
1720         struct dentry *de;
1721         struct mds_req_sec_desc *rsd;
1722         struct mds_body *body;
1723         struct lvfs_ucred uc;
1724         int rep_size[2] = {sizeof(*body),
1725                            sizeof(struct mds_remote_perm)};
1726         int rc = 0, rep_offset;
1727         ENTRY;
1728
1729         if (!req->rq_export->exp_mds_data.med_remote) {
1730                 CERROR("from local client "LPU64"\n", req->rq_peer.peer_id.nid);
1731                 RETURN(-EINVAL);
1732         }
1733
1734         rsd = lustre_swab_mds_secdesc(req, MDS_REQ_SECDESC_OFF);
1735         if (!rsd) {
1736                 CERROR("Can't unpack security desc\n");
1737                 RETURN(-EFAULT);
1738         }
1739
1740         body = lustre_swab_reqbuf(req, offset, sizeof(*body),
1741                                   lustre_swab_mds_body);
1742         if (body == NULL) {
1743                 CERROR ("Can't unpack body\n");
1744                 RETURN (-EFAULT);
1745         }
1746
1747         MD_COUNTER_INCREMENT(obd, access_check);
1748
1749         rc = mds_init_ucred(&uc, req, rsd);
1750         if (rc) {
1751                 CERROR("init ucred error: %d\n", rc);
1752                 RETURN(rc);
1753         }
1754         push_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1755
1756         de = mds_id2dentry(obd, &body->id1, NULL);
1757         if (IS_ERR(de)) {
1758                 CERROR("grab ino "LPU64": err %ld\n",
1759                        body->id1.li_stc.u.e3s.l3s_ino, PTR_ERR(de));
1760                 GOTO(out_pop, rc = PTR_ERR(de));
1761         }
1762
1763         rc = lustre_pack_reply(req, 2, rep_size, NULL);
1764         if (rc) {
1765                 CERROR("pack reply error: %d\n", rc);
1766                 GOTO(out_dput, rc = -EINVAL);
1767         }
1768
1769         body = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*body));
1770         LASSERT(body);
1771
1772         rep_offset = 1;
1773         rc = mds_pack_remote_perm(req, &rep_offset, body, de->d_inode);
1774
1775         EXIT;
1776
1777 out_dput:
1778         l_dput(de);
1779 out_pop:
1780         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1781         mds_exit_ucred(&uc);
1782         return rc;
1783 }
1784
1785 static int mds_obd_statfs(struct obd_device *obd, struct obd_statfs *osfs,
1786                           unsigned long max_age)
1787 {
1788         int rc;
1789         ENTRY;
1790
1791         spin_lock(&obd->obd_osfs_lock);
1792         rc = fsfilt_statfs(obd, obd->u.mds.mds_sb, max_age);
1793         if (rc == 0)
1794                 memcpy(osfs, &obd->obd_osfs, sizeof(*osfs));
1795         spin_unlock(&obd->obd_osfs_lock);
1796
1797         RETURN(rc);
1798 }
1799
1800 static int mds_statfs(struct ptlrpc_request *req)
1801 {
1802         struct obd_device *obd = req->rq_export->exp_obd;
1803         int rc, size = sizeof(struct obd_statfs);
1804         ENTRY;
1805
1806         /* This will trigger a watchdog timeout */
1807         OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_STATFS_LCW_SLEEP,
1808                          (MDS_SERVICE_WATCHDOG_TIMEOUT / 1000) + 1);
1809
1810         rc = lustre_pack_reply(req, 1, &size, NULL);
1811         if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_STATFS_PACK)) {
1812                 CERROR("mds: statfs lustre_pack_reply failed: rc = %d\n", rc);
1813                 GOTO(out, rc);
1814         }
1815
1816         OBD_COUNTER_INCREMENT(obd, statfs);
1817
1818         /* We call this so that we can cache a bit - 1 jiffie worth */
1819         rc = mds_obd_statfs(obd, lustre_msg_buf(req->rq_repmsg, 0, size),
1820                             jiffies - HZ);
1821         if (rc) {
1822                 CERROR("mds_obd_statfs failed: rc %d\n", rc);
1823                 GOTO(out, rc);
1824         }
1825
1826         EXIT;
1827 out:
1828         req->rq_status = rc;
1829         return rc;
1830 }
1831
1832 static int mds_sync(struct ptlrpc_request *req, int offset)
1833 {
1834         struct obd_device *obd = req->rq_export->exp_obd;
1835         struct mds_obd *mds = &obd->u.mds;
1836         struct mds_body *body;
1837         int rc, size = sizeof(*body);
1838         ENTRY;
1839
1840         body = lustre_swab_reqbuf(req, offset, sizeof(*body),
1841                                   lustre_swab_mds_body);
1842         if (body == NULL)
1843                 GOTO(out, rc = -EPROTO);
1844
1845         rc = lustre_pack_reply(req, 1, &size, NULL);
1846         if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_SYNC_PACK)) {
1847                 CERROR("fsync lustre_pack_reply failed: rc = %d\n", rc);
1848                 GOTO(out, rc);
1849         }
1850
1851         if (id_ino(&body->id1) == 0) {
1852                 /* an id of zero is taken to mean "sync whole filesystem" */
1853                 rc = fsfilt_sync(obd, mds->mds_sb);
1854                 if (rc)
1855                         GOTO(out, rc);
1856         } else {
1857                 /* just any file to grab fsync method - "file" arg unused */
1858                 struct file *file = mds->mds_rcvd_filp;
1859                 struct mds_body *rep_body;
1860                 struct dentry *de;
1861
1862                 de = mds_id2dentry(obd, &body->id1, NULL);
1863                 if (IS_ERR(de))
1864                         GOTO(out, rc = PTR_ERR(de));
1865
1866                 rc = file->f_op->fsync(NULL, de, 1);
1867                 if (rc)
1868                         GOTO(out, rc);
1869
1870                 rep_body = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*rep_body));
1871                 mds_pack_inode2body(obd, rep_body, de->d_inode,
1872                                     (body->valid & OBD_MD_FID) ? 1 : 0);
1873                 l_dput(de);
1874         }
1875
1876         EXIT;
1877 out:
1878         req->rq_status = rc;
1879         return rc;
1880 }
1881
1882 /* mds_readpage does not take a DLM lock on the inode, because the client must
1883  * already have a PR lock.
1884  *
1885  * If we were to take another one here, a deadlock will result, if another
1886  * thread is already waiting for a PW lock. */
1887 static int mds_readpage(struct ptlrpc_request *req, int offset)
1888 {
1889         struct obd_device *obd = req->rq_export->exp_obd;
1890         struct vfsmount *mnt;
1891         struct dentry *de;
1892         struct file *file;
1893         struct mds_req_sec_desc *rsd;
1894         struct mds_body *body, *repbody;
1895         struct lvfs_run_ctxt saved;
1896         int rc, size = sizeof(*repbody);
1897         struct lvfs_ucred uc = {NULL, NULL,};
1898         ENTRY;
1899
1900         rc = lustre_pack_reply(req, 1, &size, NULL);
1901         if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_READPAGE_PACK)) {
1902                 CERROR("mds: out of memory\n");
1903                 GOTO(out, rc = -ENOMEM);
1904         }
1905
1906         rsd = lustre_swab_mds_secdesc(req, MDS_REQ_SECDESC_OFF);
1907         if (!rsd) {
1908                 CERROR("Can't unpack security desc\n");
1909                 GOTO (out, rc = -EFAULT);
1910         }
1911
1912         body = lustre_swab_reqbuf(req, offset, sizeof(*body),
1913                                   lustre_swab_mds_body);
1914         if (body == NULL) {
1915                 CERROR("Can't unpack body\n");
1916                 GOTO (out, rc = -EFAULT);
1917         }
1918
1919         rc = mds_init_ucred(&uc, req, rsd);
1920         if (rc) {
1921                 GOTO(out, rc);
1922         }
1923
1924         push_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1925         de = mds_id2dentry(obd, &body->id1, &mnt);
1926         if (IS_ERR(de))
1927                 GOTO(out_pop, rc = PTR_ERR(de));
1928
1929         CDEBUG(D_INODE, "ino %lu\n", de->d_inode->i_ino);
1930
1931         file = dentry_open(de, mnt, O_RDONLY | O_LARGEFILE);
1932         /* note: in case of an error, dentry_open puts dentry */
1933         if (IS_ERR(file))
1934                 GOTO(out_pop, rc = PTR_ERR(file));
1935
1936         /* body->size is actually the offset -eeb */
1937         if ((body->size & (de->d_inode->i_blksize - 1)) != 0) {
1938                 CERROR("offset "LPU64" not on a block boundary of %lu\n",
1939                        body->size, de->d_inode->i_blksize);
1940                 GOTO(out_file, rc = -EFAULT);
1941         }
1942
1943         /* body->nlink is actually the #bytes to read -eeb */
1944         if (body->nlink & (de->d_inode->i_blksize - 1)) {
1945                 CERROR("size %u is not multiple of blocksize %lu\n",
1946                        body->nlink, de->d_inode->i_blksize);
1947                 GOTO(out_file, rc = -EFAULT);
1948         }
1949
1950         repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*repbody));
1951         repbody->size = file->f_dentry->d_inode->i_size;
1952         repbody->valid = OBD_MD_FLSIZE;
1953
1954         /* to make this asynchronous make sure that the handling function
1955            doesn't send a reply when this function completes. Instead a
1956            callback function would send the reply */
1957         /* body->size is actually the offset -eeb */
1958         rc = mds_sendpage(req, file, body->size, body->nlink);
1959
1960         EXIT;
1961 out_file:
1962         filp_close(file, 0);
1963 out_pop:
1964         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1965 out:
1966         mds_exit_ucred(&uc);
1967         req->rq_status = rc;
1968         return 0;
1969 }
1970
1971 /* update master MDS ID, which is stored in local inode EA. */
1972 int mds_update_mid(struct obd_device *obd, struct lustre_id *id,
1973                    void *data, int data_len)
1974 {
1975         struct mds_obd *mds = &obd->u.mds;
1976         struct dentry *dentry;
1977         void *handle;
1978         int rc = 0;
1979         ENTRY;
1980
1981         LASSERT(id);
1982         LASSERT(obd);
1983         
1984         dentry = mds_id2dentry(obd, id, NULL);
1985         if (IS_ERR(dentry))
1986                 GOTO(out, rc = PTR_ERR(dentry));
1987
1988         if (!dentry->d_inode) {
1989                 CERROR("Can't find object "DLID4".\n",
1990                        OLID4(id));
1991                 GOTO(out_dentry, rc = -EINVAL);
1992         }
1993
1994         handle = fsfilt_start(obd, dentry->d_inode,
1995                               FSFILT_OP_SETATTR, NULL);
1996         if (IS_ERR(handle))
1997                 GOTO(out_dentry, rc = PTR_ERR(handle));
1998
1999         rc = mds_update_inode_mid(obd, dentry->d_inode, handle,
2000                                   (struct lustre_id *)data);
2001         if (rc) {
2002                 CERROR("Can't update inode "DLID4" master id, "
2003                        "error = %d.\n", OLID4(id), rc);
2004                 GOTO(out_commit, rc);
2005         }
2006
2007         EXIT;
2008 out_commit:
2009         fsfilt_commit(obd, mds->mds_sb, dentry->d_inode,
2010                       handle, 0);
2011 out_dentry:
2012         l_dput(dentry);
2013 out:
2014         return rc;
2015 }
2016 EXPORT_SYMBOL(mds_update_mid);
2017
2018 /* read master MDS ID, which is stored in local inode EA. */
2019 int mds_read_mid(struct obd_device *obd, struct lustre_id *id,
2020                  void *data, int data_len)
2021 {
2022         struct dentry *dentry;
2023         int rc = 0;
2024         ENTRY;
2025
2026         LASSERT(id);
2027         LASSERT(obd);
2028         
2029         dentry = mds_id2dentry(obd, id, NULL);
2030         if (IS_ERR(dentry))
2031                 GOTO(out, rc = PTR_ERR(dentry));
2032
2033         if (!dentry->d_inode) {
2034                 CERROR("Can't find object "DLID4".\n",
2035                        OLID4(id));
2036                 GOTO(out_dentry, rc = -EINVAL);
2037         }
2038
2039         down(&dentry->d_inode->i_sem);
2040         rc = mds_read_inode_mid(obd, dentry->d_inode,
2041                                 (struct lustre_id *)data);
2042         up(&dentry->d_inode->i_sem);
2043         if (rc) {
2044                 CERROR("Can't read inode "DLID4" master id, "
2045                        "error = %d.\n", OLID4(id), rc);
2046                 GOTO(out_dentry, rc);
2047         }
2048
2049         EXIT;
2050 out_dentry:
2051         l_dput(dentry);
2052 out:
2053         return rc;
2054 }
2055 EXPORT_SYMBOL(mds_read_mid);
2056
2057 int mds_read_md(struct obd_device *obd, struct lustre_id *id, 
2058                 char **data, int *datalen)
2059 {
2060         struct dentry *dentry;
2061         struct mds_obd *mds = &obd->u.mds;
2062         int rc = 0, mea = 0;
2063         char *ea;
2064         ENTRY;
2065
2066         LASSERT(id);
2067         LASSERT(obd);
2068         
2069         dentry = mds_id2dentry(obd, id, NULL);
2070         if (IS_ERR(dentry))
2071                 GOTO(out, rc = PTR_ERR(dentry));
2072
2073         if (!dentry->d_inode) {
2074                 CERROR("Can't find object "DLID4".\n",
2075                        OLID4(id));
2076                 GOTO(out_dentry, rc = -EINVAL);
2077         }
2078         if (S_ISDIR(dentry->d_inode->i_mode)) {
2079                 *datalen = obd_packmd(mds->mds_md_exp, NULL, NULL);
2080                 mea = 1; 
2081         } else {
2082                 *datalen = obd_packmd(mds->mds_dt_exp, NULL, NULL); 
2083                 mea = 0;
2084         }
2085         OBD_ALLOC(ea, *datalen);
2086         if (!ea) {
2087                 *datalen = 0;
2088                 GOTO(out_dentry, rc = PTR_ERR(dentry));
2089         } 
2090         *data = ea;
2091         down(&dentry->d_inode->i_sem);
2092         rc = fsfilt_get_md(obd, dentry->d_inode, *data, *datalen,
2093                            (mea ? EA_MEA : EA_LOV));
2094         up(&dentry->d_inode->i_sem);
2095         
2096         if (rc < 0) 
2097                 CERROR("Error %d reading eadata for ino %lu\n",
2098                         rc, dentry->d_inode->i_ino);
2099 out_dentry:
2100         l_dput(dentry);
2101 out:
2102         RETURN(rc);
2103 }
2104 EXPORT_SYMBOL(mds_read_md);
2105
2106 int mds_reint(struct ptlrpc_request *req, int offset,
2107               struct lustre_handle *lockh)
2108 {
2109         struct mds_update_record *rec;
2110         struct mds_req_sec_desc *rsd;
2111         int rc;
2112         ENTRY;
2113
2114         OBD_ALLOC(rec, sizeof(*rec));
2115         if (rec == NULL)
2116                 RETURN(-ENOMEM);
2117
2118         rsd = lustre_swab_mds_secdesc(req, MDS_REQ_SECDESC_OFF);
2119         if (!rsd) {
2120                 CERROR("Can't unpack security desc\n");
2121                 GOTO(out, rc = -EFAULT);
2122         }
2123
2124         rc = mds_update_unpack(req, offset, rec);
2125         if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_UNPACK)) {
2126                 CERROR("invalid record\n");
2127                 GOTO(out, req->rq_status = -EINVAL);
2128         }
2129
2130         rc = mds_init_ucred(&rec->ur_uc, req, rsd);
2131         if (rc) {
2132                 GOTO(out, rc);
2133         }
2134
2135         /* rc will be used to interrupt a for loop over multiple records */
2136         rc = mds_reint_rec(rec, offset, req, lockh);
2137
2138  out:
2139         mds_exit_ucred(&rec->ur_uc);
2140         OBD_FREE(rec, sizeof(*rec));
2141         RETURN(rc);
2142 }
2143
2144 static int mds_filter_recovery_request(struct ptlrpc_request *req,
2145                                        struct obd_device *obd, int *process)
2146 {
2147         switch (req->rq_reqmsg->opc) {
2148         case MDS_CONNECT: /* This will never get here, but for completeness. */
2149         case OST_CONNECT: /* This will never get here, but for completeness. */
2150         case MDS_DISCONNECT:
2151         case OST_DISCONNECT:
2152                *process = 1;
2153                RETURN(0);
2154
2155         case MDS_CLOSE:
2156         case MDS_SYNC: /* used in unmounting */
2157         case OBD_PING:
2158         case MDS_REINT:
2159         case LDLM_ENQUEUE:
2160         case OST_CREATE:
2161                 *process = target_queue_recovery_request(req, obd);
2162                 RETURN(0);
2163
2164         default:
2165                 DEBUG_REQ(D_ERROR, req, "not permitted during recovery");
2166                 *process = 0;
2167                 /* XXX what should we set rq_status to here? */
2168                 req->rq_status = -EAGAIN;
2169                 RETURN(ptlrpc_error(req));
2170         }
2171 }
2172
2173 static char *reint_names[] = {
2174         [REINT_SETATTR] "setattr",
2175         [REINT_CREATE]  "create",
2176         [REINT_LINK]    "link",
2177         [REINT_UNLINK]  "unlink",
2178         [REINT_RENAME]  "rename",
2179         [REINT_OPEN]    "open",
2180 };
2181
/* Attribute bits handed to obdo_from_inode() when an inode is packed
 * into an ost_body reply. */
#define FILTER_VALID_FLAGS (OBD_MD_FLTYPE   | \
                            OBD_MD_FLMODE   | \
                            OBD_MD_FLGENER  | \
                            OBD_MD_FLSIZE   | \
                            OBD_MD_FLBLOCKS | \
                            OBD_MD_FLBLKSZ  | \
                            OBD_MD_FLATIME  | \
                            OBD_MD_FLMTIME  | \
                            OBD_MD_FLCTIME  | \
                            OBD_MD_FLID)
2186
2187 static void reconstruct_create(struct ptlrpc_request *req)
2188 {
2189         struct mds_export_data *med = &req->rq_export->exp_mds_data;
2190         struct mds_client_data *mcd = med->med_mcd;
2191         struct dentry *dentry;
2192         struct ost_body *body;
2193         struct lustre_id id;
2194         int rc;
2195         ENTRY;
2196
2197         /* copy rc, transno and disp; steal locks */
2198         mds_req_from_mcd(req, mcd);
2199         if (req->rq_status) {
2200                 EXIT;
2201                 return;
2202         }
2203
2204         id_gen(&id) = 0;
2205         id_group(&id) = 0;
2206
2207         id_ino(&id) = mcd->mcd_last_data;
2208         LASSERT(id_ino(&id) != 0);
2209
2210         dentry = mds_id2dentry(req2obd(req), &id, NULL);
2211         if (IS_ERR(dentry)) {
2212                 CERROR("can't find inode "LPU64"\n", id_ino(&id));
2213                 req->rq_status = PTR_ERR(dentry);
2214                 EXIT;
2215                 return;
2216         }
2217
2218         CWARN("reconstruct reply for x"LPU64" (remote ino) "LPU64" -> %lu/%u\n",
2219               req->rq_xid, id_ino(&id), dentry->d_inode->i_ino,
2220               dentry->d_inode->i_generation);
2221
2222         body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
2223         obdo_from_inode(&body->oa, dentry->d_inode, FILTER_VALID_FLAGS);
2224         body->oa.o_id = dentry->d_inode->i_ino;
2225         body->oa.o_generation = dentry->d_inode->i_generation;
2226         body->oa.o_valid |= OBD_MD_FLID | OBD_MD_FLGENER;
2227
2228         down(&dentry->d_inode->i_sem);
2229         rc = mds_read_inode_sid(req2obd(req), dentry->d_inode, &id);
2230         up(&dentry->d_inode->i_sem);
2231         if (rc) {
2232                 CERROR("Can't read inode self id, inode %lu, "
2233                        "rc %d\n", dentry->d_inode->i_ino, rc);
2234                 id_fid(&id) = 0;
2235         }
2236
2237         body->oa.o_fid = id_fid(&id);
2238         body->oa.o_mds = id_group(&id);
2239         l_dput(dentry);
2240
2241         EXIT;
2242 }
2243
2244 static int mds_inode_init_acl(struct obd_device *obd, void *handle,
2245                               struct dentry *de, void *xattr, int xattr_size)
2246 {
2247         struct inode *inode = de->d_inode;
2248         struct posix_acl *acl;
2249         mode_t mode;
2250         int rc = 0;
2251
2252         LASSERT(handle);
2253         LASSERT(inode);
2254         LASSERT(xattr);
2255         LASSERT(xattr_size > 0);
2256
2257         if (!inode->i_op->getxattr || !inode->i_op->setxattr) {
2258                 CERROR("backend fs dosen't support xattr\n");
2259                 return -EOPNOTSUPP;
2260         }
2261
2262         /* set default acl */
2263         if (S_ISDIR(inode->i_mode)) {
2264                 rc = inode->i_op->setxattr(de, XATTR_NAME_ACL_DEFAULT,
2265                                            xattr, xattr_size, 0);
2266                 if (rc) {
2267                         CERROR("set default acl err: %d\n", rc);
2268                         return rc;
2269                 }
2270         }
2271
2272         /* set access acl */
2273         acl = posix_acl_from_xattr(xattr, xattr_size);
2274         if (acl == NULL || IS_ERR(acl)) {
2275                 CERROR("insane attr data\n");
2276                 return PTR_ERR(acl);
2277         }
2278
2279         if (posix_acl_valid(acl)) {
2280                 CERROR("default acl not valid: %d\n", rc);
2281                 rc = -EFAULT;
2282                 goto out;
2283         }
2284
2285         mode = inode->i_mode;
2286         rc = posix_acl_create_masq(acl, &mode);
2287         if (rc < 0) {
2288                 CERROR("create masq err %d\n", rc);
2289                 goto out;
2290         }
2291
2292         if (inode->i_mode != mode) {
2293                 struct iattr iattr = { .ia_valid = ATTR_MODE,
2294                                        .ia_mode = mode };
2295                 int rc2;
2296
2297                 rc2 = fsfilt_setattr(obd, de, handle, &iattr, 0);
2298                 if (rc2) {
2299                         CERROR("setattr mode err: %d\n", rc2);
2300                         rc = rc2;
2301                         goto out;
2302                 }
2303         }
2304
2305         if (rc > 0) {
2306                 /* we didn't change acl except mode bits of some
2307                  * entries, so should be fit into original size.
2308                  */
2309                 rc = posix_acl_to_xattr(acl, xattr, xattr_size);
2310                 LASSERT(rc > 0);
2311
2312                 rc = inode->i_op->setxattr(de, XATTR_NAME_ACL_ACCESS,
2313                                            xattr, xattr_size, 0);
2314                 if (rc)
2315                         CERROR("set access acl err: %d\n", rc);
2316         }
2317 out:
2318         posix_acl_release(acl);
2319         return rc;
2320 }
2321
2322 static int mdt_obj_create(struct ptlrpc_request *req)
2323 {
2324         struct obd_device *obd = req->rq_export->exp_obd;
2325         struct mds_obd *mds = &obd->u.mds;
2326         struct ost_body *body, *repbody;
2327         void *acl = NULL;
2328         int acl_size;
2329         char idname[LL_ID_NAMELEN];
2330         int size = sizeof(*repbody);
2331         struct inode *parent_inode;
2332         struct lvfs_run_ctxt saved;
2333         int rc, cleanup_phase = 0;
2334         struct dentry *new = NULL;
2335         struct dentry_params dp;
2336         int mealen, flags = 0;
2337         struct lvfs_ucred uc;
2338         struct lustre_id id;
2339         struct mea *mea;
2340         void *handle = NULL;
2341         unsigned long cr_inum = 0;
2342         __u64 fid = 0;
2343         ENTRY;
2344        
2345         DEBUG_REQ(D_HA, req, "create remote object");
2346         parent_inode = mds->mds_unnamed_dir->d_inode;
2347
2348         body = lustre_swab_reqbuf(req, 0, sizeof(*body),
2349                                   lustre_swab_ost_body);
2350         if (body == NULL)
2351                 RETURN(-EFAULT);
2352
2353         /* acl data is packed transparently, no swab here */
2354         LASSERT(req->rq_reqmsg->bufcount >= 2);
2355         acl_size = req->rq_reqmsg->buflens[1];
2356         if (acl_size) {
2357                 acl = lustre_msg_buf(req->rq_reqmsg, 1, acl_size);
2358                 if (!acl) {
2359                         CERROR("No default acl buf?\n");
2360                         RETURN(-EFAULT);
2361                 }
2362         }
2363
2364         rc = lustre_pack_reply(req, 1, &size, NULL);
2365         if (rc)
2366                 RETURN(rc);
2367
2368         MDS_CHECK_RESENT(req, reconstruct_create(req));
2369
2370         uc.luc_lsd = NULL;
2371         uc.luc_ginfo = NULL;
2372         uc.luc_uid = body->oa.o_uid;
2373         uc.luc_gid = body->oa.o_gid;
2374         uc.luc_fsuid = body->oa.o_uid;
2375         uc.luc_fsgid = body->oa.o_gid;
2376
2377         push_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
2378         repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*repbody));
2379
2380         /* in REPLAY case inum should be given (client or other MDS fills it) */
2381         if (body->oa.o_id && ((body->oa.o_flags & OBD_FL_RECREATE_OBJS) ||
2382             (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY))) {
2383                 /*
2384                  * this is re-create request from MDS holding directory name.
2385                  * we have to lookup given ino/gen first. if it exists (good
2386                  * case) then there is nothing to do. if it does not then we
2387                  * have to recreate it.
2388                  */
2389                 id_ino(&id) = body->oa.o_id;
2390                 id_gen(&id) = body->oa.o_generation;
2391  
2392                 new = mds_id2dentry(obd, &id, NULL);
2393                 if (!IS_ERR(new) && new->d_inode) {
2394                         struct lustre_id sid;
2395                                 
2396                         CDEBUG(D_OTHER, "mkdir repairing %lu/%lu\n",
2397                                (unsigned long)id_ino(&id),
2398                                (unsigned long)id_gen(&id));
2399                         
2400                         obdo_from_inode(&repbody->oa, new->d_inode,
2401                                         FILTER_VALID_FLAGS);
2402                         
2403                         repbody->oa.o_id = new->d_inode->i_ino;
2404                         repbody->oa.o_generation = new->d_inode->i_generation;
2405                         repbody->oa.o_valid |= OBD_MD_FLID | OBD_MD_FLGENER;
2406                         cleanup_phase = 1;
2407
2408                         down(&new->d_inode->i_sem);
2409                         rc = mds_read_inode_sid(obd, new->d_inode, &sid);
2410                         up(&new->d_inode->i_sem);
2411                         if (rc) {
2412                                 CERROR("Can't read inode self id "
2413                                        "inode %lu, rc %d.\n",
2414                                        new->d_inode->i_ino, rc);
2415                                 GOTO(cleanup, rc);
2416                         }
2417
2418                         repbody->oa.o_fid = id_fid(&sid);
2419                         repbody->oa.o_mds = id_group(&sid);
2420                         LASSERT(id_fid(&sid) != 0);
2421
2422                         /* 
2423                          * here we could use fid passed in body->oa.o_fid and
2424                          * thus avoid mds_read_inode_sid().
2425                          */
2426                         cr_inum = new->d_inode->i_ino;
2427                         GOTO(cleanup, rc = 0);
2428                 }
2429         }
2430         
2431         down(&parent_inode->i_sem);
2432         handle = fsfilt_start(obd, parent_inode, FSFILT_OP_MKDIR, NULL);
2433         if (IS_ERR(handle)) {
2434                 up(&parent_inode->i_sem);
2435                 CERROR("fsfilt_start() failed, rc = %d\n",
2436                        (int)PTR_ERR(handle));
2437                 GOTO(cleanup, rc = PTR_ERR(handle));
2438         }
2439         cleanup_phase = 1; /* transaction */
2440
2441 repeat:
2442         rc = sprintf(idname, "%u.%u", ll_insecure_random_int(), current->pid);
2443         new = lookup_one_len(idname, mds->mds_unnamed_dir, rc);
2444         if (IS_ERR(new)) {
2445                 CERROR("%s: can't lookup new inode (%s) for mkdir: %d\n",
2446                        obd->obd_name, idname, (int) PTR_ERR(new));
2447                 fsfilt_commit(obd, mds->mds_sb, new->d_inode, handle, 0);
2448                 up(&parent_inode->i_sem);
2449                 RETURN(PTR_ERR(new));
2450         } else if (new->d_inode) {
2451                 CERROR("%s: name exists. repeat\n", obd->obd_name);
2452                 goto repeat;
2453         }
2454         if ((body->oa.o_flags & OBD_FL_RECREATE_OBJS) ||
2455              lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
2456                 fid = body->oa.o_fid;
2457         } else { 
2458                 fid = mds_alloc_fid(obd);
2459         }
2460         new->d_fsdata = (void *)&dp;
2461         dp.p_inum = 0;
2462         dp.p_ptr = req;
2463         dp.p_fid = fid;
2464         dp.p_group = mds->mds_num;
2465
2466         if ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) ||
2467             (body->oa.o_flags & OBD_FL_RECREATE_OBJS)) {
2468                 LASSERT(body->oa.o_id != 0);
2469                 dp.p_inum = body->oa.o_id;
2470                 DEBUG_REQ(D_HA, req, "replay create obj %lu/%lu",
2471                           (unsigned long)body->oa.o_id,
2472                           (unsigned long)body->oa.o_generation);
2473         }
2474
2475         rc = vfs_mkdir(parent_inode, new, body->oa.o_mode);
2476         if (rc == 0) {
2477                 if (acl) {
2478                         rc = mds_inode_init_acl(obd, handle, new,
2479                                                 acl, acl_size);
2480                         if (rc) {
2481                                 up(&parent_inode->i_sem);
2482                                 GOTO(cleanup, rc);
2483                         }
2484                 }
2485                 if ((body->oa.o_flags & OBD_FL_RECREATE_OBJS) ||
2486                     lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
2487                         new->d_inode->i_generation = body->oa.o_generation;
2488                         mark_inode_dirty(new->d_inode);
2489                         
2490                         /*
2491                          * avoiding asserts in cache flush case, as
2492                          * @body->oa.o_id should be zero.
2493                          */
2494                         if (body->oa.o_id) {
2495                                 LASSERTF(body->oa.o_id == new->d_inode->i_ino, 
2496                                          "BUG 3550: failed to recreate obj "
2497                                          LPU64" -> %lu\n", body->oa.o_id,
2498                                          new->d_inode->i_ino);
2499                                 
2500                                 LASSERTF(body->oa.o_generation == 
2501                                          new->d_inode->i_generation,
2502                                          "BUG 3550: failed to recreate obj/gen "
2503                                          LPU64"/%u -> %lu/%u\n", body->oa.o_id,
2504                                          body->oa.o_generation,
2505                                          new->d_inode->i_ino, 
2506                                          new->d_inode->i_generation);
2507                         }
2508                 }
2509                 
2510                 obdo_from_inode(&repbody->oa, new->d_inode, FILTER_VALID_FLAGS);
2511                 repbody->oa.o_id = new->d_inode->i_ino;
2512                 repbody->oa.o_generation = new->d_inode->i_generation;
2513                 repbody->oa.o_valid |= OBD_MD_FLID | OBD_MD_FLGENER | OBD_MD_FID;
2514
2515                 if ((body->oa.o_flags & OBD_FL_RECREATE_OBJS) ||
2516                     lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
2517                         id_group(&id) = mds->mds_num;
2518                 
2519                         LASSERT(body->oa.o_fid != 0);
2520                         id_fid(&id) = body->oa.o_fid;
2521
2522                         LASSERT(body->oa.o_id != 0);
2523                         id_ino(&id) = repbody->oa.o_id;
2524                         id_gen(&id) = repbody->oa.o_generation;
2525                 
2526                         down(&new->d_inode->i_sem);
2527                         rc = mds_update_inode_sid(obd, new->d_inode, handle, &id);
2528                         up(&new->d_inode->i_sem);
2529
2530                         /* 
2531                          * make sure, that fid is up-to-date.
2532                          */
2533                         mds_set_last_fid(obd, id_fid(&id));
2534                 } else {
2535                         /*
2536                          * allocate new sid, as object is created from scratch
2537                          * and this is not replay.
2538                          */
2539                         down(&new->d_inode->i_sem);
2540                         rc = mds_set_inode_sid(obd, new->d_inode, handle, &id, fid);
2541                         up(&new->d_inode->i_sem);
2542                 }
2543                 if (rc) {
2544                         CERROR("Can't update lustre ID for inode %lu, "
2545                                "error = %d\n", new->d_inode->i_ino, rc);
2546                         GOTO(cleanup, rc);
2547                 }
2548
2549                 /* initializing o_fid after it is allocated. */
2550                 repbody->oa.o_fid = id_fid(&id);
2551                 repbody->oa.o_mds = id_group(&id);
2552
2553                 rc = fsfilt_del_dir_entry(obd, new);
2554                 up(&parent_inode->i_sem);
2555                 if (rc) {
2556                         CERROR("can't remove name for object: %d\n", rc);
2557                         GOTO(cleanup, rc);
2558                 }
2559                 
2560                 cleanup_phase = 2; /* created directory object */
2561
2562                 CDEBUG(D_OTHER, "created dirobj: %lu/%lu mode %o\n",
2563                        (unsigned long)new->d_inode->i_ino,
2564                        (unsigned long)new->d_inode->i_generation,
2565                        (unsigned)new->d_inode->i_mode);
2566                 cr_inum = new->d_inode->i_ino;
2567         } else {
2568                 up(&parent_inode->i_sem);
2569                 CERROR("%s: can't create dirobj: %d\n", obd->obd_name, rc);
2570                 GOTO(cleanup, rc);
2571         }
2572
2573         if (body->oa.o_valid & OBD_MD_FLID) {
2574                 /* this is new object for splitted dir. We have to prevent
2575                  * recursive splitting on it -bzzz */
2576                 mealen = obd_size_diskmd(mds->mds_md_exp, NULL);
2577
2578                 OBD_ALLOC(mea, mealen);
2579                 if (mea == NULL)
2580                         GOTO(cleanup, rc = -ENOMEM);
2581
2582                 mea->mea_magic = MEA_MAGIC_ALL_CHARS;
2583                 mea->mea_master = 0;
2584                 mea->mea_count = 0;
2585
2586                 down(&new->d_inode->i_sem);
2587                 rc = fsfilt_set_md(obd, new->d_inode, handle,
2588                                    mea, mealen, EA_MEA);
2589                 up(&new->d_inode->i_sem);
2590                 if (rc)
2591                         CERROR("fsfilt_set_md() failed, "
2592                                "rc = %d\n", rc);
2593
2594                 OBD_FREE(mea, mealen);
2595                 
2596                 CDEBUG(D_OTHER, "%s: mark non-splittable %lu/%u - %d\n",
2597                        obd->obd_name, new->d_inode->i_ino,
2598                        new->d_inode->i_generation, flags);
2599         } else if (body->oa.o_easize) {
2600                 /* we pass LCK_EX to split routine to signal that we have
2601                  * exclusive access to the directory. simple because nobody
2602                  * knows it already exists -bzzz */
2603                 rc = mds_try_to_split_dir(obd, new, NULL,
2604                                           body->oa.o_easize, LCK_EX);
2605                 if (rc < 0) {
2606                         CERROR("Can't split directory %lu, error = %d.\n",
2607                                new->d_inode->i_ino, rc);
2608                 } else {
2609                         rc = 0;
2610                 }
2611         }
2612
2613         EXIT;
2614 cleanup:
2615         switch (cleanup_phase) {
2616         case 2: /* object has been created, but we'll may want to replay it later */
2617                 if (rc == 0)
2618                         ptlrpc_require_repack(req);
2619         case 1: /* transaction */
2620                 rc = mds_finish_transno(mds, parent_inode, handle,
2621                                         req, rc, cr_inum);
2622         }
2623
2624         l_dput(new);
2625         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
2626         return rc;
2627 }
2628
2629 static int mdt_get_info(struct ptlrpc_request *req)
2630 {
2631         struct obd_export *exp = req->rq_export;
2632         int keylen, rc = 0;
2633         char *key;
2634         ENTRY;
2635
2636         key = lustre_msg_buf(req->rq_reqmsg, 0, 1);
2637         if (key == NULL) {
2638                 DEBUG_REQ(D_HA, req, "no get_info key");
2639                 RETURN(-EFAULT);
2640         }
2641         keylen = req->rq_reqmsg->buflens[0];
2642
2643         if ((keylen < strlen("mdsize") || strcmp(key, "mdsize") != 0) &&
2644             (keylen < strlen("mdsnum") || strcmp(key, "mdsnum") != 0) &&
2645             (keylen < strlen("rootid") || strcmp(key, "rootid") != 0))
2646                 RETURN(-EPROTO);
2647
2648         if (keylen >= strlen("rootid") && !strcmp(key, "rootid")) {
2649                 struct lustre_id *reply;
2650                 int size = sizeof(*reply);
2651                 
2652                 rc = lustre_pack_reply(req, 1, &size, NULL);
2653                 if (rc)
2654                         RETURN(rc);
2655
2656                 reply = lustre_msg_buf(req->rq_repmsg, 0, size);
2657                 rc = obd_get_info(exp, keylen, key, (__u32 *)&size, reply);
2658         } else {
2659                 obd_id *reply;
2660                 int size = sizeof(*reply);
2661                 
2662                 rc = lustre_pack_reply(req, 1, &size, NULL);
2663                 if (rc)
2664                         RETURN(rc);
2665
2666                 reply = lustre_msg_buf(req->rq_repmsg, 0, size);
2667                 rc = obd_get_info(exp, keylen, key, (__u32 *)&size, reply);
2668         }
2669
2670         req->rq_repmsg->status = 0;
2671         RETURN(rc);
2672 }
2673
2674 static int mds_set_info(struct obd_export *exp, __u32 keylen,
2675                         void *key, __u32 vallen, void *val)
2676 {
2677         struct obd_device *obd;
2678         struct mds_obd *mds;
2679         int rc = 0;
2680         ENTRY;
2681
2682         obd = class_exp2obd(exp);
2683         if (obd == NULL) {
2684                 CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n",
2685                        exp->exp_handle.h_cookie);
2686                 RETURN(-EINVAL);
2687         }
2688
2689         mds = &obd->u.mds;
2690         if (keylen >= strlen("mds_type") &&
2691              memcmp(key, "mds_type", keylen) == 0) {
2692                 int valsize;
2693                 __u32 group;
2694                 
2695                 CDEBUG(D_IOCTL, "set mds type to %x\n", *(int*)val);
2696                 
2697                 mds->mds_obd_type = *(int*)val;
2698                 group = FILTER_GROUP_FIRST_MDS + mds->mds_obd_type;
2699                 valsize = sizeof(group);
2700                 
2701                 /* mds number has been changed, so the corresponding obdfilter
2702                  * exp need to be changed too. */
2703                 rc = obd_set_info(mds->mds_dt_exp, strlen("mds_conn"),
2704                                   "mds_conn", valsize, &group);
2705                 RETURN(rc);
2706         }
2707         if (keylen >= strlen("crypto_type") &&
2708              memcmp(key, "crypto_type", keylen) == 0) {
2709                 rc = mds_set_crypto_type(obd, val, vallen); 
2710                 RETURN(rc);
2711         }
2712
2713         CDEBUG(D_IOCTL, "invalid key\n");
2714         RETURN(-EINVAL);
2715 }
2716
2717 static int mdt_set_info(struct ptlrpc_request *req)
2718 {
2719         char *key, *val;
2720         struct obd_export *exp = req->rq_export;
2721         int keylen, rc = 0, vallen;
2722         ENTRY;
2723
2724         key = lustre_msg_buf(req->rq_reqmsg, 0, 1);
2725         if (key == NULL) {
2726                 DEBUG_REQ(D_HA, req, "no set_info key");
2727                 RETURN(-EFAULT);
2728         }
2729         keylen = req->rq_reqmsg->buflens[0];
2730
2731         if ((keylen == strlen("mds_type") &&
2732             memcmp(key, "mds_type", keylen) == 0) ||
2733             (keylen == strlen("crypto_type") &&
2734             memcmp(key, "crypto_type", keylen) == 0)) {
2735                 rc = lustre_pack_reply(req, 0, NULL, NULL);
2736                 if (rc)
2737                         RETURN(rc);
2738                 
2739                 val = lustre_msg_buf(req->rq_reqmsg, 1, 0);
2740                 vallen = req->rq_reqmsg->buflens[1];
2741
2742                 rc = obd_set_info(exp, keylen, key, vallen, val);
2743                 req->rq_repmsg->status = 0;
2744                 RETURN(rc);
2745         }
2746         CDEBUG(D_IOCTL, "invalid key\n");
2747         RETURN(-EINVAL);
2748 }
2749
2750 static void mds_revoke_export_locks(struct obd_export *exp)
2751 {
2752         struct list_head *locklist = &exp->exp_ldlm_data.led_held_locks;
2753         struct list_head work;
2754         struct ldlm_lock *lock, *next;
2755         struct ldlm_lock_desc desc;
2756
2757         if (!exp->u.eu_mds_data.med_remote)
2758                 return;
2759
2760         ENTRY;
2761         CERROR("implement right locking here! -bzzz\n");
2762         INIT_LIST_HEAD(&work);
2763         spin_lock(&exp->exp_ldlm_data.led_lock);
2764         list_for_each_entry_safe(lock, next, locklist, l_export_chain) {
2765
2766                 lock_res_and_lock(lock);
2767                 if (lock->l_req_mode != lock->l_granted_mode) {
2768                         unlock_res_and_lock(lock);
2769                         continue;
2770                 }
2771
2772                 LASSERT(lock->l_resource);
2773                 if (lock->l_resource->lr_type != LDLM_IBITS &&
2774                     lock->l_resource->lr_type != LDLM_PLAIN) {
2775                         unlock_res_and_lock(lock);
2776                         continue;
2777                 }
2778
2779                 if (lock->l_flags & LDLM_FL_AST_SENT) {
2780                         unlock_res_and_lock(lock);
2781                         continue;
2782                 }
2783
2784                 lock->l_flags |= LDLM_FL_AST_SENT;
2785                 unlock_res_and_lock(lock);
2786
2787                 /* the desc just pretend to exclusive */
2788                 ldlm_lock2desc(lock, &desc);
2789                 desc.l_req_mode = LCK_EX;
2790                 desc.l_granted_mode = 0;
2791
2792                 lock->l_blocking_ast(lock, &desc, NULL, LDLM_CB_BLOCKING);
2793         }
2794         spin_unlock(&exp->exp_ldlm_data.led_lock);
2795
2796         EXIT;
2797 }
2798
2799 static int mds_msg_check_version(struct lustre_msg *msg)
2800 {
2801         int rc;
2802
2803         switch (msg->opc) {
2804         case MDS_CONNECT:
2805         case MDS_DISCONNECT:
2806         case OBD_PING:
2807                 rc = lustre_msg_check_version(msg, LUSTRE_OBD_VERSION);
2808                 if (rc)
2809                         CERROR("bad opc %u version %08x, expecting %08x\n",
2810                                msg->opc, msg->version, LUSTRE_OBD_VERSION);
2811                 break;
2812         case MDS_STATFS:
2813         case MDS_GETSTATUS:
2814         case MDS_GETATTR:
2815         case MDS_GETATTR_LOCK:
2816         case MDS_ACCESS_CHECK:
2817         case MDS_READPAGE:
2818         case MDS_REINT:
2819         case MDS_CLOSE:
2820         case MDS_DONE_WRITING:
2821         case MDS_PIN:
2822         case MDS_SYNC:
2823                 rc = lustre_msg_check_version(msg, LUSTRE_MDS_VERSION);
2824                 if (rc)
2825                         CERROR("bad opc %u version %08x, expecting %08x\n",
2826                                msg->opc, msg->version, LUSTRE_MDS_VERSION);
2827                 break;
2828         case LDLM_ENQUEUE:
2829         case LDLM_CONVERT:
2830         case LDLM_BL_CALLBACK:
2831         case LDLM_CP_CALLBACK:
2832                 rc = lustre_msg_check_version(msg, LUSTRE_DLM_VERSION);
2833                 if (rc)
2834                         CERROR("bad opc %u version %08x, expecting %08x\n",
2835                                msg->opc, msg->version, LUSTRE_DLM_VERSION);
2836                 break;
2837         case OBD_LOG_CANCEL:
2838         case LLOG_ORIGIN_HANDLE_OPEN:
2839         case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
2840         case LLOG_ORIGIN_HANDLE_PREV_BLOCK:
2841         case LLOG_ORIGIN_HANDLE_READ_HEADER:
2842         case LLOG_ORIGIN_HANDLE_CLOSE:
2843         case LLOG_CATINFO:
2844                 rc = lustre_msg_check_version(msg, LUSTRE_LOG_VERSION);
2845                 if (rc)
2846                         CERROR("bad opc %u version %08x, expecting %08x\n",
2847                                msg->opc, msg->version, LUSTRE_LOG_VERSION);
2848                 break;
2849         case OST_CREATE:
2850         case OST_WRITE:
2851         case OST_GET_INFO:
2852         case OST_SET_INFO:
2853                 rc = lustre_msg_check_version(msg, LUSTRE_OBD_VERSION);
2854                 if (rc)
2855                         CERROR("bad opc %u version %08x, expecting %08x\n",
2856                                msg->opc, msg->version, LUSTRE_OBD_VERSION);
2857                 break;
2858         case SEC_INIT:
2859         case SEC_INIT_CONTINUE:
2860         case SEC_FINI:
2861                 rc = 0;
2862                 break;
2863         default:
2864                 CERROR("MDS unknown opcode %d\n", msg->opc);
2865                 rc = -ENOTSUPP;
2866                 break;
2867         }
2868
2869         return rc;
2870 }
2871
2872 int mds_handle(struct ptlrpc_request *req)
2873 {
2874         int should_process, fail = OBD_FAIL_MDS_ALL_REPLY_NET;
2875         struct obd_device *obd = NULL;
2876         struct mds_obd *mds = NULL; /* quell gcc overwarning */
2877         int rc = 0;
2878         ENTRY;
2879
2880         OBD_FAIL_RETURN(OBD_FAIL_MDS_ALL_REQUEST_NET | OBD_FAIL_ONCE, 0);
2881
2882         rc = mds_msg_check_version(req->rq_reqmsg);
2883         if (rc) {
2884                 CERROR("MDS drop mal-formed request\n");
2885                 RETURN(rc);
2886         }
2887
2888         /* Security opc should NOT trigger any recovery events */
2889         if (req->rq_reqmsg->opc == SEC_INIT ||
2890             req->rq_reqmsg->opc == SEC_INIT_CONTINUE) {
2891                 if (req->rq_export) {
2892                         mds_req_add_idmapping(req,
2893                                               &req->rq_export->exp_mds_data);
2894                         mds_revoke_export_locks(req->rq_export);
2895                 }
2896                 GOTO(out, rc = 0);
2897         } else if (req->rq_reqmsg->opc == SEC_FINI) {
2898                 if (req->rq_export) {
2899                         mds_req_del_idmapping(req,
2900                                               &req->rq_export->exp_mds_data);
2901                         mds_revoke_export_locks(req->rq_export);
2902                 }
2903                 GOTO(out, rc = 0);
2904         }
2905
2906         LASSERT(current->journal_info == NULL);
2907         /* XXX identical to OST */
2908         if (req->rq_reqmsg->opc != MDS_CONNECT) {
2909                 struct mds_export_data *med;
2910                 int recovering;
2911
2912                 if (req->rq_export == NULL) {
2913                         CERROR("operation %d on unconnected MDS from %s\n",
2914                                req->rq_reqmsg->opc,
2915                                req->rq_peerstr);
2916                         req->rq_status = -ENOTCONN;
2917                         GOTO(out, rc = -ENOTCONN);
2918                 }
2919
2920                 med = &req->rq_export->exp_mds_data;
2921                 obd = req->rq_export->exp_obd;
2922                 mds = &obd->u.mds;
2923
2924                 /* sanity check: if the xid matches, the request must
2925                  * be marked as a resent or replayed */
2926                 if (req->rq_xid == le64_to_cpu(med->med_mcd->mcd_last_xid) ||
2927                    req->rq_xid == le64_to_cpu(med->med_mcd->mcd_last_close_xid)) {
2928                         LASSERTF(lustre_msg_get_flags(req->rq_reqmsg) &
2929                                  (MSG_RESENT | MSG_REPLAY),
2930                                  "rq_xid "LPU64" matches last_xid, "
2931                                  "expected RESENT flag\n",
2932                                  req->rq_xid);
2933                 }
2934                 /* else: note the opposite is not always true; a
2935                  * RESENT req after a failover will usually not match
2936                  * the last_xid, since it was likely never
2937                  * committed. A REPLAYed request will almost never
2938                  * match the last xid, however it could for a
2939                  * committed, but still retained, open. */
2940
2941                 spin_lock_bh(&obd->obd_processing_task_lock);
2942                 recovering = obd->obd_recovering;
2943                 spin_unlock_bh(&obd->obd_processing_task_lock);
2944                 if (recovering) {
2945                         rc = mds_filter_recovery_request(req, obd,
2946                                                          &should_process);
2947                         if (rc || should_process == 0) {
2948                                 RETURN(rc);
2949                         } else if (should_process < 0) {
2950                                 req->rq_status = should_process;
2951                                 rc = ptlrpc_error(req);
2952                                 RETURN(rc);
2953                         }
2954                 }
2955         }
2956
2957         switch (req->rq_reqmsg->opc) {
2958         case MDS_CONNECT:
2959                 DEBUG_REQ(D_INODE, req, "connect");
2960                 OBD_FAIL_RETURN(OBD_FAIL_MDS_CONNECT_NET, 0);
2961                 rc = target_handle_connect(req);
2962                 if (!rc) {
2963                         struct mds_export_data *med;
2964
2965                         LASSERT(req->rq_export);
2966                         med = &req->rq_export->u.eu_mds_data;
2967                         mds_init_export_data(req, med);
2968                         mds_req_add_idmapping(req, med);
2969
2970                         /* Now that we have an export, set mds. */
2971                         obd = req->rq_export->exp_obd;
2972                         mds = mds_req2mds(req);
2973                 }
2974                 break;
2975
2976         case MDS_DISCONNECT:
2977                 DEBUG_REQ(D_INODE, req, "disconnect");
2978                 OBD_FAIL_RETURN(OBD_FAIL_MDS_DISCONNECT_NET, 0);
2979                 rc = target_handle_disconnect(req);
2980                 req->rq_status = rc;            /* superfluous? */
2981                 break;
2982
2983         case MDS_GETSTATUS:
2984                 DEBUG_REQ(D_INODE, req, "getstatus");
2985                 OBD_FAIL_RETURN(OBD_FAIL_MDS_GETSTATUS_NET, 0);
2986                 rc = mds_getstatus(req);
2987                 break;
2988
2989         case MDS_GETATTR:
2990                 DEBUG_REQ(D_INODE, req, "getattr");
2991                 OBD_FAIL_RETURN(OBD_FAIL_MDS_GETATTR_NET, 0);
2992                 rc = mds_getattr(req, MDS_REQ_REC_OFF);
2993                 break;
2994
2995         case MDS_ACCESS_CHECK:
2996                 DEBUG_REQ(D_INODE, req, "access_check");
2997                 OBD_FAIL_RETURN(OBD_FAIL_MDS_ACCESS_CHECK_NET, 0);
2998                 rc = mds_access_check(req, MDS_REQ_REC_OFF);
2999                 break;
3000
3001         case MDS_GETATTR_LOCK: {
3002                 struct lustre_handle lockh;
3003                 DEBUG_REQ(D_INODE, req, "getattr_lock");
3004                 OBD_FAIL_RETURN(OBD_FAIL_MDS_GETATTR_LOCK_NET, 0);
3005
3006                 /* If this request gets a reconstructed reply, we won't be
3007                  * acquiring any new locks in mds_getattr_lock, so we don't
3008                  * want to cancel.
3009                  */
3010                 lockh.cookie = 0;
3011                 rc = mds_getattr_lock(req, MDS_REQ_REC_OFF, &lockh,
3012                                       MDS_INODELOCK_UPDATE);
3013                 /* this non-intent call (from an ioctl) is special */
3014                 req->rq_status = rc;
3015                 if (rc == 0 && lockh.cookie)
3016                         ldlm_lock_decref(&lockh, LCK_PR);
3017                 break;
3018         }
3019         case MDS_STATFS:
3020                 DEBUG_REQ(D_INODE, req, "statfs");
3021                 OBD_FAIL_RETURN(OBD_FAIL_MDS_STATFS_NET, 0);
3022                 rc = mds_statfs(req);
3023                 break;
3024
3025         case MDS_READPAGE:
3026                 DEBUG_REQ(D_INODE, req, "readpage");
3027                 OBD_FAIL_RETURN(OBD_FAIL_MDS_READPAGE_NET, 0);
3028                 rc = mds_readpage(req, MDS_REQ_REC_OFF);
3029
3030                 if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_SENDPAGE)) {
3031                         if (req->rq_reply_state) {
3032                                 lustre_free_reply_state (req->rq_reply_state);
3033                                 req->rq_reply_state = NULL;
3034                         }
3035                         RETURN(0);
3036                 }
3037
3038                 break;
3039         case MDS_REINT: {
3040                 __u32 *opcp = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_REC_OFF,
3041                                              sizeof (*opcp));
3042                 __u32  opc;
3043                 int size[3] = {sizeof(struct mds_body), mds->mds_max_mdsize,
3044                                mds->mds_max_cookiesize};
3045                 int bufcount;
3046
3047                 /* NB only peek inside req now; mds_reint() will swab it */
3048                 if (opcp == NULL) {
3049                         CERROR ("Can't inspect opcode\n");
3050                         rc = -EINVAL;
3051                         break;
3052                 }
3053                 opc = *opcp;
3054                 if (lustre_msg_swabbed (req->rq_reqmsg))
3055                         __swab32s(&opc);
3056
3057                 DEBUG_REQ(D_INODE, req, "reint %d (%s)", opc,
3058                           (opc < sizeof(reint_names) / sizeof(reint_names[0]) ||
3059                            reint_names[opc] == NULL) ? reint_names[opc] :
3060                                                        "unknown opcode");
3061
3062                 OBD_FAIL_RETURN(OBD_FAIL_MDS_REINT_NET, 0);
3063
3064                 if (opc == REINT_UNLINK || opc == REINT_RENAME)
3065                         bufcount = 3;
3066                 else if (opc == REINT_OPEN)
3067                         bufcount = 2;
3068                 else
3069                         bufcount = 1;
3070
3071                 /* for SETATTR: I have different reply setting for
3072                  * remote setfacl, so delay the reply buffer allocation.
3073                  */
3074                 if (opc != REINT_SETATTR) {
3075                         rc = lustre_pack_reply(req, bufcount, size, NULL);
3076                         if (rc)
3077                                 break;
3078                 }
3079
3080                 rc = mds_reint(req, MDS_REQ_REC_OFF, NULL);
3081                 fail = OBD_FAIL_MDS_REINT_NET_REP;
3082                 break;
3083         }
3084
3085         case MDS_CLOSE:
3086                 DEBUG_REQ(D_INODE, req, "close");
3087                 OBD_FAIL_RETURN(OBD_FAIL_MDS_CLOSE_NET, 0);
3088                 rc = mds_close(req, MDS_REQ_REC_OFF);
3089                 break;
3090
3091         case MDS_DONE_WRITING:
3092                 DEBUG_REQ(D_INODE, req, "done_writing");
3093                 OBD_FAIL_RETURN(OBD_FAIL_MDS_DONE_WRITING_NET, 0);
3094                 rc = mds_done_writing(req, MDS_REQ_REC_OFF);
3095                 break;
3096
3097         case MDS_PIN:
3098                 DEBUG_REQ(D_INODE, req, "pin");
3099                 OBD_FAIL_RETURN(OBD_FAIL_MDS_PIN_NET, 0);
3100                 rc = mds_pin(req, MDS_REQ_REC_OFF);
3101                 break;
3102
3103         case MDS_SYNC:
3104                 DEBUG_REQ(D_INODE, req, "sync");
3105                 OBD_FAIL_RETURN(OBD_FAIL_MDS_SYNC_NET, 0);
3106                 rc = mds_sync(req, MDS_REQ_REC_OFF);
3107                 break;
3108
3109         case OBD_PING:
3110                 DEBUG_REQ(D_INODE, req, "ping");
3111                 rc = target_handle_ping(req);
3112                 break;
3113
3114         case OBD_LOG_CANCEL:
3115                 CDEBUG(D_INODE, "log cancel\n");
3116                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_NET, 0);
3117                 rc = -ENOTSUPP; /* la la la */
3118                 break;
3119
3120         case LDLM_ENQUEUE:
3121                 DEBUG_REQ(D_INODE, req, "enqueue");
3122                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_ENQUEUE, 0);
3123                 rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast,
3124                                          ldlm_server_blocking_ast, NULL);
3125                 fail = OBD_FAIL_LDLM_REPLY;
3126                 break;
3127         case LDLM_CONVERT:
3128                 DEBUG_REQ(D_INODE, req, "convert");
3129                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_CONVERT, 0);
3130                 rc = ldlm_handle_convert(req);
3131                 break;
3132         case LDLM_BL_CALLBACK:
3133         case LDLM_CP_CALLBACK:
3134                 DEBUG_REQ(D_INODE, req, "callback");
3135                 CERROR("callbacks should not happen on MDS\n");
3136                 LBUG();
3137                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_BL_CALLBACK, 0);
3138                 break;
3139         case LLOG_ORIGIN_HANDLE_OPEN:
3140                 DEBUG_REQ(D_INODE, req, "llog_init");
3141                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
3142                 rc = llog_origin_handle_open(req);
3143                 break;
3144         case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
3145                 DEBUG_REQ(D_INODE, req, "llog next block");
3146                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
3147                 rc = llog_origin_handle_next_block(req);
3148                 break;
3149         case LLOG_ORIGIN_HANDLE_PREV_BLOCK:
3150                 DEBUG_REQ(D_INODE, req, "llog prev block");
3151                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
3152                 rc = llog_origin_handle_prev_block(req);
3153                 break;
3154         case LLOG_ORIGIN_HANDLE_READ_HEADER:
3155                 DEBUG_REQ(D_INODE, req, "llog read header");
3156                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
3157                 rc = llog_origin_handle_read_header(req);
3158                 break;
3159         case LLOG_ORIGIN_HANDLE_CLOSE:
3160                 DEBUG_REQ(D_INODE, req, "llog close");
3161                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
3162                 rc = llog_origin_handle_close(req);
3163                 break;
3164         case OST_CREATE:
3165                 DEBUG_REQ(D_INODE, req, "ost_create");
3166                 rc = mdt_obj_create(req);
3167                 break;
3168         case OST_GET_INFO:
3169                 DEBUG_REQ(D_INODE, req, "get_info");
3170                 rc = mdt_get_info(req);
3171                 break;
3172         case OST_SET_INFO:
3173                 DEBUG_REQ(D_INODE, req, "set_info");
3174                 rc = mdt_set_info(req);
3175                 break;
3176         case OST_WRITE:
3177                 CDEBUG(D_INODE, "write\n");
3178                 OBD_FAIL_RETURN(OBD_FAIL_OST_BRW_NET, 0);
3179                 rc = ost_brw_write(req, NULL);
3180                 LASSERT(current->journal_info == NULL);
3181                 /* ost_brw_write() sends its own replies */
3182                 RETURN(rc);
3183                 break;
3184         case LLOG_CATINFO:
3185                 DEBUG_REQ(D_INODE, req, "llog catinfo");
3186                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
3187                 rc = llog_catinfo(req);
3188                 break;
3189         default:
3190                 req->rq_status = -ENOTSUPP;
3191                 rc = ptlrpc_error(req);
3192                 RETURN(rc);
3193         }
3194
3195         LASSERT(current->journal_info == NULL);
3196
3197         EXIT;
3198
3199         /* If we're DISCONNECTing, the mds_export_data is already freed */
3200         if (!rc && req->rq_reqmsg->opc != MDS_DISCONNECT) {
3201                 struct mds_export_data *med = &req->rq_export->exp_mds_data;
3202                 struct obd_device *obd = list_entry(mds, struct obd_device,
3203                                                     u.mds);
3204                 req->rq_repmsg->last_xid =
3205                         le64_to_cpu(med->med_mcd->mcd_last_xid);
3206
3207                 if (!obd->obd_no_transno) {
3208                         req->rq_repmsg->last_committed =
3209                                 obd->obd_last_committed;
3210                 } else {