Whamcloud - gitweb
41b73c5fd3d284fa2c31f06cc4d485d04790be65
[fs/lustre-release.git] / lustre / mds / handler.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  lustre/mds/handler.c
5  *  Lustre Metadata Server (mds) request handler
6  *
7  *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
8  *   Author: Peter Braam <braam@clusterfs.com>
9  *   Author: Andreas Dilger <adilger@clusterfs.com>
10  *   Author: Phil Schwan <phil@clusterfs.com>
11  *   Author: Mike Shaver <shaver@clusterfs.com>
12  *
13  *   This file is part of Lustre, http://www.lustre.org.
14  *
15  *   Lustre is free software; you can redistribute it and/or
16  *   modify it under the terms of version 2 of the GNU General Public
17  *   License as published by the Free Software Foundation.
18  *
19  *   Lustre is distributed in the hope that it will be useful,
20  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
21  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22  *   GNU General Public License for more details.
23  *
24  *   You should have received a copy of the GNU General Public License
25  *   along with Lustre; if not, write to the Free Software
26  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27  */
28
29 #ifndef EXPORT_SYMTAB
30 # define EXPORT_SYMTAB
31 #endif
32 #define DEBUG_SUBSYSTEM S_MDS
33
34 #include <linux/module.h>
35 #include <linux/lustre_mds.h>
36 #include <linux/lustre_dlm.h>
37 #include <linux/init.h>
38 #include <linux/obd_class.h>
39 #include <linux/random.h>
40 #include <linux/fs.h>
41 #include <linux/jbd.h>
42 #include <linux/namei.h>
43 #include <linux/ext3_fs.h>
44 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
45 # include <linux/smp_lock.h>
46 # include <linux/buffer_head.h>
47 # include <linux/workqueue.h>
48 # include <linux/mount.h>
49 #else
50 # include <linux/locks.h>
51 #endif
52 #include <linux/obd_lov.h>
53 #include <linux/obd_ost.h>
54 #include <linux/lustre_mds.h>
55 #include <linux/lustre_fsfilt.h>
56 #include <linux/lprocfs_status.h>
57 #include <linux/lustre_commit_confd.h>
58 #include <linux/lustre_acl.h>
59 #include <linux/lustre_gs.h>
60 #include "mds_internal.h"
61 #include <linux/lustre_sec.h>
62
63 static int mds_intent_policy(struct ldlm_namespace *ns,
64                              struct ldlm_lock **lockp, void *req_cookie,
65                              ldlm_mode_t mode, int flags, void *data);
66 static int mds_postsetup(struct obd_device *obd);
67 static int mds_cleanup(struct obd_device *obd, int flags);
68
69
70 /* Assumes caller has already pushed into the kernel filesystem context */
71 static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
72                         loff_t offset, int count)
73 {
74         struct ptlrpc_bulk_desc *desc;
75         struct l_wait_info lwi;
76         struct page **pages;
77         int rc = 0, npages, i, tmpcount, tmpsize = 0;
78         ENTRY;
79
80         LASSERT((offset & (PAGE_SIZE - 1)) == 0); /* I'm dubious about this */
81
82         npages = (count + PAGE_SIZE - 1) >> PAGE_SHIFT;
83         OBD_ALLOC(pages, sizeof(*pages) * npages);
84         if (!pages)
85                 GOTO(out, rc = -ENOMEM);
86
87         desc = ptlrpc_prep_bulk_exp(req, npages, BULK_PUT_SOURCE,
88                                     MDS_BULK_PORTAL);
89         if (desc == NULL)
90                 GOTO(out_free, rc = -ENOMEM);
91
92         for (i = 0, tmpcount = count; i < npages; i++, tmpcount -= tmpsize) {
93                 tmpsize = tmpcount > PAGE_SIZE ? PAGE_SIZE : tmpcount;
94
95                 pages[i] = alloc_pages(GFP_KERNEL, 0);
96                 if (pages[i] == NULL)
97                         GOTO(cleanup_buf, rc = -ENOMEM);
98
99                 ptlrpc_prep_bulk_page(desc, pages[i], 0, tmpsize);
100         }
101
102         for (i = 0, tmpcount = count; i < npages; i++, tmpcount -= tmpsize) {
103                 tmpsize = tmpcount > PAGE_SIZE ? PAGE_SIZE : tmpcount;
104                 CDEBUG(D_EXT2, "reading %u@%llu from dir %lu (size %llu)\n",
105                        tmpsize, offset, file->f_dentry->d_inode->i_ino,
106                        file->f_dentry->d_inode->i_size);
107
108                 rc = fsfilt_readpage(req->rq_export->exp_obd, file,
109                                      kmap(pages[i]), tmpsize, &offset);
110                 kunmap(pages[i]);
111
112                 if (rc != tmpsize)
113                         GOTO(cleanup_buf, rc = -EIO);
114         }
115
116         LASSERT(desc->bd_nob == count);
117
118         rc = ptlrpc_start_bulk_transfer(desc);
119         if (rc)
120                 GOTO(cleanup_buf, rc);
121
122         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) {
123                 CERROR("obd_fail_loc=%x, fail operation rc=%d\n",
124                        OBD_FAIL_MDS_SENDPAGE, rc = -EIO);
125                 GOTO(abort_bulk, rc);
126         }
127
128         lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL);
129         rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), &lwi);
130         LASSERT (rc == 0 || rc == -ETIMEDOUT);
131
132         if (rc == 0) {
133                 if (desc->bd_success &&
134                     desc->bd_nob_transferred == count)
135                         GOTO(cleanup_buf, rc);
136
137                 rc = -ETIMEDOUT; /* XXX should this be a different errno? */
138         }
139
140         DEBUG_REQ(D_ERROR, req, "bulk failed: %s %d(%d), evicting %s@%s\n",
141                   (rc == -ETIMEDOUT) ? "timeout" : "network error",
142                   desc->bd_nob_transferred, count,
143                   req->rq_export->exp_client_uuid.uuid,
144                   req->rq_export->exp_connection->c_remote_uuid.uuid);
145
146         ptlrpc_fail_export(req->rq_export);
147
148         EXIT;
149  abort_bulk:
150         ptlrpc_abort_bulk (desc);
151  cleanup_buf:
152         for (i = 0; i < npages; i++)
153                 if (pages[i])
154                         __free_pages(pages[i], 0);
155
156         ptlrpc_free_bulk(desc);
157  out_free:
158         OBD_FREE(pages, sizeof(*pages) * npages);
159  out:
160         return rc;
161 }
162
163 extern char *ldlm_lockname[];
164
165 int mds_lock_mode_for_dir(struct obd_device *obd,
166                           struct dentry *dentry, int mode)
167 {
168         int ret_mode = 0, split;
169
170         /* any dir access needs couple locks:
171          * 1) on part of dir we gonna lookup/modify in
172          * 2) on a whole dir to protect it from concurrent splitting
173          *    and to flush client's cache for readdir()
174          * so, for a given mode and dentry this routine decides what
175          * lock mode to use for lock #2:
176          * 1) if caller's gonna lookup in dir then we need to protect
177          *    dir from being splitted only - LCK_CR
178          * 2) if caller's gonna modify dir then we need to protect
179          *    dir from being splitted and to flush cache - LCK_CW
180          * 3) if caller's gonna modify dir and that dir seems ready
181          *    for splitting then we need to protect it from any
182          *    type of access (lookup/modify/split) - LCK_EX -bzzz */
183
184         split = mds_splitting_expected(obd, dentry);
185         
186         /*
187          * it is important to check here only for MDS_NO_SPLITTABLE. The reason
188          * is that MDS_NO_SPLITTABLE means dir is not splittable in principle
189          * and another thread will not split it on the quiet. But if we have
190          * MDS_NO_SPLIT_EXPECTED, this means, that dir may be splitted anytime,
191          * but not now (for current thread) and we should consider that it can
192          * happen soon and go that branch which can yield LCK_EX to protect from
193          * possible splitting.
194          */
195         if (split == MDS_NO_SPLITTABLE) {
196                 /*
197                  * this inode won't be splitted. so we need not to protect from
198                  * just flush client's cache on modification.
199                  */
200                 if (mode == LCK_PW)
201                         ret_mode = LCK_CW;
202                 else
203                         ret_mode = 0;
204         } else {
205                 if (mode == LCK_EX) {
206                         ret_mode = LCK_EX;
207                 } else if (mode == LCK_PR) {
208                         ret_mode = LCK_CR;
209                 } else if (mode == LCK_PW) {
210                         /*
211                          * caller gonna modify directory. We use concurrent
212                          * write lock here to retract client's cache for
213                          * readdir.
214                          */
215                         if (split == MDS_EXPECT_SPLIT) {
216                                 /*
217                                  * splitting possible. serialize any access the
218                                  * idea is that first one seen dir is splittable
219                                  * is given exclusive lock and split
220                                  * directory. caller passes lock mode to
221                                  * mds_try_to_split_dir() and splitting would be
222                                  * done with exclusive lock only -bzzz.
223                                  */
224                                 CDEBUG(D_OTHER, "%s: gonna split %lu/%lu\n",
225                                        obd->obd_name,
226                                        (unsigned long)dentry->d_inode->i_ino,
227                                        (unsigned long)dentry->d_inode->i_generation);
228                                 ret_mode = LCK_EX;
229                         } else {
230                                 ret_mode = LCK_CW;
231                         }
232                 }
233         }
234
235         return ret_mode;        
236 }
237
238 /* only valid locked dentries or errors should be returned */
239 struct dentry *mds_id2locked_dentry(struct obd_device *obd, struct lustre_id *id,
240                                     struct vfsmount **mnt, int lock_mode,
241                                     struct lustre_handle *lockh, int *mode,
242                                     char *name, int namelen, __u64 lockpart)
243 {
244         struct dentry *de = mds_id2dentry(obd, id, mnt), *retval = de;
245         ldlm_policy_data_t policy = { .l_inodebits = { lockpart } };
246         struct ldlm_res_id res_id = { .name = {0} };
247         int flags = LDLM_FL_ATOMIC_CB, rc;
248         ENTRY;
249
250         if (IS_ERR(de))
251                 RETURN(de);
252
253         lockh[1].cookie = 0;
254         res_id.name[0] = id_fid(id);
255         res_id.name[1] = id_group(id);
256         
257 #ifdef S_PDIROPS
258         if (name && IS_PDIROPS(de->d_inode)) {
259                 ldlm_policy_data_t cpolicy =
260                         { .l_inodebits = { MDS_INODELOCK_UPDATE } };
261                 LASSERT(mode != NULL);
262                 *mode = mds_lock_mode_for_dir(obd, de, lock_mode);
263                 if (*mode) {
264                         rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace,
265                                               res_id, LDLM_IBITS,
266                                               &cpolicy, *mode, &flags,
267                                               mds_blocking_ast,
268                                               ldlm_completion_ast, NULL, NULL,
269                                               NULL, 0, NULL, lockh + 1);
270                         if (rc != ELDLM_OK) {
271                                 l_dput(de);
272                                 RETURN(ERR_PTR(-ENOLCK));
273                         }
274                 }
275                 flags = LDLM_FL_ATOMIC_CB;
276
277                 res_id.name[2] = full_name_hash((unsigned char *)name, namelen);
278
279                 CDEBUG(D_INFO, "take lock on "DLID4":"LPX64"\n",
280                        OLID4(id), res_id.name[2]);
281         }
282 #else
283 #warning "No PDIROPS support in the kernel"
284 #endif
285         rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, res_id,
286                               LDLM_IBITS, &policy, lock_mode, &flags,
287                               mds_blocking_ast, ldlm_completion_ast,
288                               NULL, NULL, NULL, 0, NULL, lockh);
289         if (rc != ELDLM_OK) {
290                 l_dput(de);
291                 retval = ERR_PTR(-EIO); /* XXX translate ldlm code */
292 #ifdef S_PDIROPS
293                 if (lockh[1].cookie)
294                         ldlm_lock_decref(lockh + 1, *mode);
295 #endif
296         } else if (de->d_inode && de->d_inode->i_nlink == 0) {
297                 /* as sometimes we lookup inode by ino/generation through
298                    iopen mechanism, it's possible to find already unlinked
299                    inode with nlink == 0. let's interpretate the case as
300                    ENOENT -bzzz */
301                 CWARN("found already unlinked inode %lu/%u\n",
302                       de->d_inode->i_ino, de->d_inode->i_generation);
303                 l_dput(de);
304                 retval = ERR_PTR(-ENOENT);
305                 ldlm_lock_decref(lockh, lock_mode);
306 #ifdef S_PDIROPS
307                 if (lockh[1].cookie)
308                         ldlm_lock_decref(lockh + 1, *mode);
309 #endif
310         }
311
312         RETURN(retval);
313 }
314
315 #ifndef DCACHE_DISCONNECTED
316 #define DCACHE_DISCONNECTED DCACHE_NFSD_DISCONNECTED
317 #endif
318
319 /* Look up an entry by inode number. This function ONLY returns valid dget'd
320  * dentries with an initialized inode or errors */
321 struct dentry *mds_id2dentry(struct obd_device *obd, struct lustre_id *id,
322                              struct vfsmount **mnt)
323 {
324         unsigned long ino = (unsigned long)id_ino(id);
325         __u32 generation = (__u32)id_gen(id);
326         struct mds_obd *mds = &obd->u.mds;
327         struct dentry *result;
328         struct inode *inode;
329         char idname[32];
330
331         if (ino == 0)
332                 RETURN(ERR_PTR(-ESTALE));
333
334         snprintf(idname, sizeof(idname), "0x%lx", ino);
335
336         CDEBUG(D_DENTRY, "--> mds_id2dentry: ino/gen %lu/%u, sb %p\n",
337                ino, generation, mds->mds_sb);
338
339         /* under ext3 this is neither supposed to return bad inodes nor NULL
340            inodes. */
341         result = ll_lookup_one_len(idname, mds->mds_id_de, 
342                                    strlen(idname));
343         if (IS_ERR(result))
344                 RETURN(result);
345
346         inode = result->d_inode;
347         if (!inode)
348                 RETURN(ERR_PTR(-ENOENT));
349
350         if (is_bad_inode(inode)) {
351                 CERROR("bad inode returned %lu/%u\n",
352                        inode->i_ino, inode->i_generation);
353                 dput(result);
354                 RETURN(ERR_PTR(-ENOENT));
355         }
356
357         /* here we disabled generation check, as root inode i_generation
358          * of cache mds and real mds are different. */
359         if (inode->i_ino != id_ino(&mds->mds_rootid) && generation &&
360             inode->i_generation != generation) {
361                 /* we didn't find the right inode.. */
362                 if (id_group(id) != mds->mds_num) {
363                         CERROR("bad inode %lu found, link: %lu, ct: %d, generation "
364                                "%u != %u, mds %u != %u, request to wrong MDS?\n",
365                                inode->i_ino, (unsigned long)inode->i_nlink,
366                                atomic_read(&inode->i_count), inode->i_generation,
367                                generation, mds->mds_num, (unsigned)id_group(id));
368                 } else {
369                         CERROR("bad inode %lu found, link: %lu, ct: %d, generation "
370                                "%u != %u, inode is recreated while request handled?\n",
371                                inode->i_ino, (unsigned long)inode->i_nlink,
372                                atomic_read(&inode->i_count), inode->i_generation,
373                                generation);
374                 }
375                 dput(result);
376                 RETURN(ERR_PTR(-ENOENT));
377         }
378
379         if (mnt) {
380                 *mnt = mds->mds_vfsmnt;
381                 mntget(*mnt);
382         }
383
384         RETURN(result);
385 }
386
387 static
388 int mds_req_add_idmapping(struct ptlrpc_request *req,
389                           struct mds_export_data *med)
390 {
391         struct mds_req_sec_desc *rsd;
392         struct lustre_sec_desc  *lsd;
393         int rc;
394
395         if (!med->med_remote)
396                 return 0;
397
398         /* maybe we should do it more completely: invalidate the gss ctxt? */
399         if (req->rq_mapped_uid == MDS_IDMAP_NOTFOUND) {
400                 CWARN("didn't find mapped uid\n");
401                 return -EPERM;
402         }
403
404         rsd = lustre_swab_mds_secdesc(req, MDS_REQ_SECDESC_OFF);
405         if (!rsd) {
406                 CERROR("Can't unpack security desc\n");
407                 return -EPROTO;
408         }
409
410         lsd = mds_get_lsd(req->rq_mapped_uid);
411         if (!lsd) {
412                 CERROR("can't get LSD(%u), no mapping added\n",
413                        req->rq_mapped_uid);
414                 return -EPERM;
415         }
416
417         rc = mds_idmap_add(med->med_idmap, rsd->rsd_uid, lsd->lsd_uid,
418                            rsd->rsd_gid, lsd->lsd_gid);
419         mds_put_lsd(lsd);
420         return rc;
421 }
422
423 static
424 int mds_req_del_idmapping(struct ptlrpc_request *req,
425                           struct mds_export_data *med)
426 {
427         struct mds_req_sec_desc *rsd;
428         struct lustre_sec_desc  *lsd;
429         int rc;
430
431         if (!med->med_remote)
432                 return 0;
433
434         rsd = lustre_swab_mds_secdesc(req, MDS_REQ_SECDESC_OFF);
435         if (!rsd) {
436                 CERROR("Can't unpack security desc\n");
437                 return -EPROTO;
438         }
439
440         LASSERT(req->rq_mapped_uid != -1);
441         lsd = mds_get_lsd(req->rq_mapped_uid);
442         if (!lsd) {
443                 CERROR("can't get LSD(%u), no idmapping deleted\n",
444                        req->rq_mapped_uid);
445                 return -EPERM;
446         }
447
448         rc = mds_idmap_del(med->med_idmap, rsd->rsd_uid, lsd->lsd_uid,
449                            rsd->rsd_gid, lsd->lsd_gid);
450         mds_put_lsd(lsd);
451         return rc;
452 }
453
454 static int mds_init_export_data(struct ptlrpc_request *req,
455                                 struct mds_export_data *med)
456 {
457         struct obd_connect_data *data, *reply;
458         int ask_remote, ask_local;
459         ENTRY;
460
461         data = lustre_msg_buf(req->rq_reqmsg, 5, sizeof(*data));
462         reply = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*data));
463         LASSERT(data && reply);
464
465         if (med->med_initialized) {
466                 CDEBUG(D_SEC, "med already initialized, reconnect?\n");
467                 goto reply;
468         }
469
470         ask_remote = data->ocd_connect_flags & OBD_CONNECT_REMOTE;
471         ask_local = data->ocd_connect_flags & OBD_CONNECT_LOCAL;
472
473         /* currently the policy is simple: satisfy client as possible
474          * as we can.
475          */
476         if (req->rq_auth_uid == -1) {
477                 if (ask_remote)
478                         CWARN("null sec is used, force to be local\n");
479                 med->med_remote = 0;
480         } else {
481                 if (ask_remote) {
482                         if (!req->rq_remote_realm)
483                                 CWARN("local realm asked to be remote\n");
484                         med->med_remote = 1;
485                 } else if (ask_local) {
486                         if (req->rq_remote_realm)
487                                 CWARN("remote realm asked to be local\n");
488                         med->med_remote = 0;
489                 } else
490                         med->med_remote = (req->rq_remote_realm != 0);
491         }
492
493         med->med_nllu = data->ocd_nllu[0];
494         med->med_nllg = data->ocd_nllu[1];
495
496         med->med_initialized = 1;
497 reply:
498         reply->ocd_connect_flags &= ~(OBD_CONNECT_REMOTE | OBD_CONNECT_LOCAL);
499         if (med->med_remote) {
500                 if (!med->med_idmap)
501                         med->med_idmap = mds_idmap_alloc();
502
503                 if (!med->med_idmap)
504                         CERROR("Failed to alloc idmap, following request from "
505                                "this client will be refused\n");
506
507                 reply->ocd_connect_flags |= OBD_CONNECT_REMOTE;
508                 CDEBUG(D_SEC, "set client as remote\n");
509         } else {
510                 reply->ocd_connect_flags |= OBD_CONNECT_LOCAL;
511                 CDEBUG(D_SEC, "set client as local\n");
512         }
513
514         RETURN(0);
515 }
516
517 static void mds_free_export_data(struct mds_export_data *med)
518 {
519         if (!med->med_idmap)
520                 return;
521
522         LASSERT(med->med_remote);
523         mds_idmap_free(med->med_idmap);
524         med->med_idmap = NULL;
525 }
526
527 /* Establish a connection to the MDS.
528  *
529  * This will set up an export structure for the client to hold state data about
530  * that client, like open files, the last operation number it did on the server,
531  * etc.
532  */
533 static int mds_connect(struct lustre_handle *conn, struct obd_device *obd,
534                        struct obd_uuid *cluuid, struct obd_connect_data *data,
535                        unsigned long flags)
536 {
537         struct mds_export_data *med;
538         struct mds_client_data *mcd;
539         struct obd_export *exp;
540         int rc;
541         ENTRY;
542
543         if (!conn || !obd || !cluuid)
544                 RETURN(-EINVAL);
545
546         /* XXX There is a small race between checking the list and adding a new
547          * connection for the same UUID, but the real threat (list corruption
548          * when multiple different clients connect) is solved.
549          *
550          * There is a second race between adding the export to the list, and
551          * filling in the client data below.  Hence skipping the case of NULL
552          * mcd above.  We should already be controlling multiple connects at the
553          * client, and we can't hold the spinlock over memory allocations
554          * without risk of deadlocking.
555          */
556         rc = class_connect(conn, obd, cluuid);
557         if (rc)
558                 RETURN(rc);
559         exp = class_conn2export(conn);
560         
561         LASSERT(exp != NULL);
562         med = &exp->exp_mds_data;
563
564         OBD_ALLOC(mcd, sizeof(*mcd));
565         if (!mcd) {
566                 CERROR("%s: out of memory for client data.\n",
567                         obd->obd_name);
568                 GOTO(out, rc = -ENOMEM);
569         }
570
571         memcpy(mcd->mcd_uuid, cluuid, sizeof(mcd->mcd_uuid));
572         med->med_mcd = mcd;
573
574         rc = mds_client_add(obd, &obd->u.mds, med, -1);
575         if (rc)
576                 GOTO(out, rc);
577        
578         EXIT;
579 out:
580         if (rc) {
581                 if (mcd)
582                         OBD_FREE(mcd, sizeof(*mcd));
583                 class_disconnect(exp, 0);
584         } else {
585                 class_export_put(exp);
586         }
587         return rc;
588 }
589
590 static int mds_connect_post(struct obd_export *exp, unsigned initial,
591                             unsigned long flags)
592 {
593         struct obd_device *obd = exp->exp_obd;
594         struct mds_obd *mds = &obd->u.mds;
595         struct mds_export_data *med;
596         struct mds_client_data *mcd;
597         int rc = 0;
598         ENTRY;
599
600         med = &exp->exp_mds_data;
601         mcd = med->med_mcd;
602
603         if (initial) {
604                 /* some one reconnect initially, we have to reset
605                  * data existing export can have. bug 6102 */
606                 if (mcd->mcd_last_xid != 0)
607                         CDEBUG(D_HA, "initial reconnect to existing export\n");
608                 mcd->mcd_last_transno = 0;
609                 mcd->mcd_last_xid = 0;
610                 mcd->mcd_last_close_xid = 0;
611                 mcd->mcd_last_result = 0;
612                 mcd->mcd_last_data = 0;
613         }
614
615         if (!(flags & OBD_OPT_MDS_CONNECTION)) {
616                 if (!(exp->exp_flags & OBD_OPT_REAL_CLIENT)) {
617                         atomic_inc(&mds->mds_real_clients);
618                         CDEBUG(D_OTHER,"%s: peer from %s is real client (%d)\n",
619                                obd->obd_name, exp->exp_client_uuid.uuid,
620                                atomic_read(&mds->mds_real_clients));
621                         exp->exp_flags |= OBD_OPT_REAL_CLIENT;
622                 }
623                 if (mds->mds_md_name)
624                         rc = mds_md_connect(obd, mds->mds_md_name);
625         }
626         RETURN(rc);
627 }
628
629 static int mds_init_export(struct obd_export *exp)
630 {
631         struct mds_export_data *med = &exp->exp_mds_data;
632
633         INIT_LIST_HEAD(&med->med_open_head);
634         spin_lock_init(&med->med_open_lock);
635         return 0;
636 }
637
638 static int mds_destroy_export(struct obd_export *export)
639 {
640         struct obd_device *obd = export->exp_obd;
641         struct mds_export_data *med = &export->exp_mds_data;
642         struct lvfs_run_ctxt saved;
643         int rc = 0;
644         ENTRY;
645
646         mds_free_export_data(med);
647         target_destroy_export(export);
648
649         if (obd_uuid_equals(&export->exp_client_uuid, &obd->obd_uuid))
650                 GOTO(out, 0);
651
652         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
653
654         /* Close any open files (which may also cause orphan unlinking). */
655         spin_lock(&med->med_open_lock);
656         while (!list_empty(&med->med_open_head)) {
657                 struct list_head *tmp = med->med_open_head.next;
658                 struct mds_file_data *mfd =
659                         list_entry(tmp, struct mds_file_data, mfd_list);
660                 struct lustre_id sid;
661                 
662                 BDEVNAME_DECLARE_STORAGE(btmp);
663
664                 /* bug 1579: fix force-closing for 2.5 */
665                 struct dentry *dentry = mfd->mfd_dentry;
666
667                 list_del(&mfd->mfd_list);
668                 spin_unlock(&med->med_open_lock);
669
670                 down(&dentry->d_inode->i_sem);
671                 rc = mds_read_inode_sid(obd, dentry->d_inode, &sid);
672                 up(&dentry->d_inode->i_sem);
673                 if (rc) {
674                         CERROR("Can't read inode self id, inode %lu, "
675                                "rc %d\n", dentry->d_inode->i_ino, rc);
676                         memset(&sid, 0, sizeof(sid));
677                 }
678
679                 /* If you change this message, be sure to update
680                  * replay_single:test_46 */
681                 CERROR("force closing client file handle for %.*s (%s:"
682                        DLID4")\n", dentry->d_name.len, dentry->d_name.name,
683                        ll_bdevname(dentry->d_inode->i_sb, btmp),
684                        OLID4(&sid));
685                 
686                 /* child inode->i_alloc_sem protects orphan_dec_test and
687                  * is_orphan race, mds_mfd_close drops it */
688                 DOWN_WRITE_I_ALLOC_SEM(dentry->d_inode);
689                 rc = mds_mfd_close(NULL, 0, obd, mfd,
690                                    !(export->exp_flags & OBD_OPT_FAILOVER));
691                 if (rc)
692                         CDEBUG(D_INODE, "Error closing file: %d\n", rc);
693                 spin_lock(&med->med_open_lock);
694         }
695         spin_unlock(&med->med_open_lock);
696         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
697
698         EXIT;
699 out:
700         mds_client_free(export, !(export->exp_flags & OBD_OPT_FAILOVER));
701         return rc;
702 }
703
704 static int mds_disconnect(struct obd_export *exp, unsigned long flags)
705 {
706         unsigned long irqflags;
707         struct obd_device *obd;
708         struct mds_obd *mds;
709         int rc;
710         ENTRY;
711
712         LASSERT(exp != NULL);
713         obd = class_exp2obd(exp);
714         if (obd == NULL) {
715                 CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n",
716                        exp->exp_handle.h_cookie);
717                 RETURN(-EINVAL);
718         }
719         mds = &obd->u.mds;
720
721         /*
722          * suppress any inter-mds requests durring disconnecting lmv if this is
723          * detected --force mode. This is needed to avoid endless recovery.
724          */
725         if (atomic_read(&mds->mds_real_clients) > 0 &&
726             !(exp->exp_flags & OBD_OPT_REAL_CLIENT))
727                 flags |= OBD_OPT_FORCE;
728                                                                                               
729         if (!(exp->exp_flags & OBD_OPT_REAL_CLIENT)
730             && !atomic_read(&mds->mds_real_clients)) {
731                 /* there was no client at all */
732                 mds_md_disconnect(obd, flags);
733         }
734
735         if ((exp->exp_flags & OBD_OPT_REAL_CLIENT)
736             && atomic_dec_and_test(&mds->mds_real_clients)) {
737                 /* time to drop LMV connections */
738                 CDEBUG(D_OTHER, "%s: last real client %s disconnected.  "
739                        "Disconnnect from LMV now\n",
740                        obd->obd_name, exp->exp_client_uuid.uuid);
741                 mds_md_disconnect(obd, flags);
742         }
743
744         spin_lock_irqsave(&exp->exp_lock, irqflags);
745         exp->exp_flags = flags;
746         spin_unlock_irqrestore(&exp->exp_lock, irqflags);
747
748         /* disconnect early so that clients can't keep using export */
749         rc = class_disconnect(exp, flags);
750         ldlm_cancel_locks_for_export(exp);
751
752         /* complete all outstanding replies */
753         spin_lock_irqsave(&exp->exp_lock, irqflags);
754         while (!list_empty(&exp->exp_outstanding_replies)) {
755                 struct ptlrpc_reply_state *rs =
756                         list_entry(exp->exp_outstanding_replies.next,
757                                    struct ptlrpc_reply_state, rs_exp_list);
758                 struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service;
759
760                 spin_lock(&svc->srv_lock);
761                 list_del_init(&rs->rs_exp_list);
762                 ptlrpc_schedule_difficult_reply(rs);
763                 spin_unlock(&svc->srv_lock);
764         }
765         spin_unlock_irqrestore(&exp->exp_lock, irqflags);
766         RETURN(rc);
767 }
768
769 static int mds_getstatus(struct ptlrpc_request *req)
770 {
771         struct mds_obd *mds = mds_req2mds(req);
772         struct mds_body *body;
773         int rc, size;
774         ENTRY;
775
776         size = sizeof(*body);
777         
778         rc = lustre_pack_reply(req, 1, &size, NULL);
779         if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_GETSTATUS_PACK)) {
780                 CERROR("mds: out of memory for message: size=%d\n", size);
781                 req->rq_status = -ENOMEM;       /* superfluous? */
782                 RETURN(-ENOMEM);
783         }
784
785         body = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*body));
786         body->valid |= OBD_MD_FID;
787         
788         memcpy(&body->id1, &mds->mds_rootid, sizeof(body->id1));
789
790         /*
791          * the last_committed and last_xid fields are filled in for all replies
792          * already - no need to do so here also.
793          */
794         RETURN(0);
795 }
796
797 int mds_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
798                      void *data, int flag)
799 {
800         int do_ast;
801         ENTRY;
802
803         if (flag == LDLM_CB_CANCELING) {
804                 /* Don't need to do anything here. */
805                 RETURN(0);
806         }
807
808         /* XXX layering violation!  -phil */
809         lock_res_and_lock(lock);
810         
811         /*
812          * get this: if mds_blocking_ast is racing with mds_intent_policy, such
813          * that mds_blocking_ast is called just before l_i_p takes the ns_lock,
814          * then by the time we get the lock, we might not be the correct
815          * blocking function anymore.  So check, and return early, if so.
816          */
817         if (lock->l_blocking_ast != mds_blocking_ast) {
818                 unlock_res_and_lock(lock);
819                 RETURN(0);
820         }
821
822         lock->l_flags |= LDLM_FL_CBPENDING;
823         do_ast = (!lock->l_readers && !lock->l_writers);
824         unlock_res_and_lock(lock);
825
826         if (do_ast) {
827                 struct lustre_handle lockh;
828                 int rc;
829
830                 LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel");
831                 ldlm_lock2handle(lock, &lockh);
832                 rc = ldlm_cli_cancel(&lockh);
833                 if (rc < 0)
834                         CERROR("ldlm_cli_cancel: %d\n", rc);
835         } else {
836                 LDLM_DEBUG(lock, "Lock still has references, will be "
837                            "cancelled later");
838         }
839         RETURN(0);
840 }
841
842 static int mds_convert_md(struct obd_device *obd, struct inode *inode,
843                           void *md, int size, int mea)
844 {
845         int rc = size;
846         
847         if (S_ISREG(inode->i_mode)) {
848                 rc = mds_convert_lov_ea(obd, inode, md, size);
849         } else if (S_ISDIR(inode->i_mode)) {
850                 if (mea) {
851                         rc = mds_convert_mea_ea(obd, inode, md, size);
852                 } else {
853                         rc = mds_convert_lov_ea(obd, inode, md, size);
854                 }
855                 if (rc == -EINVAL) {
856                         CERROR("Invalid EA format (nor LOV or MEA) "
857                                "is detected. Inode %lu/%u\n",
858                                inode->i_ino, inode->i_generation);
859                 }
860         }
861         return rc;
862 }
863
864 int mds_get_md(struct obd_device *obd, struct inode *inode,
865                void *md, int *size, int lock, int mea)
866 {
867         int lmm_size;
868         int rc = 0;
869         ENTRY;
870
871         if (lock)
872                 down(&inode->i_sem);
873
874         rc = fsfilt_get_md(obd, inode, md, *size,
875                            (mea ? EA_MEA : EA_LOV));
876         if (rc < 0) {
877                 CERROR("Error %d reading eadata for ino %lu\n",
878                        rc, inode->i_ino);
879         } else if (rc > 0) {
880                 lmm_size = rc;
881                 rc = mds_convert_md(obd, inode, md,
882                                     lmm_size, mea);
883                 if (rc == 0) {
884                         *size = lmm_size;
885                         rc = lmm_size;
886                 } else if (rc > 0) {
887                         *size = rc;
888                 }
889         }
890         if (lock)
891                 up(&inode->i_sem);
892
893         RETURN(rc);
894 }
895
896 /* Call with lock=1 if you want mds_pack_md to take the i_sem.
897  * Call with lock=0 if the caller has already taken the i_sem. */
898 int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, int offset,
899                 struct mds_body *body, struct inode *inode, int lock, int mea)
900 {
901         struct mds_obd *mds = &obd->u.mds;
902         int rc, lmm_size;
903         void *lmm;
904         ENTRY;
905
906         lmm = lustre_msg_buf(msg, offset, 0);
907         if (lmm == NULL) {
908                 /* Some problem with getting eadata when I sized the reply
909                  * buffer... */
910                 CDEBUG(D_INFO, "no space reserved for inode %lu MD\n",
911                        inode->i_ino);
912                 RETURN(0);
913         }
914         lmm_size = msg->buflens[offset];
915
916         /* I don't really like this, but it is a sanity check on the client
917          * MD request.  However, if the client doesn't know how much space
918          * to reserve for the MD, it shouldn't be bad to have too much space.
919          */
920         if (lmm_size > mds->mds_max_mdsize) {
921                 CWARN("Reading MD for inode %lu of %d bytes > max %d\n",
922                        inode->i_ino, lmm_size, mds->mds_max_mdsize);
923                 // RETURN(-EINVAL);
924         }
925
926         rc = mds_get_md(obd, inode, lmm, &lmm_size, lock, mea);
927         if (rc > 0) {
928                 body->valid |= S_ISDIR(inode->i_mode) ?
929                         OBD_MD_FLDIREA : OBD_MD_FLEASIZE;
930                 
931                 if (mea)
932                         body->valid |= OBD_MD_MEA;
933                 
934                 body->eadatasize = lmm_size;
935                 rc = 0;
936         }
937
938         RETURN(rc);
939 }
940
941 int mds_pack_link(struct dentry *dentry, struct ptlrpc_request *req,
942                   struct mds_body *repbody, int reply_off)
943 {
944         struct inode *inode = dentry->d_inode;
945         char *symname;
946         int len, rc;
947         ENTRY;
948
949         symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1,0);
950         LASSERT(symname != NULL);
951         len = req->rq_repmsg->buflens[reply_off + 1];
952         
953         rc = inode->i_op->readlink(dentry, symname, len);
954         if (rc < 0) {
955                 CERROR("readlink failed: %d\n", rc);
956         } else if (rc != len - 1) {
957                 CERROR ("Unexpected readlink rc %d: expecting %d\n",
958                         rc, len - 1);
959                 rc = -EINVAL;
960         } else {
961                 CDEBUG(D_INODE, "read symlink dest %s\n", symname);
962                 repbody->valid |= OBD_MD_LINKNAME;
963                 repbody->eadatasize = rc + 1;
964                 symname[rc] = 0;        /* NULL terminate */
965                 rc = 0;
966         }
967
968         RETURN(rc);
969 }
970
971 int mds_pack_xattr(struct dentry *dentry, struct ptlrpc_request *req,
972                    struct mds_body *repbody, int req_off, int reply_off)
973 {
974         struct inode *inode = dentry->d_inode;
975         char *ea_name;
976         void *value = NULL;
977         int len, rc;
978         ENTRY;
979
980         ea_name = lustre_msg_string(req->rq_reqmsg, req_off + 1, 0);
981         len = req->rq_repmsg->buflens[reply_off + 1];
982         if (len != 0)
983                 value = lustre_msg_buf(req->rq_repmsg, reply_off + 1, len);
984
985         rc = -EOPNOTSUPP;
986
987         if (!strcmp(ea_name, XATTR_NAME_LUSTRE_ACL)) {
988                 struct rmtacl_upcall_desc desc;
989
990                 if (len != LUSTRE_ACL_SIZE_MAX || !value) {
991                         CERROR("no reply buffer prepared\n");
992                         RETURN(-EFAULT);
993                 }
994
995                 memset(&desc, 0, sizeof(desc));
996                 desc.get = 1;
997                 desc.cmd = lustre_msg_string(req->rq_reqmsg, req_off + 2, 0);
998                 desc.cmdlen =  req->rq_reqmsg->buflens[req_off + 2];
999                 desc.res = (char *) value;
1000                 desc.reslen = LUSTRE_ACL_SIZE_MAX;
1001
1002                 mds_do_remote_acl_upcall(&desc);
1003
1004                 if (desc.upcall_status)
1005                         RETURN(desc.upcall_status);
1006
1007                 if (desc.reslen > LUSTRE_ACL_SIZE_MAX) {
1008                         CERROR("downcall claim reslen %u\n", desc.reslen);
1009                         RETURN(-EINVAL);
1010                 }
1011                 /* like remote setfacl, steal "flags" in mds_body as the
1012                  * exececution status
1013                  */
1014                 repbody->flags = desc.status;
1015                 repbody->valid |= OBD_MD_FLXATTR;
1016                 repbody->eadatasize = desc.reslen;
1017
1018                 RETURN(0);
1019         }
1020
1021         if (inode->i_op && inode->i_op->getxattr)
1022                 rc = inode->i_op->getxattr(dentry, ea_name, value, len);
1023
1024         if (rc < 0) {
1025                 if (rc != -ENODATA && rc != -EOPNOTSUPP)
1026                         CERROR("getxattr failed: %d", rc);
1027         } else {
1028                 repbody->valid |= OBD_MD_FLXATTR;
1029                 repbody->eadatasize = rc;
1030                 rc = 0;
1031         }
1032
1033         RETURN(rc);
1034 }
1035
1036 int mds_pack_xattr_list(struct dentry *dentry, struct ptlrpc_request *req,
1037                         struct mds_body *repbody, int reply_off)
1038 {
1039         struct inode *inode = dentry->d_inode;        
1040         void *value = NULL;
1041         int len, rc;
1042         ENTRY;
1043
1044         len = req->rq_repmsg->buflens[reply_off + 1];
1045         if (len != 0)
1046                 value = lustre_msg_buf(req->rq_repmsg, reply_off + 1, len);
1047
1048         rc = -EOPNOTSUPP;
1049         if (inode->i_op && inode->i_op->getxattr) 
1050                 rc = inode->i_op->listxattr(dentry, value, len);
1051
1052         if (rc < 0) {
1053                 CERROR("listxattr failed: %d", rc);
1054         } else {
1055                 repbody->valid |= OBD_MD_FLXATTRLIST;
1056                 repbody->eadatasize = rc;
1057                 rc = 0;
1058         }
1059         RETURN(rc);
1060 }
1061
1062 static
1063 int mds_pack_posix_acl(struct lustre_msg *repmsg, int *offset,
1064                        struct mds_body *body, struct inode *inode)
1065 {
1066         struct dentry de = { .d_inode = inode };
1067         __u32 buflen, *sizep;
1068         void *buf;
1069         int size, pack_off = *offset;
1070         ENTRY;
1071
1072         sizep = lustre_msg_buf(repmsg, pack_off++, 4);
1073         if (!sizep) {
1074                 CERROR("can't locate returned acl size buf\n");
1075                 RETURN(-EPROTO);
1076         }
1077         
1078         if (!inode->i_op->getxattr)
1079                 RETURN(0);
1080
1081         buflen = repmsg->buflens[pack_off];
1082         buf = lustre_msg_buf(repmsg, pack_off++, buflen);
1083
1084         size = inode->i_op->getxattr(&de, XATTR_NAME_ACL_ACCESS, buf, buflen);
1085         if (size == -ENODATA || size == -EOPNOTSUPP)
1086                 RETURN(0);
1087         if (size < 0)
1088                 RETURN(size);
1089         LASSERT(size);
1090         body->valid |= OBD_MD_FLACL;
1091
1092         *sizep = cpu_to_le32(size);
1093         
1094         *offset = pack_off;
1095         RETURN(0);
1096 }
1097
1098 int mds_pack_remote_perm(struct ptlrpc_request *req, int *reply_off,
1099                          struct mds_body *body, struct inode *inode)
1100 {
1101         struct lustre_sec_desc *lsd;
1102         struct mds_remote_perm *perm;
1103         int pack_off = *reply_off;
1104         __u32 lsd_perms;
1105
1106         LASSERT(inode->i_op);
1107         LASSERT(inode->i_op->permission);
1108         LASSERT(req->rq_export->exp_mds_data.med_remote);
1109
1110         perm = (struct mds_remote_perm *)
1111                        lustre_msg_buf(req->rq_repmsg, pack_off++, sizeof(perm));
1112         if (!perm)
1113                 return -EINVAL;
1114
1115         memset(perm, 0, sizeof(*perm));
1116
1117         /* obtain authenticated uid/gid and LSD permissions, which
1118          * might be different from current process context, from LSD
1119          */
1120         lsd = mds_get_lsd(current->uid);
1121         if (!lsd) {
1122                 CWARN("can't LSD of uid %u\n", current->uid);
1123                 RETURN(-EPERM);
1124         }
1125
1126         perm->mrp_auth_uid = lsd->lsd_uid;
1127         perm->mrp_auth_gid = lsd->lsd_gid;
1128
1129         lsd_perms = mds_lsd_get_perms(lsd, 1, 0, req->rq_peer.peer_id.nid);
1130         if (lsd_perms & LSD_PERM_SETUID)
1131                 perm->mrp_allow_setuid = 1;
1132         if (lsd_perms & LSD_PERM_SETGID)
1133                 perm->mrp_allow_setgid = 1;
1134
1135         mds_put_lsd(lsd);
1136
1137         /* permission bits of current user
1138          * XXX this is low efficient, could we do it in one blow?
1139          */
1140         if (inode->i_op->permission(inode, MAY_EXEC, NULL) == 0)
1141                 perm->mrp_perm |= MAY_EXEC;
1142         if (inode->i_op->permission(inode, MAY_WRITE, NULL) == 0)
1143                 perm->mrp_perm |= MAY_WRITE;
1144         if (inode->i_op->permission(inode, MAY_READ, NULL) == 0)
1145                 perm->mrp_perm |= MAY_READ;
1146
1147         body->valid |= (OBD_MD_FLACL | OBD_MD_FLRMTACL);
1148         
1149         *reply_off = pack_off;
1150
1151         RETURN(0);
1152 }
1153
1154 int mds_pack_acl(struct ptlrpc_request *req, int *reply_off,
1155                  struct mds_body *body, struct inode *inode)
1156 {
1157         int rc;
1158
1159         if (!req->rq_export->exp_mds_data.med_remote)
1160                 rc = mds_pack_posix_acl(req->rq_repmsg, reply_off, body, inode);
1161         else
1162                 rc = mds_pack_remote_perm(req, reply_off, body, inode);
1163
1164         return rc;
1165 }
1166
1167 static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry,
1168                                 struct ptlrpc_request *req, int req_off,
1169                                 struct mds_body *reqbody, int reply_off,
1170                                 struct mds_req_sec_desc *rsd)
1171 {
1172         struct mds_export_data *med = &req->rq_export->u.eu_mds_data;
1173         struct inode *inode = dentry->d_inode;
1174         struct mds_body *body;
1175         int rc = 0, offset = 0;
1176         ENTRY;
1177
1178         if (inode == NULL && !(dentry->d_flags & DCACHE_CROSS_REF))
1179                 RETURN(-ENOENT);
1180
1181         body = lustre_msg_buf(req->rq_repmsg, reply_off, sizeof(*body));
1182         LASSERT(body != NULL);                 /* caller prepped reply */
1183
1184         if (dentry->d_flags & DCACHE_CROSS_REF) {
1185                 mds_pack_dentry2body(obd, body, dentry,
1186                                      (reqbody->valid & OBD_MD_FID) ? 1 : 0);
1187                 CDEBUG(D_OTHER, "cross reference: "DLID4"\n",
1188                        OLID4(&body->id1));
1189                 RETURN(0);
1190         }
1191         
1192         mds_pack_inode2body(obd, body, inode,
1193                             (reqbody->valid & OBD_MD_FID) ? 1 : 0);
1194
1195         if ((S_ISREG(inode->i_mode) && (reqbody->valid & OBD_MD_FLEASIZE)) ||
1196             (S_ISDIR(inode->i_mode) && (reqbody->valid & OBD_MD_FLDIREA))) {
1197             
1198                 /* guessing what kind og attribute do we need. */
1199                 int is_mea = (S_ISDIR(inode->i_mode) && 
1200                     (reqbody->valid & OBD_MD_MEA) != 0);
1201                 
1202                 rc = mds_pack_md(obd, req->rq_repmsg, reply_off + 1, 
1203                                  body, inode, 1, is_mea);
1204
1205                 /* if we have LOV EA data, the OST holds size, atime, mtime. */
1206                 if (!(body->valid & OBD_MD_FLEASIZE) &&
1207                     !(body->valid & OBD_MD_FLDIREA))
1208                         body->valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
1209                                         OBD_MD_FLATIME | OBD_MD_FLMTIME);
1210         } else if (S_ISLNK(inode->i_mode) &&
1211                    (reqbody->valid & OBD_MD_LINKNAME) != 0) {
1212                 rc = mds_pack_link(dentry, req, body, reply_off);
1213         } else if (reqbody->valid & OBD_MD_FLXATTR) {
1214                 rc = mds_pack_xattr(dentry, req, body, req_off, reply_off);
1215         } else if (reqbody->valid & OBD_MD_FLXATTRLIST) {
1216                 rc = mds_pack_xattr_list(dentry, req, body, reply_off);
1217         }
1218         
1219         offset = reply_off + ((reqbody->valid & OBD_MD_FLEASIZE) ? 2 : 1);
1220         if (reqbody->valid & OBD_MD_FLACL) {
1221                 rc = mds_pack_acl(req, &offset, body, inode);
1222         }                
1223
1224         if (reqbody->valid & OBD_MD_FLKEY) {
1225                 rc = mds_pack_gskey(obd, req->rq_repmsg, &offset, 
1226                                     body, inode);
1227         }                
1228
1229         if (rc == 0)
1230                 mds_body_do_reverse_map(med, body);
1231
1232         RETURN(rc);
1233 }
1234
1235 static int mds_getattr_pack_msg_cf(struct ptlrpc_request *req,
1236                                    struct dentry *dentry,
1237                                    int offset)
1238 {
1239         int rc = 0, size[1] = {sizeof(struct mds_body)};
1240         ENTRY;
1241
1242         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK)) {
1243                 CERROR("failed MDS_GETATTR_PACK test\n");
1244                 req->rq_status = -ENOMEM;
1245                 RETURN(-ENOMEM);
1246         }
1247
1248         rc = lustre_pack_reply(req, 1, size, NULL);
1249         if (rc) {
1250                 CERROR("lustre_pack_reply failed: rc %d\n", rc);
1251                 GOTO(out, req->rq_status = rc);
1252         }
1253
1254         EXIT;
1255 out:
1256         return rc;
1257 }
1258
1259 static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct dentry *de,
1260                                 int offset)
1261 {
1262         struct inode *inode = de->d_inode;
1263         struct mds_obd *mds = mds_req2mds(req);
1264         struct mds_body *body;
1265         int rc = 0, size[4] = {sizeof(*body)}, bufcount = 1;
1266         ENTRY;
1267
1268         body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*body));
1269         LASSERT(body != NULL);                 /* checked by caller */
1270         LASSERT_REQSWABBED(req, offset);       /* swabbed by caller */
1271
1272         if ((S_ISREG(inode->i_mode) && (body->valid & OBD_MD_FLEASIZE)) ||
1273             (S_ISDIR(inode->i_mode) && (body->valid & OBD_MD_FLDIREA))) {
1274                 int rc;
1275                 
1276                 down(&inode->i_sem);
1277                 rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0,
1278                                    ((body->valid & OBD_MD_MEA) ? EA_MEA : EA_LOV));
1279                 up(&inode->i_sem);
1280                 if (rc < 0) {
1281                         if (rc != -ENODATA && rc != -EOPNOTSUPP)
1282                                 CERROR("error getting inode %lu MD: rc = %d\n",
1283                                        inode->i_ino, rc);
1284                         size[bufcount] = 0;
1285                 } else if (rc > mds->mds_max_mdsize) {
1286                         size[bufcount] = 0;
1287                         CERROR("MD size %d larger than maximum possible %u\n",
1288                                rc, mds->mds_max_mdsize);
1289                 } else {
1290                         size[bufcount] = rc;
1291                 }
1292                 bufcount++;
1293         } else if (S_ISLNK(inode->i_mode) && (body->valid & OBD_MD_LINKNAME)) {
1294                 if (inode->i_size + 1 != body->eadatasize)
1295                         CERROR("symlink size: %Lu, reply space: %d\n",
1296                                inode->i_size + 1, body->eadatasize);
1297                 size[bufcount] = min_t(int, inode->i_size+1, body->eadatasize);
1298                 bufcount++;
1299                 CDEBUG(D_INODE, "symlink size: %Lu, reply space: %d\n",
1300                        inode->i_size + 1, body->eadatasize);
1301         } else if ((body->valid & OBD_MD_FLXATTR)) {
1302                 char *ea_name = lustre_msg_string(req->rq_reqmsg, 
1303                                                   offset + 1, 0);
1304                 rc = -EOPNOTSUPP;
1305
1306                 if (!strcmp(ea_name, XATTR_NAME_LUSTRE_ACL)) {
1307                         size[bufcount] = LUSTRE_ACL_SIZE_MAX;
1308                 } else {
1309                         if (inode->i_op && inode->i_op->getxattr)
1310                                 rc = inode->i_op->getxattr(de, ea_name,
1311                                                            NULL, 0);
1312
1313                         if (rc < 0) {
1314                                 if (rc != -ENODATA && rc != -EOPNOTSUPP)
1315                                         CERROR("error get inode %lu EA: %d\n",
1316                                                inode->i_ino, rc);
1317                                 size[bufcount] = 0;
1318                         } else {
1319                                 size[bufcount] = min_t(int,
1320                                                        body->eadatasize, rc);
1321                         }
1322                 }
1323                 bufcount++;
1324         } else if (body->valid & OBD_MD_FLXATTRLIST) {
1325                 rc = -EOPNOTSUPP;
1326                 if (inode->i_op && inode->i_op->getxattr) 
1327                         rc = inode->i_op->listxattr(de, NULL, 0);
1328
1329                 if (rc < 0) {
1330                         if (rc != -ENODATA && rc != -EOPNOTSUPP)
1331                                 CERROR("error getting inode %lu EA: rc = %d\n",
1332                                        inode->i_ino, rc);
1333                         size[bufcount] = 0;
1334                 } else {
1335                         size[bufcount] = min_t(int, body->eadatasize, rc);
1336                 }
1337                 bufcount++;
1338         }
1339         
1340         /* may co-exist with OBD_MD_FLEASIZE */
1341         if (body->valid & OBD_MD_FLACL) {
1342                 if (req->rq_export->exp_mds_data.med_remote) {
1343                         size[bufcount++] = sizeof(struct mds_remote_perm);
1344                 } else {
1345                         size[bufcount++] = sizeof(int);
1346                         size[bufcount++] = xattr_acl_size(LL_ACL_MAX_ENTRIES);
1347                 }
1348         }
1349
1350         if (body->valid & OBD_MD_FLKEY) {
1351                 size[bufcount++] = sizeof(int);
1352                 size[bufcount++] = sizeof(struct crypto_key);
1353         }
1354
1355         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK)) {
1356                 CERROR("failed MDS_GETATTR_PACK test\n");
1357                 req->rq_status = -ENOMEM;
1358                 GOTO(out, rc = -ENOMEM);
1359         }
1360
1361         rc = lustre_pack_reply(req, bufcount, size, NULL);
1362         if (rc) {
1363                 CERROR("out of memory\n");
1364                 GOTO(out, req->rq_status = rc);
1365         }
1366
1367         EXIT;
1368  out:
1369         return rc;
1370 }
1371
1372 int mds_check_mds_num(struct obd_device *obd, struct inode *inode,
1373                       char *name, int namelen)
1374 {
1375         struct mea *mea = NULL;
1376         int mea_size, rc = 0;
1377         ENTRY;
1378         
1379         rc = mds_md_get_attr(obd, inode, &mea, &mea_size);
1380         if (rc)
1381                 RETURN(rc);
1382         if (mea != NULL) {
1383                 /*
1384                  * dir is already splitted, check if requested filename should
1385                  * live at this MDS or at another one.
1386                  */
1387                 int i = mea_name2idx(mea, name, namelen - 1);
1388                 if (mea->mea_master != id_group(&mea->mea_ids[i])) {
1389                         CDEBUG(D_OTHER,
1390                                "inapropriate MDS(%d) for %s. should be "
1391                                "%lu(%d)\n", mea->mea_master, name, 
1392                                (unsigned long)id_group(&mea->mea_ids[i]), i);
1393                         rc = -ERESTART;
1394                 }
1395         }
1396
1397         if (mea)
1398                 OBD_FREE(mea, mea_size);
1399         RETURN(rc);
1400 }
1401
1402 int mds_getattr_size(struct obd_device *obd, struct dentry *dentry,
1403                      struct ptlrpc_request *req, struct mds_body *body)
1404 {
1405         struct inode *inode = dentry->d_inode;
1406         ENTRY;
1407
1408         LASSERT(body != NULL);
1409
1410         if (dentry->d_inode == NULL || !S_ISREG(inode->i_mode))
1411                 RETURN(0);
1412         
1413         if (obd->obd_recovering) {
1414                 CDEBUG(D_INODE, "size for "DLID4" is unknown yet (recovering)\n",
1415                        OLID4(&body->id1));
1416                 RETURN(0);
1417         }
1418
1419         if (atomic_read(&inode->i_writecount)) {
1420                 /* some one has opened the file for write.
1421                  * mds doesn't know actual size */
1422                 CDEBUG(D_INODE, "MDS doesn't know actual size for "DLID4"\n",
1423                        OLID4(&body->id1));
1424                 RETURN(0);
1425         }
1426         CDEBUG(D_INODE, "MDS returns "LPD64"/"LPD64" for"DLID4"\n",
1427                body->size, body->blocks, OLID4(&body->id1));
1428         body->valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
1429         RETURN(0);
1430 }
1431
1432 static int mds_getattr_lock(struct ptlrpc_request *req, int offset,
1433                             struct lustre_handle *child_lockh, int child_part)
1434 {
1435         struct obd_device *obd = req->rq_export->exp_obd;
1436         struct mds_obd *mds = &obd->u.mds;
1437         struct ldlm_reply *rep = NULL;
1438         struct lvfs_run_ctxt saved;
1439         struct mds_req_sec_desc *rsd;
1440         struct mds_body *body;
1441         struct dentry *dparent = NULL, *dchild = NULL;
1442         struct lvfs_ucred uc = {NULL, NULL,};
1443         struct lustre_handle parent_lockh[2] = {{0}, {0}};
1444         unsigned int namesize;
1445         int rc = 0, cleanup_phase = 0, resent_req = 0, update_mode, reply_offset;
1446         char *name = NULL;
1447         ENTRY;
1448
1449         LASSERT(!strcmp(obd->obd_type->typ_name, OBD_MDS_DEVICENAME));
1450         MD_COUNTER_INCREMENT(obd, getattr_lock);
1451
1452         rsd = lustre_swab_mds_secdesc(req, MDS_REQ_SECDESC_OFF);
1453         if (!rsd) {
1454                 CERROR("Can't unpack security desc\n");
1455                 RETURN(-EFAULT);
1456         }
1457
1458         /* swab now, before anyone looks inside the request. */
1459         body = lustre_swab_reqbuf(req, offset, sizeof(*body),
1460                                   lustre_swab_mds_body);
1461         if (body == NULL) {
1462                 CERROR("Can't swab mds_body\n");
1463                 GOTO(cleanup, rc = -EFAULT);
1464         }
1465
1466         LASSERT_REQSWAB(req, offset + 1);
1467         name = lustre_msg_string(req->rq_reqmsg, offset + 1, 0);
1468         if (name == NULL) {
1469                 CERROR("Can't unpack name\n");
1470                 GOTO(cleanup, rc = -EFAULT);
1471         }
1472         namesize = req->rq_reqmsg->buflens[offset + 1];
1473
1474         /* namesize less than 2 means we have empty name, probably came from
1475            revalidate by cfid, so no point in having name to be set */
1476         if (namesize <= 1)
1477                 name = NULL;
1478
1479         LASSERT (offset == 1 || offset == 3);
1480         if (offset == 3) {
1481                 rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*rep));
1482                 reply_offset = 1;
1483         } else {
1484                 reply_offset = 0;
1485         }
1486
1487         rc = mds_init_ucred(&uc, req, rsd);
1488         if (rc) {
1489                 GOTO(cleanup, rc);
1490         }
1491
1492         push_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1493         cleanup_phase = 1; /* kernel context */
1494         intent_set_disposition(rep, DISP_LOOKUP_EXECD);
1495
1496         LASSERT(namesize > 0);
1497         if (child_lockh->cookie != 0) {
1498                 LASSERT(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT);
1499                 resent_req = 1;
1500         }
1501 #if HAVE_LOOKUP_RAW
1502         if (body->valid == OBD_MD_FLID) {
1503                 struct mds_body *mds_reply;
1504                 int size = sizeof(*mds_reply);
1505                 struct inode *dir;
1506                 ino_t inum;
1507
1508                 dparent = mds_id2dentry(obd, &body->id1, NULL);
1509                 if (IS_ERR(dparent)) {
1510                         rc = PTR_ERR(dparent);
1511                         GOTO(cleanup, rc);
1512                 }
1513                 /*
1514                  * the user requested ONLY the inode number, so do a raw lookup.
1515                  */
1516                 rc = lustre_pack_reply(req, 1, &size, NULL);
1517                 if (rc) {
1518                         CERROR("out of memory\n");
1519                         l_dput(dparent);
1520                         GOTO(cleanup, rc);
1521                 }
1522                 dir  = dparent->d_inode;
1523                 LASSERT(dir->i_op->lookup_raw != NULL);
1524                 rc = dir->i_op->lookup_raw(dir, name, namesize - 1, &inum);
1525                 l_dput(dparent);
1526                 mds_reply = lustre_msg_buf(req->rq_repmsg, 0,
1527                                            sizeof(*mds_reply));
1528
1529                 id_ino(&mds_reply->id1) = inum;
1530                 mds_reply->valid = OBD_MD_FLID;
1531                 GOTO(cleanup, rc);
1532         }
1533 #endif
1534         if (resent_req == 0) {
1535                 LASSERT(id_fid(&body->id1) != 0);
1536                 if (name) {
1537                         rc = mds_get_parent_child_locked(obd, mds, &body->id1,
1538                                                          parent_lockh, &dparent,
1539                                                          LCK_PR, 
1540                                                          MDS_INODELOCK_UPDATE,
1541                                                          &update_mode, 
1542                                                          name, namesize,
1543                                                          child_lockh, &dchild, 
1544                                                          LCK_PR, child_part);
1545                         if (rc)
1546                                 GOTO(cleanup, rc);
1547                 
1548                         cleanup_phase = 2; /* dchild, dparent, locks */
1549                         
1550                         /*
1551                          * let's make sure this name should leave on this mds
1552                          * node.
1553                          */
1554                         rc = mds_check_mds_num(obd, dparent->d_inode, name, namesize);
1555                         if (rc)
1556                                 GOTO(cleanup, rc);
1557                 } else {
1558                         /* we have no dentry here, drop LOOKUP bit */
1559                         /* FIXME: we need MDS_INODELOCK_LOOKUP or not. */
1560                         child_part &= ~MDS_INODELOCK_LOOKUP;
1561                         CDEBUG(D_OTHER, "%s: retrieve attrs for "DLID4"\n",
1562                                obd->obd_name, OLID4(&body->id1));
1563
1564                         dchild = mds_id2locked_dentry(obd, &body->id1, NULL,
1565                                                       LCK_PR, parent_lockh,
1566                                                       &update_mode,
1567                                                       NULL, 0, 
1568                                                       MDS_INODELOCK_UPDATE);
1569                         if (IS_ERR(dchild)) {
1570                                 CERROR("can't find inode with id "DLID4", err = %d\n", 
1571                                        OLID4(&body->id1), (int)PTR_ERR(dchild));
1572                                 GOTO(cleanup, rc = PTR_ERR(dchild));
1573                         }
1574                         memcpy(child_lockh, parent_lockh, sizeof(parent_lockh[0]));
1575                 }
1576         } else {
1577                 struct ldlm_lock *granted_lock;
1578
1579                 DEBUG_REQ(D_DLMTRACE, req, "resent, not enqueuing new locks");
1580                 granted_lock = ldlm_handle2lock(child_lockh);
1581
1582                 LASSERTF(granted_lock != NULL, LPU64"/%lu lockh "LPX64"\n",
1583                          id_fid(&body->id1), (unsigned long)id_group(&body->id1),
1584                          child_lockh->cookie);
1585
1586                 if (name) {
1587                         /* usual named request */
1588                         dparent = mds_id2dentry(obd, &body->id1, NULL);
1589                         LASSERT(!IS_ERR(dparent));
1590                         dchild = ll_lookup_one_len(name, dparent, namesize - 1);
1591                         LASSERT(!IS_ERR(dchild));
1592                 } else {
1593                         /* client wants to get attr. by id */
1594                         dchild = mds_id2dentry(obd, &body->id1, NULL);
1595                         LASSERT(!IS_ERR(dchild));
1596                 }
1597                 LDLM_LOCK_PUT(granted_lock);
1598         }
1599
1600         cleanup_phase = 2; /* dchild, dparent, locks */
1601
1602         if (!DENTRY_VALID(dchild)) {
1603                 intent_set_disposition(rep, DISP_LOOKUP_NEG);
1604                 /*
1605                  * in the intent case, the policy clears this error: the
1606                  * disposition is enough.
1607                  */
1608                 rc = -ENOENT;
1609                 GOTO(cleanup, rc);
1610         } else {
1611                 intent_set_disposition(rep, DISP_LOOKUP_POS);
1612         }
1613
1614         if (req->rq_repmsg == NULL) {
1615                 if (dchild->d_flags & DCACHE_CROSS_REF)
1616                         rc = mds_getattr_pack_msg_cf(req, dchild, offset);
1617                 else
1618                         rc = mds_getattr_pack_msg(req, dchild, offset);
1619                 if (rc != 0) {
1620                         CERROR ("mds_getattr_pack_msg: %d\n", rc);
1621                         GOTO (cleanup, rc);
1622                 }
1623         }
1624
1625         rc = mds_getattr_internal(obd, dchild, req, offset, body,
1626                                   reply_offset, rsd);
1627         if (rc)
1628                 GOTO(cleanup, rc); /* returns the lock to the client */
1629
1630         /* probably MDS knows actual size? */
1631         body = lustre_msg_buf(req->rq_repmsg, reply_offset, sizeof(*body));
1632         LASSERT(body != NULL);
1633         mds_getattr_size(obd, dchild, req, body);
1634
1635         GOTO(cleanup, rc);
1636
1637  cleanup:
1638         switch (cleanup_phase) {
1639         case 2:
1640                 if (resent_req == 0) {
1641                         if (rc && DENTRY_VALID(dchild))
1642                                 ldlm_lock_decref(child_lockh, LCK_PR);
1643                         if (name)
1644                                 ldlm_lock_decref(parent_lockh, LCK_PR);
1645 #ifdef S_PDIROPS
1646                         if (parent_lockh[1].cookie != 0)
1647                                 ldlm_lock_decref(parent_lockh + 1, update_mode);
1648 #endif
1649                         if (dparent)
1650                                 l_dput(dparent);
1651                 }
1652                 l_dput(dchild);
1653         case 1:
1654                 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1655         default:
1656                 mds_exit_ucred(&uc);
1657         }
1658         return rc;
1659 }
1660
1661 static int mds_getattr(struct ptlrpc_request *req, int offset)
1662 {
1663         struct obd_device *obd = req->rq_export->exp_obd;
1664         struct lvfs_run_ctxt saved;
1665         struct dentry *de;
1666         struct mds_req_sec_desc *rsd;
1667         struct mds_body *body;
1668         struct lvfs_ucred uc = {NULL, NULL,};
1669         int rc = 0;
1670         ENTRY;
1671
1672         MD_COUNTER_INCREMENT(obd, getattr);
1673
1674         rsd = lustre_swab_mds_secdesc(req, MDS_REQ_SECDESC_OFF);
1675         if (!rsd) {
1676                 CERROR("Can't unpack security desc\n");
1677                 RETURN(-EFAULT);
1678         }
1679
1680         body = lustre_swab_reqbuf(req, offset, sizeof(*body),
1681                                   lustre_swab_mds_body);
1682         if (body == NULL) {
1683                 CERROR ("Can't unpack body\n");
1684                 RETURN (-EFAULT);
1685         }
1686
1687         rc = mds_init_ucred(&uc, req, rsd);
1688         if (rc) {
1689                 mds_exit_ucred(&uc);
1690                 RETURN(rc);
1691         }
1692
1693         push_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1694         de = mds_id2dentry(obd, &body->id1, NULL);
1695         if (IS_ERR(de)) {
1696                 rc = req->rq_status = PTR_ERR(de);
1697                 GOTO(out_pop, rc);
1698         }
1699
1700         rc = mds_getattr_pack_msg(req, de, offset);
1701         if (rc != 0) {
1702                 CERROR("mds_getattr_pack_msg: %d\n", rc);
1703                 GOTO(out_pop, rc);
1704         }
1705
1706         req->rq_status = mds_getattr_internal(obd, de, req, offset, body,
1707                                               0, rsd);
1708         l_dput(de);
1709
1710         EXIT;
1711 out_pop:
1712         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1713         mds_exit_ucred(&uc);
1714         return rc;
1715 }
1716 static int mds_access_check(struct ptlrpc_request *req, int offset)
1717 {
1718         struct obd_device *obd = req->rq_export->exp_obd;
1719         struct lvfs_run_ctxt saved;
1720         struct dentry *de;
1721         struct mds_req_sec_desc *rsd;
1722         struct mds_body *body;
1723         struct lvfs_ucred uc;
1724         int rep_size[2] = {sizeof(*body),
1725                            sizeof(struct mds_remote_perm)};
1726         int rc = 0, rep_offset;
1727         ENTRY;
1728
1729         if (!req->rq_export->exp_mds_data.med_remote) {
1730                 CERROR("from local client "LPU64"\n", req->rq_peer.peer_id.nid);
1731                 RETURN(-EINVAL);
1732         }
1733
1734         rsd = lustre_swab_mds_secdesc(req, MDS_REQ_SECDESC_OFF);
1735         if (!rsd) {
1736                 CERROR("Can't unpack security desc\n");
1737                 RETURN(-EFAULT);
1738         }
1739
1740         body = lustre_swab_reqbuf(req, offset, sizeof(*body),
1741                                   lustre_swab_mds_body);
1742         if (body == NULL) {
1743                 CERROR ("Can't unpack body\n");
1744                 RETURN (-EFAULT);
1745         }
1746
1747         MD_COUNTER_INCREMENT(obd, access_check);
1748
1749         rc = mds_init_ucred(&uc, req, rsd);
1750         if (rc) {
1751                 CERROR("init ucred error: %d\n", rc);
1752                 RETURN(rc);
1753         }
1754         push_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1755
1756         de = mds_id2dentry(obd, &body->id1, NULL);
1757         if (IS_ERR(de)) {
1758                 CERROR("grab ino "LPU64": err %ld\n",
1759                        body->id1.li_stc.u.e3s.l3s_ino, PTR_ERR(de));
1760                 GOTO(out_pop, rc = PTR_ERR(de));
1761         }
1762
1763         rc = lustre_pack_reply(req, 2, rep_size, NULL);
1764         if (rc) {
1765                 CERROR("pack reply error: %d\n", rc);
1766                 GOTO(out_dput, rc = -EINVAL);
1767         }
1768
1769         body = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*body));
1770         LASSERT(body);
1771
1772         rep_offset = 1;
1773         rc = mds_pack_remote_perm(req, &rep_offset, body, de->d_inode);
1774
1775         EXIT;
1776
1777 out_dput:
1778         l_dput(de);
1779 out_pop:
1780         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1781         mds_exit_ucred(&uc);
1782         return rc;
1783 }
1784
1785 static int mds_obd_statfs(struct obd_device *obd, struct obd_statfs *osfs,
1786                           unsigned long max_age)
1787 {
1788         int rc;
1789         ENTRY;
1790
1791         spin_lock(&obd->obd_osfs_lock);
1792         rc = fsfilt_statfs(obd, obd->u.mds.mds_sb, max_age);
1793         if (rc == 0)
1794                 memcpy(osfs, &obd->obd_osfs, sizeof(*osfs));
1795         spin_unlock(&obd->obd_osfs_lock);
1796
1797         RETURN(rc);
1798 }
1799
1800 static int mds_statfs(struct ptlrpc_request *req)
1801 {
1802         struct obd_device *obd = req->rq_export->exp_obd;
1803         int rc, size = sizeof(struct obd_statfs);
1804         ENTRY;
1805
1806         /* This will trigger a watchdog timeout */
1807         OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_STATFS_LCW_SLEEP,
1808                          (MDS_SERVICE_WATCHDOG_TIMEOUT / 1000) + 1);
1809
1810         rc = lustre_pack_reply(req, 1, &size, NULL);
1811         if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_STATFS_PACK)) {
1812                 CERROR("mds: statfs lustre_pack_reply failed: rc = %d\n", rc);
1813                 GOTO(out, rc);
1814         }
1815
1816         OBD_COUNTER_INCREMENT(obd, statfs);
1817
1818         /* We call this so that we can cache a bit - 1 jiffie worth */
1819         rc = mds_obd_statfs(obd, lustre_msg_buf(req->rq_repmsg, 0, size),
1820                             jiffies - HZ);
1821         if (rc) {
1822                 CERROR("mds_obd_statfs failed: rc %d\n", rc);
1823                 GOTO(out, rc);
1824         }
1825
1826         EXIT;
1827 out:
1828         req->rq_status = rc;
1829         return rc;
1830 }
1831
1832 static int mds_sync(struct ptlrpc_request *req, int offset)
1833 {
1834         struct obd_device *obd = req->rq_export->exp_obd;
1835         struct mds_obd *mds = &obd->u.mds;
1836         struct mds_body *body;
1837         int rc, size = sizeof(*body);
1838         ENTRY;
1839
1840         body = lustre_swab_reqbuf(req, offset, sizeof(*body),
1841                                   lustre_swab_mds_body);
1842         if (body == NULL)
1843                 GOTO(out, rc = -EPROTO);
1844
1845         rc = lustre_pack_reply(req, 1, &size, NULL);
1846         if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_SYNC_PACK)) {
1847                 CERROR("fsync lustre_pack_reply failed: rc = %d\n", rc);
1848                 GOTO(out, rc);
1849         }
1850
1851         if (id_ino(&body->id1) == 0) {
1852                 /* an id of zero is taken to mean "sync whole filesystem" */
1853                 rc = fsfilt_sync(obd, mds->mds_sb);
1854                 if (rc)
1855                         GOTO(out, rc);
1856         } else {
1857                 /* just any file to grab fsync method - "file" arg unused */
1858                 struct file *file = mds->mds_rcvd_filp;
1859                 struct mds_body *rep_body;
1860                 struct dentry *de;
1861
1862                 de = mds_id2dentry(obd, &body->id1, NULL);
1863                 if (IS_ERR(de))
1864                         GOTO(out, rc = PTR_ERR(de));
1865
1866                 rc = file->f_op->fsync(NULL, de, 1);
1867                 if (rc)
1868                         GOTO(out, rc);
1869
1870                 rep_body = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*rep_body));
1871                 mds_pack_inode2body(obd, rep_body, de->d_inode,
1872                                     (body->valid & OBD_MD_FID) ? 1 : 0);
1873                 l_dput(de);
1874         }
1875
1876         EXIT;
1877 out:
1878         req->rq_status = rc;
1879         return rc;
1880 }
1881
1882 /* mds_readpage does not take a DLM lock on the inode, because the client must
1883  * already have a PR lock.
1884  *
1885  * If we were to take another one here, a deadlock will result, if another
1886  * thread is already waiting for a PW lock. */
1887 static int mds_readpage(struct ptlrpc_request *req, int offset)
1888 {
1889         struct obd_device *obd = req->rq_export->exp_obd;
1890         struct vfsmount *mnt;
1891         struct dentry *de;
1892         struct file *file;
1893         struct mds_req_sec_desc *rsd;
1894         struct mds_body *body, *repbody;
1895         struct lvfs_run_ctxt saved;
1896         int rc, size = sizeof(*repbody);
1897         struct lvfs_ucred uc = {NULL, NULL,};
1898         ENTRY;
1899
1900         rc = lustre_pack_reply(req, 1, &size, NULL);
1901         if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_READPAGE_PACK)) {
1902                 CERROR("mds: out of memory\n");
1903                 GOTO(out, rc = -ENOMEM);
1904         }
1905
1906         rsd = lustre_swab_mds_secdesc(req, MDS_REQ_SECDESC_OFF);
1907         if (!rsd) {
1908                 CERROR("Can't unpack security desc\n");
1909                 GOTO (out, rc = -EFAULT);
1910         }
1911
1912         body = lustre_swab_reqbuf(req, offset, sizeof(*body),
1913                                   lustre_swab_mds_body);
1914         if (body == NULL) {
1915                 CERROR("Can't unpack body\n");
1916                 GOTO (out, rc = -EFAULT);
1917         }
1918
1919         rc = mds_init_ucred(&uc, req, rsd);
1920         if (rc) {
1921                 GOTO(out, rc);
1922         }
1923
1924         push_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1925         de = mds_id2dentry(obd, &body->id1, &mnt);
1926         if (IS_ERR(de))
1927                 GOTO(out_pop, rc = PTR_ERR(de));
1928
1929         CDEBUG(D_INODE, "ino %lu\n", de->d_inode->i_ino);
1930
1931         file = dentry_open(de, mnt, O_RDONLY | O_LARGEFILE);
1932         /* note: in case of an error, dentry_open puts dentry */
1933         if (IS_ERR(file))
1934                 GOTO(out_pop, rc = PTR_ERR(file));
1935
1936         /* body->size is actually the offset -eeb */
1937         if ((body->size & (de->d_inode->i_blksize - 1)) != 0) {
1938                 CERROR("offset "LPU64" not on a block boundary of %lu\n",
1939                        body->size, de->d_inode->i_blksize);
1940                 GOTO(out_file, rc = -EFAULT);
1941         }
1942
1943         /* body->nlink is actually the #bytes to read -eeb */
1944         if (body->nlink & (de->d_inode->i_blksize - 1)) {
1945                 CERROR("size %u is not multiple of blocksize %lu\n",
1946                        body->nlink, de->d_inode->i_blksize);
1947                 GOTO(out_file, rc = -EFAULT);
1948         }
1949
1950         repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*repbody));
1951         repbody->size = file->f_dentry->d_inode->i_size;
1952         repbody->valid = OBD_MD_FLSIZE;
1953
1954         /* to make this asynchronous make sure that the handling function
1955            doesn't send a reply when this function completes. Instead a
1956            callback function would send the reply */
1957         /* body->size is actually the offset -eeb */
1958         rc = mds_sendpage(req, file, body->size, body->nlink);
1959
1960         EXIT;
1961 out_file:
1962         filp_close(file, 0);
1963 out_pop:
1964         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
1965 out:
1966         mds_exit_ucred(&uc);
1967         req->rq_status = rc;
1968         return 0;
1969 }
1970
1971 /* update master MDS ID, which is stored in local inode EA. */
1972 int mds_update_mid(struct obd_device *obd, struct lustre_id *id,
1973                    void *data, int data_len)
1974 {
1975         struct mds_obd *mds = &obd->u.mds;
1976         struct dentry *dentry;
1977         void *handle;
1978         int rc = 0;
1979         ENTRY;
1980
1981         LASSERT(id);
1982         LASSERT(obd);
1983         
1984         dentry = mds_id2dentry(obd, id, NULL);
1985         if (IS_ERR(dentry))
1986                 GOTO(out, rc = PTR_ERR(dentry));
1987
1988         if (!dentry->d_inode) {
1989                 CERROR("Can't find object "DLID4".\n",
1990                        OLID4(id));
1991                 GOTO(out_dentry, rc = -EINVAL);
1992         }
1993
1994         handle = fsfilt_start(obd, dentry->d_inode,
1995                               FSFILT_OP_SETATTR, NULL);
1996         if (IS_ERR(handle))
1997                 GOTO(out_dentry, rc = PTR_ERR(handle));
1998
1999         rc = mds_update_inode_mid(obd, dentry->d_inode, handle,
2000                                   (struct lustre_id *)data);
2001         if (rc) {
2002                 CERROR("Can't update inode "DLID4" master id, "
2003                        "error = %d.\n", OLID4(id), rc);
2004                 GOTO(out_commit, rc);
2005         }
2006
2007         EXIT;
2008 out_commit:
2009         fsfilt_commit(obd, mds->mds_sb, dentry->d_inode,
2010                       handle, 0);
2011 out_dentry:
2012         l_dput(dentry);
2013 out:
2014         return rc;
2015 }
2016 EXPORT_SYMBOL(mds_update_mid);
2017
2018 /* read master MDS ID, which is stored in local inode EA. */
2019 int mds_read_mid(struct obd_device *obd, struct lustre_id *id,
2020                  void *data, int data_len)
2021 {
2022         struct dentry *dentry;
2023         int rc = 0;
2024         ENTRY;
2025
2026         LASSERT(id);
2027         LASSERT(obd);
2028         
2029         dentry = mds_id2dentry(obd, id, NULL);
2030         if (IS_ERR(dentry))
2031                 GOTO(out, rc = PTR_ERR(dentry));
2032
2033         if (!dentry->d_inode) {
2034                 CERROR("Can't find object "DLID4".\n",
2035                        OLID4(id));
2036                 GOTO(out_dentry, rc = -EINVAL);
2037         }
2038
2039         down(&dentry->d_inode->i_sem);
2040         rc = mds_read_inode_mid(obd, dentry->d_inode,
2041                                 (struct lustre_id *)data);
2042         up(&dentry->d_inode->i_sem);
2043         if (rc) {
2044                 CERROR("Can't read inode "DLID4" master id, "
2045                        "error = %d.\n", OLID4(id), rc);
2046                 GOTO(out_dentry, rc);
2047         }
2048
2049         EXIT;
2050 out_dentry:
2051         l_dput(dentry);
2052 out:
2053         return rc;
2054 }
2055 EXPORT_SYMBOL(mds_read_mid);
2056
2057 int mds_read_md(struct obd_device *obd, struct lustre_id *id, 
2058                 char **data, int *datalen)
2059 {
2060         struct dentry *dentry;
2061         struct mds_obd *mds = &obd->u.mds;
2062         int rc = 0, mea = 0;
2063         char *ea;
2064         ENTRY;
2065
2066         LASSERT(id);
2067         LASSERT(obd);
2068         
2069         dentry = mds_id2dentry(obd, id, NULL);
2070         if (IS_ERR(dentry))
2071                 GOTO(out, rc = PTR_ERR(dentry));
2072
2073         if (!dentry->d_inode) {
2074                 CERROR("Can't find object "DLID4".\n",
2075                        OLID4(id));
2076                 GOTO(out_dentry, rc = -EINVAL);
2077         }
2078         if (S_ISDIR(dentry->d_inode->i_mode)) {
2079                 *datalen = obd_packmd(mds->mds_md_exp, NULL, NULL);
2080                 mea = 1; 
2081         } else {
2082                 *datalen = obd_packmd(mds->mds_dt_exp, NULL, NULL); 
2083                 mea = 0;
2084         }
2085         OBD_ALLOC(ea, *datalen);
2086         if (!ea) {
2087                 *datalen = 0;
2088                 GOTO(out_dentry, rc = PTR_ERR(dentry));
2089         } 
2090         *data = ea;
2091         down(&dentry->d_inode->i_sem);
2092         rc = fsfilt_get_md(obd, dentry->d_inode, *data, *datalen,
2093                            (mea ? EA_MEA : EA_LOV));
2094         up(&dentry->d_inode->i_sem);
2095         
2096         if (rc < 0) 
2097                 CERROR("Error %d reading eadata for ino %lu\n",
2098                         rc, dentry->d_inode->i_ino);
2099 out_dentry:
2100         l_dput(dentry);
2101 out:
2102         RETURN(rc);
2103 }
2104 EXPORT_SYMBOL(mds_read_md);
2105
2106 int mds_reint(struct ptlrpc_request *req, int offset,
2107               struct lustre_handle *lockh)
2108 {
2109         struct mds_update_record *rec;
2110         struct mds_req_sec_desc *rsd;
2111         int rc;
2112         ENTRY;
2113
2114         OBD_ALLOC(rec, sizeof(*rec));
2115         if (rec == NULL)
2116                 RETURN(-ENOMEM);
2117
2118         rsd = lustre_swab_mds_secdesc(req, MDS_REQ_SECDESC_OFF);
2119         if (!rsd) {
2120                 CERROR("Can't unpack security desc\n");
2121                 GOTO(out, rc = -EFAULT);
2122         }
2123
2124         rc = mds_update_unpack(req, offset, rec);
2125         if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_UNPACK)) {
2126                 CERROR("invalid record\n");
2127                 GOTO(out, req->rq_status = -EINVAL);
2128         }
2129
2130         rc = mds_init_ucred(&rec->ur_uc, req, rsd);
2131         if (rc) {
2132                 GOTO(out, rc);
2133         }
2134
2135         /* rc will be used to interrupt a for loop over multiple records */
2136         rc = mds_reint_rec(rec, offset, req, lockh);
2137
2138  out:
2139         mds_exit_ucred(&rec->ur_uc);
2140         OBD_FREE(rec, sizeof(*rec));
2141         RETURN(rc);
2142 }
2143
2144 static int mds_filter_recovery_request(struct ptlrpc_request *req,
2145                                        struct obd_device *obd, int *process)
2146 {
2147         switch (req->rq_reqmsg->opc) {
2148         case MDS_CONNECT: /* This will never get here, but for completeness. */
2149         case OST_CONNECT: /* This will never get here, but for completeness. */
2150         case MDS_DISCONNECT:
2151         case OST_DISCONNECT:
2152                *process = 1;
2153                RETURN(0);
2154
2155         case MDS_CLOSE:
2156         case MDS_SYNC: /* used in unmounting */
2157         case OBD_PING:
2158         case MDS_REINT:
2159         case LDLM_ENQUEUE:
2160         case OST_CREATE:
2161                 *process = target_queue_recovery_request(req, obd);
2162                 RETURN(0);
2163
2164         default:
2165                 DEBUG_REQ(D_ERROR, req, "not permitted during recovery");
2166                 *process = 0;
2167                 /* XXX what should we set rq_status to here? */
2168                 req->rq_status = -EAGAIN;
2169                 RETURN(ptlrpc_error(req));
2170         }
2171 }
2172
2173 static char *reint_names[] = {
2174         [REINT_SETATTR] "setattr",
2175         [REINT_CREATE]  "create",
2176         [REINT_LINK]    "link",
2177         [REINT_UNLINK]  "unlink",
2178         [REINT_RENAME]  "rename",
2179         [REINT_OPEN]    "open",
2180 };
2181
/* Attribute bits requested from obdo_from_inode() when an inode is packed
 * into the obdo of a remote object create reply. */
#define FILTER_VALID_FLAGS (OBD_MD_FLTYPE   | \
                            OBD_MD_FLMODE   | \
                            OBD_MD_FLGENER  | \
                            OBD_MD_FLSIZE   | \
                            OBD_MD_FLBLOCKS | \
                            OBD_MD_FLBLKSZ  | \
                            OBD_MD_FLATIME  | \
                            OBD_MD_FLMTIME  | \
                            OBD_MD_FLCTIME  | \
                            OBD_MD_FLID)
2186
2187 static void reconstruct_create(struct ptlrpc_request *req)
2188 {
2189         struct mds_export_data *med = &req->rq_export->exp_mds_data;
2190         struct mds_client_data *mcd = med->med_mcd;
2191         struct dentry *dentry;
2192         struct ost_body *body;
2193         struct lustre_id id;
2194         int rc;
2195         ENTRY;
2196
2197         /* copy rc, transno and disp; steal locks */
2198         mds_req_from_mcd(req, mcd);
2199         if (req->rq_status) {
2200                 EXIT;
2201                 return;
2202         }
2203
2204         id_gen(&id) = 0;
2205         id_group(&id) = 0;
2206
2207         id_ino(&id) = mcd->mcd_last_data;
2208         LASSERT(id_ino(&id) != 0);
2209
2210         dentry = mds_id2dentry(req2obd(req), &id, NULL);
2211         if (IS_ERR(dentry)) {
2212                 CERROR("can't find inode "LPU64"\n", id_ino(&id));
2213                 req->rq_status = PTR_ERR(dentry);
2214                 EXIT;
2215                 return;
2216         }
2217
2218         CWARN("reconstruct reply for x"LPU64" (remote ino) "LPU64" -> %lu/%u\n",
2219               req->rq_xid, id_ino(&id), dentry->d_inode->i_ino,
2220               dentry->d_inode->i_generation);
2221
2222         body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
2223         obdo_from_inode(&body->oa, dentry->d_inode, FILTER_VALID_FLAGS);
2224         body->oa.o_id = dentry->d_inode->i_ino;
2225         body->oa.o_generation = dentry->d_inode->i_generation;
2226         body->oa.o_valid |= OBD_MD_FLID | OBD_MD_FLGENER;
2227
2228         down(&dentry->d_inode->i_sem);
2229         rc = mds_read_inode_sid(req2obd(req), dentry->d_inode, &id);
2230         up(&dentry->d_inode->i_sem);
2231         if (rc) {
2232                 CERROR("Can't read inode self id, inode %lu, "
2233                        "rc %d\n", dentry->d_inode->i_ino, rc);
2234                 id_fid(&id) = 0;
2235         }
2236
2237         body->oa.o_fid = id_fid(&id);
2238         body->oa.o_mds = id_group(&id);
2239         l_dput(dentry);
2240
2241         EXIT;
2242 }
2243
2244 static int mds_inode_init_acl(struct obd_device *obd, void *handle,
2245                               struct dentry *de, void *xattr, int xattr_size)
2246 {
2247         struct inode *inode = de->d_inode;
2248         struct posix_acl *acl;
2249         mode_t mode;
2250         int rc = 0;
2251
2252         LASSERT(handle);
2253         LASSERT(inode);
2254         LASSERT(xattr);
2255         LASSERT(xattr_size > 0);
2256
2257         if (!inode->i_op->getxattr || !inode->i_op->setxattr) {
2258                 CERROR("backend fs dosen't support xattr\n");
2259                 return -EOPNOTSUPP;
2260         }
2261
2262         /* set default acl */
2263         if (S_ISDIR(inode->i_mode)) {
2264                 rc = inode->i_op->setxattr(de, XATTR_NAME_ACL_DEFAULT,
2265                                            xattr, xattr_size, 0);
2266                 if (rc) {
2267                         CERROR("set default acl err: %d\n", rc);
2268                         return rc;
2269                 }
2270         }
2271
2272         /* set access acl */
2273         acl = posix_acl_from_xattr(xattr, xattr_size);
2274         if (acl == NULL || IS_ERR(acl)) {
2275                 CERROR("insane attr data\n");
2276                 return PTR_ERR(acl);
2277         }
2278
2279         if (posix_acl_valid(acl)) {
2280                 CERROR("default acl not valid: %d\n", rc);
2281                 rc = -EFAULT;
2282                 goto out;
2283         }
2284
2285         mode = inode->i_mode;
2286         rc = posix_acl_create_masq(acl, &mode);
2287         if (rc < 0) {
2288                 CERROR("create masq err %d\n", rc);
2289                 goto out;
2290         }
2291
2292         if (inode->i_mode != mode) {
2293                 struct iattr iattr = { .ia_valid = ATTR_MODE,
2294                                        .ia_mode = mode };
2295                 int rc2;
2296
2297                 rc2 = fsfilt_setattr(obd, de, handle, &iattr, 0);
2298                 if (rc2) {
2299                         CERROR("setattr mode err: %d\n", rc2);
2300                         rc = rc2;
2301                         goto out;
2302                 }
2303         }
2304
2305         if (rc > 0) {
2306                 /* we didn't change acl except mode bits of some
2307                  * entries, so should be fit into original size.
2308                  */
2309                 rc = posix_acl_to_xattr(acl, xattr, xattr_size);
2310                 LASSERT(rc > 0);
2311
2312                 rc = inode->i_op->setxattr(de, XATTR_NAME_ACL_ACCESS,
2313                                            xattr, xattr_size, 0);
2314                 if (rc)
2315                         CERROR("set access acl err: %d\n", rc);
2316         }
2317 out:
2318         posix_acl_release(acl);
2319         return rc;
2320 }
2321
2322 static int mdt_obj_create(struct ptlrpc_request *req)
2323 {
2324         struct obd_device *obd = req->rq_export->exp_obd;
2325         struct mds_obd *mds = &obd->u.mds;
2326         struct ost_body *body, *repbody;
2327         void *acl = NULL;
2328         int acl_size;
2329         char idname[LL_ID_NAMELEN];
2330         int size = sizeof(*repbody);
2331         struct inode *parent_inode;
2332         struct lvfs_run_ctxt saved;
2333         int rc, cleanup_phase = 0;
2334         struct dentry *new = NULL;
2335         struct dentry_params dp;
2336         int mealen, flags = 0;
2337         struct lvfs_ucred uc;
2338         struct lustre_id id;
2339         struct mea *mea;
2340         void *handle = NULL;
2341         unsigned long cr_inum = 0;
2342         __u64 fid = 0;
2343         ENTRY;
2344        
2345         DEBUG_REQ(D_HA, req, "create remote object");
2346         parent_inode = mds->mds_unnamed_dir->d_inode;
2347
2348         body = lustre_swab_reqbuf(req, 0, sizeof(*body),
2349                                   lustre_swab_ost_body);
2350         if (body == NULL)
2351                 RETURN(-EFAULT);
2352
2353         /* acl data is packed transparently, no swab here */
2354         LASSERT(req->rq_reqmsg->bufcount >= 2);
2355         acl_size = req->rq_reqmsg->buflens[1];
2356         if (acl_size) {
2357                 acl = lustre_msg_buf(req->rq_reqmsg, 1, acl_size);
2358                 if (!acl) {
2359                         CERROR("No default acl buf?\n");
2360                         RETURN(-EFAULT);
2361                 }
2362         }
2363
2364         rc = lustre_pack_reply(req, 1, &size, NULL);
2365         if (rc)
2366                 RETURN(rc);
2367
2368         MDS_CHECK_RESENT(req, reconstruct_create(req));
2369
2370         uc.luc_lsd = NULL;
2371         uc.luc_ginfo = NULL;
2372         uc.luc_uid = body->oa.o_uid;
2373         uc.luc_gid = body->oa.o_gid;
2374         uc.luc_fsuid = body->oa.o_uid;
2375         uc.luc_fsgid = body->oa.o_gid;
2376
2377         push_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
2378         repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*repbody));
2379
2380         /* in REPLAY case inum should be given (client or other MDS fills it) */
2381         if (body->oa.o_id && ((body->oa.o_flags & OBD_FL_RECREATE_OBJS) ||
2382             (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY))) {
2383                 /*
2384                  * this is re-create request from MDS holding directory name.
2385                  * we have to lookup given ino/gen first. if it exists (good
2386                  * case) then there is nothing to do. if it does not then we
2387                  * have to recreate it.
2388                  */
2389                 id_ino(&id) = body->oa.o_id;
2390                 id_gen(&id) = body->oa.o_generation;
2391  
2392                 new = mds_id2dentry(obd, &id, NULL);
2393                 if (!IS_ERR(new) && new->d_inode) {
2394                         struct lustre_id sid;
2395                                 
2396                         CDEBUG(D_OTHER, "mkdir repairing %lu/%lu\n",
2397                                (unsigned long)id_ino(&id),
2398                                (unsigned long)id_gen(&id));
2399                         
2400                         obdo_from_inode(&repbody->oa, new->d_inode,
2401                                         FILTER_VALID_FLAGS);
2402                         
2403                         repbody->oa.o_id = new->d_inode->i_ino;
2404                         repbody->oa.o_generation = new->d_inode->i_generation;
2405                         repbody->oa.o_valid |= OBD_MD_FLID | OBD_MD_FLGENER;
2406                         cleanup_phase = 1;
2407
2408                         down(&new->d_inode->i_sem);
2409                         rc = mds_read_inode_sid(obd, new->d_inode, &sid);
2410                         up(&new->d_inode->i_sem);
2411                         if (rc) {
2412                                 CERROR("Can't read inode self id "
2413                                        "inode %lu, rc %d.\n",
2414                                        new->d_inode->i_ino, rc);
2415                                 GOTO(cleanup, rc);
2416                         }
2417
2418                         repbody->oa.o_fid = id_fid(&sid);
2419                         repbody->oa.o_mds = id_group(&sid);
2420                         LASSERT(id_fid(&sid) != 0);
2421
2422                         /* 
2423                          * here we could use fid passed in body->oa.o_fid and
2424                          * thus avoid mds_read_inode_sid().
2425                          */
2426                         cr_inum = new->d_inode->i_ino;
2427                         GOTO(cleanup, rc = 0);
2428                 }
2429         }
2430         
2431         down(&parent_inode->i_sem);
2432         handle = fsfilt_start(obd, parent_inode, FSFILT_OP_MKDIR, NULL);
2433         if (IS_ERR(handle)) {
2434                 up(&parent_inode->i_sem);
2435                 CERROR("fsfilt_start() failed, rc = %d\n",
2436                        (int)PTR_ERR(handle));
2437                 GOTO(cleanup, rc = PTR_ERR(handle));
2438         }
2439         cleanup_phase = 1; /* transaction */
2440
2441 repeat:
2442         rc = sprintf(idname, "%u.%u", ll_insecure_random_int(), current->pid);
2443         new = lookup_one_len(idname, mds->mds_unnamed_dir, rc);
2444         if (IS_ERR(new)) {
2445                 CERROR("%s: can't lookup new inode (%s) for mkdir: %d\n",
2446                        obd->obd_name, idname, (int) PTR_ERR(new));
2447                 fsfilt_commit(obd, mds->mds_sb, new->d_inode, handle, 0);
2448                 up(&parent_inode->i_sem);
2449                 RETURN(PTR_ERR(new));
2450         } else if (new->d_inode) {
2451                 CERROR("%s: name exists. repeat\n", obd->obd_name);
2452                 goto repeat;
2453         }
2454         if ((body->oa.o_flags & OBD_FL_RECREATE_OBJS) ||
2455              lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
2456                 fid = body->oa.o_fid;
2457         } else { 
2458                 fid = mds_alloc_fid(obd);
2459         }
2460         new->d_fsdata = (void *)&dp;
2461         dp.p_inum = 0;
2462         dp.p_ptr = req;
2463         dp.p_fid = fid;
2464         dp.p_group = mds->mds_num;
2465
2466         if ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) ||
2467             (body->oa.o_flags & OBD_FL_RECREATE_OBJS)) {
2468                 LASSERT(body->oa.o_id != 0);
2469                 dp.p_inum = body->oa.o_id;
2470                 DEBUG_REQ(D_HA, req, "replay create obj %lu/%lu",
2471                           (unsigned long)body->oa.o_id,
2472                           (unsigned long)body->oa.o_generation);
2473         }
2474
2475         rc = vfs_mkdir(parent_inode, new, body->oa.o_mode);
2476         if (rc == 0) {
2477                 if (acl) {
2478                         rc = mds_inode_init_acl(obd, handle, new,
2479                                                 acl, acl_size);
2480                         if (rc) {
2481                                 up(&parent_inode->i_sem);
2482                                 GOTO(cleanup, rc);
2483                         }
2484                 }
2485                 if ((body->oa.o_flags & OBD_FL_RECREATE_OBJS) ||
2486                     lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
2487                         new->d_inode->i_generation = body->oa.o_generation;
2488                         mark_inode_dirty(new->d_inode);
2489                         
2490                         /*
2491                          * avoiding asserts in cache flush case, as
2492                          * @body->oa.o_id should be zero.
2493                          */
2494                         if (body->oa.o_id) {
2495                                 LASSERTF(body->oa.o_id == new->d_inode->i_ino, 
2496                                          "BUG 3550: failed to recreate obj "
2497                                          LPU64" -> %lu\n", body->oa.o_id,
2498                                          new->d_inode->i_ino);
2499                                 
2500                                 LASSERTF(body->oa.o_generation == 
2501                                          new->d_inode->i_generation,
2502                                          "BUG 3550: failed to recreate obj/gen "
2503                                          LPU64"/%u -> %lu/%u\n", body->oa.o_id,
2504                                          body->oa.o_generation,
2505                                          new->d_inode->i_ino, 
2506                                          new->d_inode->i_generation);
2507                         }
2508                 }
2509                 
2510                 obdo_from_inode(&repbody->oa, new->d_inode, FILTER_VALID_FLAGS);
2511                 repbody->oa.o_id = new->d_inode->i_ino;
2512                 repbody->oa.o_generation = new->d_inode->i_generation;
2513                 repbody->oa.o_valid |= OBD_MD_FLID | OBD_MD_FLGENER | OBD_MD_FID;
2514
2515                 if ((body->oa.o_flags & OBD_FL_RECREATE_OBJS) ||
2516                     lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
2517                         id_group(&id) = mds->mds_num;
2518                 
2519                         LASSERT(body->oa.o_fid != 0);
2520                         id_fid(&id) = body->oa.o_fid;
2521
2522                         LASSERT(body->oa.o_id != 0);
2523                         id_ino(&id) = repbody->oa.o_id;
2524                         id_gen(&id) = repbody->oa.o_generation;
2525                 
2526                         down(&new->d_inode->i_sem);
2527                         rc = mds_update_inode_sid(obd, new->d_inode, handle, &id);
2528                         up(&new->d_inode->i_sem);
2529
2530                         /* 
2531                          * make sure, that fid is up-to-date.
2532                          */
2533                         mds_set_last_fid(obd, id_fid(&id));
2534                 } else {
2535                         /*
2536                          * allocate new sid, as object is created from scratch
2537                          * and this is not replay.
2538                          */
2539                         down(&new->d_inode->i_sem);
2540                         rc = mds_set_inode_sid(obd, new->d_inode, handle, &id, fid);
2541                         up(&new->d_inode->i_sem);
2542                 }
2543                 if (rc) {
2544                         CERROR("Can't update lustre ID for inode %lu, "
2545                                "error = %d\n", new->d_inode->i_ino, rc);
2546                         GOTO(cleanup, rc);
2547                 }
2548
2549                 /* initializing o_fid after it is allocated. */
2550                 repbody->oa.o_fid = id_fid(&id);
2551                 repbody->oa.o_mds = id_group(&id);
2552
2553                 rc = fsfilt_del_dir_entry(obd, new);
2554                 up(&parent_inode->i_sem);
2555                 if (rc) {
2556                         CERROR("can't remove name for object: %d\n", rc);
2557                         GOTO(cleanup, rc);
2558                 }
2559                 
2560                 cleanup_phase = 2; /* created directory object */
2561
2562                 CDEBUG(D_OTHER, "created dirobj: %lu/%lu mode %o\n",
2563                        (unsigned long)new->d_inode->i_ino,
2564                        (unsigned long)new->d_inode->i_generation,
2565                        (unsigned)new->d_inode->i_mode);
2566                 cr_inum = new->d_inode->i_ino;
2567         } else {
2568                 up(&parent_inode->i_sem);
2569                 CERROR("%s: can't create dirobj: %d\n", obd->obd_name, rc);
2570                 GOTO(cleanup, rc);
2571         }
2572
2573         if (body->oa.o_valid & OBD_MD_FLID) {
2574                 /* this is new object for splitted dir. We have to prevent
2575                  * recursive splitting on it -bzzz */
2576                 mealen = obd_size_diskmd(mds->mds_md_exp, NULL);
2577
2578                 OBD_ALLOC(mea, mealen);
2579                 if (mea == NULL)
2580                         GOTO(cleanup, rc = -ENOMEM);
2581
2582                 mea->mea_magic = MEA_MAGIC_ALL_CHARS;
2583                 mea->mea_master = 0;
2584                 mea->mea_count = 0;
2585
2586                 down(&new->d_inode->i_sem);
2587                 rc = fsfilt_set_md(obd, new->d_inode, handle,
2588                                    mea, mealen, EA_MEA);
2589                 up(&new->d_inode->i_sem);
2590                 if (rc)
2591                         CERROR("fsfilt_set_md() failed, "
2592                                "rc = %d\n", rc);
2593
2594                 OBD_FREE(mea, mealen);
2595                 
2596                 CDEBUG(D_OTHER, "%s: mark non-splittable %lu/%u - %d\n",
2597                        obd->obd_name, new->d_inode->i_ino,
2598                        new->d_inode->i_generation, flags);
2599         } else if (body->oa.o_easize) {
2600                 /* we pass LCK_EX to split routine to signal that we have
2601                  * exclusive access to the directory. simple because nobody
2602                  * knows it already exists -bzzz */
2603                 rc = mds_try_to_split_dir(obd, new, NULL,
2604                                           body->oa.o_easize, LCK_EX);
2605                 if (rc < 0) {
2606                         CERROR("Can't split directory %lu, error = %d.\n",
2607                                new->d_inode->i_ino, rc);
2608                 } else {
2609                         rc = 0;
2610                 }
2611         }
2612
2613         EXIT;
2614 cleanup:
2615         switch (cleanup_phase) {
2616         case 2: /* object has been created, but we'll may want to replay it later */
2617                 if (rc == 0)
2618                         ptlrpc_require_repack(req);
2619         case 1: /* transaction */
2620                 rc = mds_finish_transno(mds, parent_inode, handle,
2621                                         req, rc, cr_inum);
2622         }
2623
2624         l_dput(new);
2625         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
2626         return rc;
2627 }
2628
2629 static int mdt_get_info(struct ptlrpc_request *req)
2630 {
2631         struct obd_export *exp = req->rq_export;
2632         int keylen, rc = 0;
2633         char *key;
2634         ENTRY;
2635
2636         key = lustre_msg_buf(req->rq_reqmsg, 0, 1);
2637         if (key == NULL) {
2638                 DEBUG_REQ(D_HA, req, "no get_info key");
2639                 RETURN(-EFAULT);
2640         }
2641         keylen = req->rq_reqmsg->buflens[0];
2642
2643         if ((keylen < strlen("mdsize") || strcmp(key, "mdsize") != 0) &&
2644             (keylen < strlen("mdsnum") || strcmp(key, "mdsnum") != 0) &&
2645             (keylen < strlen("rootid") || strcmp(key, "rootid") != 0))
2646                 RETURN(-EPROTO);
2647
2648         if (keylen >= strlen("rootid") && !strcmp(key, "rootid")) {
2649                 struct lustre_id *reply;
2650                 int size = sizeof(*reply);
2651                 
2652                 rc = lustre_pack_reply(req, 1, &size, NULL);
2653                 if (rc)
2654                         RETURN(rc);
2655
2656                 reply = lustre_msg_buf(req->rq_repmsg, 0, size);
2657                 rc = obd_get_info(exp, keylen, key, (__u32 *)&size, reply);
2658         } else {
2659                 obd_id *reply;
2660                 int size = sizeof(*reply);
2661                 
2662                 rc = lustre_pack_reply(req, 1, &size, NULL);
2663                 if (rc)
2664                         RETURN(rc);
2665
2666                 reply = lustre_msg_buf(req->rq_repmsg, 0, size);
2667                 rc = obd_get_info(exp, keylen, key, (__u32 *)&size, reply);
2668         }
2669
2670         req->rq_repmsg->status = 0;
2671         RETURN(rc);
2672 }
2673
2674 static int mds_set_info(struct obd_export *exp, __u32 keylen,
2675                         void *key, __u32 vallen, void *val)
2676 {
2677         struct obd_device *obd;
2678         struct mds_obd *mds;
2679         int rc = 0;
2680         ENTRY;
2681
2682         obd = class_exp2obd(exp);
2683         if (obd == NULL) {
2684                 CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n",
2685                        exp->exp_handle.h_cookie);
2686                 RETURN(-EINVAL);
2687         }
2688
2689         mds = &obd->u.mds;
2690         if (keylen >= strlen("mds_type") &&
2691              memcmp(key, "mds_type", keylen) == 0) {
2692                 int valsize;
2693                 __u32 group;
2694                 
2695                 CDEBUG(D_IOCTL, "set mds type to %x\n", *(int*)val);
2696                 
2697                 mds->mds_obd_type = *(int*)val;
2698                 group = FILTER_GROUP_FIRST_MDS + mds->mds_obd_type;
2699                 valsize = sizeof(group);
2700                 
2701                 /* mds number has been changed, so the corresponding obdfilter
2702                  * exp need to be changed too. */
2703                 rc = obd_set_info(mds->mds_dt_exp, strlen("mds_conn"),
2704                                   "mds_conn", valsize, &group);
2705                 RETURN(rc);
2706         }
2707         if (keylen >= strlen("crypto_type") &&
2708              memcmp(key, "crypto_type", keylen) == 0) {
2709                 rc = mds_set_crypto_type(obd, val, vallen); 
2710                 RETURN(rc);
2711         }
2712
2713         CDEBUG(D_IOCTL, "invalid key\n");
2714         RETURN(-EINVAL);
2715 }
2716
2717 static int mdt_set_info(struct ptlrpc_request *req)
2718 {
2719         char *key, *val;
2720         struct obd_export *exp = req->rq_export;
2721         int keylen, rc = 0, vallen;
2722         ENTRY;
2723
2724         key = lustre_msg_buf(req->rq_reqmsg, 0, 1);
2725         if (key == NULL) {
2726                 DEBUG_REQ(D_HA, req, "no set_info key");
2727                 RETURN(-EFAULT);
2728         }
2729         keylen = req->rq_reqmsg->buflens[0];
2730
2731         if ((keylen == strlen("mds_type") &&
2732             memcmp(key, "mds_type", keylen) == 0) ||
2733             (keylen == strlen("crypto_type") &&
2734             memcmp(key, "crypto_type", keylen) == 0)) {
2735                 rc = lustre_pack_reply(req, 0, NULL, NULL);
2736                 if (rc)
2737                         RETURN(rc);
2738                 
2739                 val = lustre_msg_buf(req->rq_reqmsg, 1, 0);
2740                 vallen = req->rq_reqmsg->buflens[1];
2741
2742                 rc = obd_set_info(exp, keylen, key, vallen, val);
2743                 req->rq_repmsg->status = 0;
2744                 RETURN(rc);
2745         }
2746         CDEBUG(D_IOCTL, "invalid key\n");
2747         RETURN(-EINVAL);
2748 }
2749
2750 static void mds_revoke_export_locks(struct obd_export *exp)
2751 {
2752         struct list_head *locklist = &exp->exp_ldlm_data.led_held_locks;
2753         struct list_head work;
2754         struct ldlm_lock *lock, *next;
2755         struct ldlm_lock_desc desc;
2756
2757         if (!exp->u.eu_mds_data.med_remote)
2758                 return;
2759
2760         ENTRY;
2761         CERROR("implement right locking here! -bzzz\n");
2762         INIT_LIST_HEAD(&work);
2763         spin_lock(&exp->exp_ldlm_data.led_lock);
2764         list_for_each_entry_safe(lock, next, locklist, l_export_chain) {
2765
2766                 lock_res_and_lock(lock);
2767                 if (lock->l_req_mode != lock->l_granted_mode) {
2768                         unlock_res_and_lock(lock);
2769                         continue;
2770                 }
2771
2772                 LASSERT(lock->l_resource);
2773                 if (lock->l_resource->lr_type != LDLM_IBITS &&
2774                     lock->l_resource->lr_type != LDLM_PLAIN) {
2775                         unlock_res_and_lock(lock);
2776                         continue;
2777                 }
2778
2779                 if (lock->l_flags & LDLM_FL_AST_SENT) {
2780                         unlock_res_and_lock(lock);
2781                         continue;
2782                 }
2783
2784                 lock->l_flags |= LDLM_FL_AST_SENT;
2785                 unlock_res_and_lock(lock);
2786
2787                 /* the desc just pretend to exclusive */
2788                 ldlm_lock2desc(lock, &desc);
2789                 desc.l_req_mode = LCK_EX;
2790                 desc.l_granted_mode = 0;
2791
2792                 lock->l_blocking_ast(lock, &desc, NULL, LDLM_CB_BLOCKING);
2793         }
2794         spin_unlock(&exp->exp_ldlm_data.led_lock);
2795
2796         EXIT;
2797 }
2798
2799 static int mds_msg_check_version(struct lustre_msg *msg)
2800 {
2801         int rc;
2802
2803         switch (msg->opc) {
2804         case MDS_CONNECT:
2805         case MDS_DISCONNECT:
2806         case OBD_PING:
2807                 rc = lustre_msg_check_version(msg, LUSTRE_OBD_VERSION);
2808                 if (rc)
2809                         CERROR("bad opc %u version %08x, expecting %08x\n",
2810                                msg->opc, msg->version, LUSTRE_OBD_VERSION);
2811                 break;
2812         case MDS_STATFS:
2813         case MDS_GETSTATUS:
2814         case MDS_GETATTR:
2815         case MDS_GETATTR_LOCK:
2816         case MDS_ACCESS_CHECK:
2817         case MDS_READPAGE:
2818         case MDS_REINT:
2819         case MDS_CLOSE:
2820         case MDS_DONE_WRITING:
2821         case MDS_PIN:
2822         case MDS_SYNC:
2823                 rc = lustre_msg_check_version(msg, LUSTRE_MDS_VERSION);
2824                 if (rc)
2825                         CERROR("bad opc %u version %08x, expecting %08x\n",
2826                                msg->opc, msg->version, LUSTRE_MDS_VERSION);
2827                 break;
2828         case LDLM_ENQUEUE:
2829         case LDLM_CONVERT:
2830         case LDLM_BL_CALLBACK:
2831         case LDLM_CP_CALLBACK:
2832                 rc = lustre_msg_check_version(msg, LUSTRE_DLM_VERSION);
2833                 if (rc)
2834                         CERROR("bad opc %u version %08x, expecting %08x\n",
2835                                msg->opc, msg->version, LUSTRE_DLM_VERSION);
2836                 break;
2837         case OBD_LOG_CANCEL:
2838         case LLOG_ORIGIN_HANDLE_OPEN:
2839         case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
2840         case LLOG_ORIGIN_HANDLE_PREV_BLOCK:
2841         case LLOG_ORIGIN_HANDLE_READ_HEADER:
2842         case LLOG_ORIGIN_HANDLE_CLOSE:
2843         case LLOG_CATINFO:
2844                 rc = lustre_msg_check_version(msg, LUSTRE_LOG_VERSION);
2845                 if (rc)
2846                         CERROR("bad opc %u version %08x, expecting %08x\n",
2847                                msg->opc, msg->version, LUSTRE_LOG_VERSION);
2848                 break;
2849         case OST_CREATE:
2850         case OST_WRITE:
2851         case OST_GET_INFO:
2852         case OST_SET_INFO:
2853                 rc = lustre_msg_check_version(msg, LUSTRE_OBD_VERSION);
2854                 if (rc)
2855                         CERROR("bad opc %u version %08x, expecting %08x\n",
2856                                msg->opc, msg->version, LUSTRE_OBD_VERSION);
2857                 break;
2858         case SEC_INIT:
2859         case SEC_INIT_CONTINUE:
2860         case SEC_FINI:
2861                 rc = 0;
2862                 break;
2863         default:
2864                 CERROR("MDS unknown opcode %d\n", msg->opc);
2865                 rc = -ENOTSUPP;
2866                 break;
2867         }
2868
2869         return rc;
2870 }
2871
2872 int mds_handle(struct ptlrpc_request *req)
2873 {
2874         int should_process, fail = OBD_FAIL_MDS_ALL_REPLY_NET;
2875         struct obd_device *obd = NULL;
2876         struct mds_obd *mds = NULL; /* quell gcc overwarning */
2877         int rc = 0;
2878         ENTRY;
2879
2880         OBD_FAIL_RETURN(OBD_FAIL_MDS_ALL_REQUEST_NET | OBD_FAIL_ONCE, 0);
2881
2882         rc = mds_msg_check_version(req->rq_reqmsg);
2883         if (rc) {
2884                 CERROR("MDS drop mal-formed request\n");
2885                 RETURN(rc);
2886         }
2887
2888         /* Security opc should NOT trigger any recovery events */
2889         if (req->rq_reqmsg->opc == SEC_INIT ||
2890             req->rq_reqmsg->opc == SEC_INIT_CONTINUE) {
2891                 if (req->rq_export) {
2892                         mds_req_add_idmapping(req,
2893                                               &req->rq_export->exp_mds_data);
2894                         mds_revoke_export_locks(req->rq_export);
2895                 }
2896                 GOTO(out, rc = 0);
2897         } else if (req->rq_reqmsg->opc == SEC_FINI) {
2898                 if (req->rq_export) {
2899                         mds_req_del_idmapping(req,
2900                                               &req->rq_export->exp_mds_data);
2901                         mds_revoke_export_locks(req->rq_export);
2902                 }
2903                 GOTO(out, rc = 0);
2904         }
2905
2906         LASSERT(current->journal_info == NULL);
2907         /* XXX identical to OST */
2908         if (req->rq_reqmsg->opc != MDS_CONNECT) {
2909                 struct mds_export_data *med;
2910                 int recovering;
2911
2912                 if (req->rq_export == NULL) {
2913                         CERROR("operation %d on unconnected MDS from %s\n",
2914                                req->rq_reqmsg->opc,
2915                                req->rq_peerstr);
2916                         req->rq_status = -ENOTCONN;
2917                         GOTO(out, rc = -ENOTCONN);
2918                 }
2919
2920                 med = &req->rq_export->exp_mds_data;
2921                 obd = req->rq_export->exp_obd;
2922                 mds = &obd->u.mds;
2923
2924                 /* sanity check: if the xid matches, the request must
2925                  * be marked as a resent or replayed */
2926                 if (req->rq_xid == le64_to_cpu(med->med_mcd->mcd_last_xid) ||
2927                    req->rq_xid == le64_to_cpu(med->med_mcd->mcd_last_close_xid)) {
2928                         LASSERTF(lustre_msg_get_flags(req->rq_reqmsg) &
2929                                  (MSG_RESENT | MSG_REPLAY),
2930                                  "rq_xid "LPU64" matches last_xid, "
2931                                  "expected RESENT flag\n",
2932                                  req->rq_xid);
2933                 }
2934                 /* else: note the opposite is not always true; a
2935                  * RESENT req after a failover will usually not match
2936                  * the last_xid, since it was likely never
2937                  * committed. A REPLAYed request will almost never
2938                  * match the last xid, however it could for a
2939                  * committed, but still retained, open. */
2940
2941                 spin_lock_bh(&obd->obd_processing_task_lock);
2942                 recovering = obd->obd_recovering;
2943                 spin_unlock_bh(&obd->obd_processing_task_lock);
2944                 if (recovering) {
2945                         rc = mds_filter_recovery_request(req, obd,
2946                                                          &should_process);
2947                         if (rc || should_process == 0) {
2948                                 RETURN(rc);
2949                         } else if (should_process < 0) {
2950                                 req->rq_status = should_process;
2951                                 rc = ptlrpc_error(req);
2952                                 RETURN(rc);
2953                         }
2954                 }
2955         }
2956
2957         switch (req->rq_reqmsg->opc) {
2958         case MDS_CONNECT:
2959                 DEBUG_REQ(D_INODE, req, "connect");
2960                 OBD_FAIL_RETURN(OBD_FAIL_MDS_CONNECT_NET, 0);
2961                 rc = target_handle_connect(req);
2962                 if (!rc) {
2963                         struct mds_export_data *med;
2964
2965                         LASSERT(req->rq_export);
2966                         med = &req->rq_export->u.eu_mds_data;
2967                         mds_init_export_data(req, med);
2968                         mds_req_add_idmapping(req, med);
2969
2970                         /* Now that we have an export, set mds. */
2971                         obd = req->rq_export->exp_obd;
2972                         mds = mds_req2mds(req);
2973                 }
2974                 break;
2975
2976         case MDS_DISCONNECT:
2977                 DEBUG_REQ(D_INODE, req, "disconnect");
2978                 OBD_FAIL_RETURN(OBD_FAIL_MDS_DISCONNECT_NET, 0);
2979                 rc = target_handle_disconnect(req);
2980                 req->rq_status = rc;            /* superfluous? */
2981                 break;
2982
2983         case MDS_GETSTATUS:
2984                 DEBUG_REQ(D_INODE, req, "getstatus");
2985                 OBD_FAIL_RETURN(OBD_FAIL_MDS_GETSTATUS_NET, 0);
2986                 rc = mds_getstatus(req);
2987                 break;
2988
2989         case MDS_GETATTR:
2990                 DEBUG_REQ(D_INODE, req, "getattr");
2991                 OBD_FAIL_RETURN(OBD_FAIL_MDS_GETATTR_NET, 0);
2992                 rc = mds_getattr(req, MDS_REQ_REC_OFF);
2993                 break;
2994
2995         case MDS_ACCESS_CHECK:
2996                 DEBUG_REQ(D_INODE, req, "access_check");
2997                 OBD_FAIL_RETURN(OBD_FAIL_MDS_ACCESS_CHECK_NET, 0);
2998                 rc = mds_access_check(req, MDS_REQ_REC_OFF);
2999                 break;
3000
3001         case MDS_GETATTR_LOCK: {
3002                 struct lustre_handle lockh;
3003                 DEBUG_REQ(D_INODE, req, "getattr_lock");
3004                 OBD_FAIL_RETURN(OBD_FAIL_MDS_GETATTR_LOCK_NET, 0);
3005
3006                 /* If this request gets a reconstructed reply, we won't be
3007                  * acquiring any new locks in mds_getattr_lock, so we don't
3008                  * want to cancel.
3009                  */
3010                 lockh.cookie = 0;
3011                 rc = mds_getattr_lock(req, MDS_REQ_REC_OFF, &lockh,
3012                                       MDS_INODELOCK_UPDATE);
3013                 /* this non-intent call (from an ioctl) is special */
3014                 req->rq_status = rc;
3015                 if (rc == 0 && lockh.cookie)
3016                         ldlm_lock_decref(&lockh, LCK_PR);
3017                 break;
3018         }
3019         case MDS_STATFS:
3020                 DEBUG_REQ(D_INODE, req, "statfs");
3021                 OBD_FAIL_RETURN(OBD_FAIL_MDS_STATFS_NET, 0);
3022                 rc = mds_statfs(req);
3023                 break;
3024
3025         case MDS_READPAGE:
3026                 DEBUG_REQ(D_INODE, req, "readpage");
3027                 OBD_FAIL_RETURN(OBD_FAIL_MDS_READPAGE_NET, 0);
3028                 rc = mds_readpage(req, MDS_REQ_REC_OFF);
3029
3030                 if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_SENDPAGE)) {
3031                         if (req->rq_reply_state) {
3032                                 lustre_free_reply_state (req->rq_reply_state);
3033                                 req->rq_reply_state = NULL;
3034                         }
3035                         RETURN(0);
3036                 }
3037
3038                 break;
3039         case MDS_REINT: {
3040                 __u32 *opcp = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_REC_OFF,
3041                                              sizeof (*opcp));
3042                 __u32  opc;
3043                 int size[3] = {sizeof(struct mds_body), mds->mds_max_mdsize,
3044                                mds->mds_max_cookiesize};
3045                 int bufcount;
3046
3047                 /* NB only peek inside req now; mds_reint() will swab it */
3048                 if (opcp == NULL) {
3049                         CERROR ("Can't inspect opcode\n");
3050                         rc = -EINVAL;
3051                         break;
3052                 }
3053                 opc = *opcp;
3054                 if (lustre_msg_swabbed (req->rq_reqmsg))
3055                         __swab32s(&opc);
3056
3057                 DEBUG_REQ(D_INODE, req, "reint %d (%s)", opc,
3058                           (opc < sizeof(reint_names) / sizeof(reint_names[0]) ||
3059                            reint_names[opc] == NULL) ? reint_names[opc] :
3060                                                        "unknown opcode");
3061
3062                 OBD_FAIL_RETURN(OBD_FAIL_MDS_REINT_NET, 0);
3063
3064                 if (opc == REINT_UNLINK || opc == REINT_RENAME)
3065                         bufcount = 3;
3066                 else if (opc == REINT_OPEN)
3067                         bufcount = 2;
3068                 else
3069                         bufcount = 1;
3070
3071                 /* for SETATTR: I have different reply setting for
3072                  * remote setfacl, so delay the reply buffer allocation.
3073                  */
3074                 if (opc != REINT_SETATTR) {
3075                         rc = lustre_pack_reply(req, bufcount, size, NULL);
3076                         if (rc)
3077                                 break;
3078                 }
3079
3080                 rc = mds_reint(req, MDS_REQ_REC_OFF, NULL);
3081                 fail = OBD_FAIL_MDS_REINT_NET_REP;
3082                 break;
3083         }
3084
3085         case MDS_CLOSE:
3086                 DEBUG_REQ(D_INODE, req, "close");
3087                 OBD_FAIL_RETURN(OBD_FAIL_MDS_CLOSE_NET, 0);
3088                 rc = mds_close(req, MDS_REQ_REC_OFF);
3089                 break;
3090
3091         case MDS_DONE_WRITING:
3092                 DEBUG_REQ(D_INODE, req, "done_writing");
3093                 OBD_FAIL_RETURN(OBD_FAIL_MDS_DONE_WRITING_NET, 0);
3094                 rc = mds_done_writing(req, MDS_REQ_REC_OFF);
3095                 break;
3096
3097         case MDS_PIN:
3098                 DEBUG_REQ(D_INODE, req, "pin");
3099                 OBD_FAIL_RETURN(OBD_FAIL_MDS_PIN_NET, 0);
3100                 rc = mds_pin(req, MDS_REQ_REC_OFF);
3101                 break;
3102
3103         case MDS_SYNC:
3104                 DEBUG_REQ(D_INODE, req, "sync");
3105                 OBD_FAIL_RETURN(OBD_FAIL_MDS_SYNC_NET, 0);
3106                 rc = mds_sync(req, MDS_REQ_REC_OFF);
3107                 break;
3108
3109         case OBD_PING:
3110                 DEBUG_REQ(D_INODE, req, "ping");
3111                 rc = target_handle_ping(req);
3112                 break;
3113
3114         case OBD_LOG_CANCEL:
3115                 CDEBUG(D_INODE, "log cancel\n");
3116                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_NET, 0);
3117                 rc = -ENOTSUPP; /* la la la */
3118                 break;
3119
3120         case LDLM_ENQUEUE:
3121                 DEBUG_REQ(D_INODE, req, "enqueue");
3122                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_ENQUEUE, 0);
3123                 rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast,
3124                                          ldlm_server_blocking_ast, NULL);
3125                 fail = OBD_FAIL_LDLM_REPLY;
3126                 break;
3127         case LDLM_CONVERT:
3128                 DEBUG_REQ(D_INODE, req, "convert");
3129                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_CONVERT, 0);
3130                 rc = ldlm_handle_convert(req);
3131                 break;
3132         case LDLM_BL_CALLBACK:
3133         case LDLM_CP_CALLBACK:
3134                 DEBUG_REQ(D_INODE, req, "callback");
3135                 CERROR("callbacks should not happen on MDS\n");
3136                 LBUG();
3137                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_BL_CALLBACK, 0);
3138                 break;
3139         case LLOG_ORIGIN_HANDLE_OPEN:
3140                 DEBUG_REQ(D_INODE, req, "llog_init");
3141                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
3142                 rc = llog_origin_handle_open(req);
3143                 break;
3144         case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
3145                 DEBUG_REQ(D_INODE, req, "llog next block");
3146                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
3147                 rc = llog_origin_handle_next_block(req);
3148                 break;
3149         case LLOG_ORIGIN_HANDLE_PREV_BLOCK:
3150                 DEBUG_REQ(D_INODE, req, "llog prev block");
3151                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
3152                 rc = llog_origin_handle_prev_block(req);
3153                 break;
3154         case LLOG_ORIGIN_HANDLE_READ_HEADER:
3155                 DEBUG_REQ(D_INODE, req, "llog read header");
3156                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
3157                 rc = llog_origin_handle_read_header(req);
3158                 break;
3159         case LLOG_ORIGIN_HANDLE_CLOSE:
3160                 DEBUG_REQ(D_INODE, req, "llog close");
3161                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
3162                 rc = llog_origin_handle_close(req);
3163                 break;
3164         case OST_CREATE:
3165                 DEBUG_REQ(D_INODE, req, "ost_create");
3166                 rc = mdt_obj_create(req);
3167                 break;
3168         case OST_GET_INFO:
3169                 DEBUG_REQ(D_INODE, req, "get_info");
3170                 rc = mdt_get_info(req);
3171                 break;
3172         case OST_SET_INFO:
3173                 DEBUG_REQ(D_INODE, req, "set_info");
3174                 rc = mdt_set_info(req);
3175                 break;
3176         case OST_WRITE:
3177                 CDEBUG(D_INODE, "write\n");
3178                 OBD_FAIL_RETURN(OBD_FAIL_OST_BRW_NET, 0);
3179                 rc = ost_brw_write(req, NULL);
3180                 LASSERT(current->journal_info == NULL);
3181                 /* mdt_brw sends its own replies */
3182                 RETURN(rc);
3183                 break;
3184         case LLOG_CATINFO:
3185                 DEBUG_REQ(D_INODE, req, "llog catinfo");
3186                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
3187                 rc = llog_catinfo(req);
3188                 break;
3189         default:
3190                 req->rq_status = -ENOTSUPP;
3191                 rc = ptlrpc_error(req);
3192                 RETURN(rc);
3193         }
3194
3195         LASSERT(current->journal_info == NULL);
3196
3197         EXIT;
3198
3199         /* If we're DISCONNECTing, the mds_export_data is already freed */
3200         if (!rc && req->rq_reqmsg->opc != MDS_DISCONNECT) {
3201                 struct mds_export_data *med = &req->rq_export->exp_mds_data;
3202                 struct obd_device *obd = list_entry(mds, struct obd_device,
3203                                                     u.mds);
3204                 req->rq_repmsg->last_xid =
3205                         le64_to_cpu(med->med_mcd->mcd_last_xid);
3206
3207                 if (!obd->obd_no_transno) {
3208                         req->rq_repmsg->last_committed =
3209                                 obd->obd_last_committed;
3210                 } else {
3211                         DEBUG_REQ(D_IOCTL, req,
3212                                   "not sending last_committed update");
3213                 }
3214                 CDEBUG(D_INFO, "last_transno "LPU64", last_committed "LPU64
3215                        ", xid "LPU64"\n",
3216                        mds->mds_last_transno, obd->obd_last_committed,
3217                        req->rq_xid);
3218         }
3219  out:
3220
3221
3222         target_send_reply(req, rc, fail);
3223         return 0;
3224 }
3225
3226 /* Update the server data on disk.  This stores the new mount_count and also the
3227  * last_rcvd value to disk.  If we don't have a clean shutdown, then the server
3228  * last_rcvd value may be less than that of the clients.  This will alert us
3229  * that we may need to do client recovery.
3230  *
3231  * Also assumes for mds_last_transno that we are not modifying it (no locking).
3232  */
3233 int mds_update_server_data(struct obd_device *obd, int force_sync)
3234 {
3235         struct mds_obd *mds = &obd->u.mds;
3236         struct mds_server_data *msd = mds->mds_server_data;
3237         struct file *filp = mds->mds_rcvd_filp;
3238         struct lvfs_run_ctxt saved;
3239         loff_t off = 0;
3240         int rc;
3241         ENTRY;
3242
3243         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
3244         msd->msd_last_transno = cpu_to_le64(mds->mds_last_transno);
3245
3246         CDEBUG(D_SUPER, "MDS mount_count is "LPU64", last_transno is "LPU64"\n",
3247                mds->mds_mount_count, mds->mds_last_transno);
3248         rc = fsfilt_write_record(obd, filp, msd, sizeof(*msd), &off, force_sync);
3249         if (rc)
3250                 CERROR("error writing MDS server data: rc = %d\n", rc);
3251         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
3252
3253         RETURN(rc);
3254 }
3255
3256 /* saves last allocated fid counter to file. */
3257 int mds_update_last_fid(struct obd_device *obd, void *handle,
3258                         int force_sync)
3259 {
3260         struct mds_obd *mds = &obd->u.mds;
3261         struct file *filp = mds->mds_fid_filp;
3262         struct lvfs_run_ctxt saved;
3263         loff_t off = 0;
3264         __u64 last_fid;
3265         int rc = 0;
3266         ENTRY;
3267
3268         spin_lock(&mds->mds_last_fid_lock);
3269         last_fid = mds->mds_last_fid;
3270         spin_unlock(&mds->mds_last_fid_lock);
3271
3272         CDEBUG(D_SUPER, "MDS last_fid is #"LPU64"\n",
3273                last_fid);
3274
3275         if (handle) {
3276                 fsfilt_add_journal_cb(obd, mds->mds_sb, last_fid,
3277                                       handle, mds_commit_last_fid_cb,
3278                                       NULL);
3279         }
3280                 
3281         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
3282         rc = fsfilt_write_record(obd, filp, &last_fid, sizeof(last_fid),
3283                                  &off, force_sync);
3284         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
3285
3286         if (rc) {
3287                 CERROR("error writing MDS last_fid #"LPU64
3288                        ", err = %d\n", last_fid, rc);
3289                 RETURN(rc);
3290         }
3291                 
3292         CDEBUG(D_SUPER, "wrote fid #"LPU64" at idx "
3293                "%llu: err = %d\n", last_fid, off, rc);
3294
3295         RETURN(rc);
3296 }
3297
3298 void mds_set_last_fid(struct obd_device *obd, __u64 fid)
3299 {
3300         struct mds_obd *mds = &obd->u.mds;
3301
3302         spin_lock(&mds->mds_last_fid_lock);
3303         if (fid > mds->mds_last_fid)
3304                 mds->mds_last_fid = fid;
3305         spin_unlock(&mds->mds_last_fid_lock);
3306 }
3307
3308 void mds_commit_last_transno_cb(struct obd_device *obd,
3309                                 __u64 transno, void *data,
3310                                 int error)
3311 {
3312         obd_transno_commit_cb(obd, transno, error);
3313 }
3314
3315 void mds_commit_last_fid_cb(struct obd_device *obd,
3316                             __u64 fid, void *data,
3317                             int error)
3318 {
3319         if (error) {
3320                 CERROR("%s: fid "LPD64" commit error: %d\n",
3321                        obd->obd_name, fid, error);
3322                 return;
3323         }
3324         
3325         CDEBUG(D_HA, "%s: fid "LPD64" committed\n",
3326                obd->obd_name, fid);
3327 }
3328
3329 __u64 mds_alloc_fid(struct obd_device *obd)
3330 {
3331         struct mds_obd *mds = &obd->u.mds;
3332         __u64 fid;
3333         
3334         spin_lock(&mds->mds_last_fid_lock);
3335         fid = ++mds->mds_last_fid;
3336         spin_unlock(&mds->mds_last_fid_lock);
3337
3338         return fid;
3339 }
3340 /*
3341  * update new lustre_id on passed @inode and saves it to inode EA.
3342  */
3343 int mds_set_inode_sid(struct obd_device *obd, struct inode *inode,
3344                       void *handle, struct lustre_id *id, __u64 fid)
3345 {
3346         struct mds_obd *mds = &obd->u.mds;
3347         int alloc = 0, rc = 0;
3348         ENTRY;
3349
3350         LASSERT(obd != NULL);
3351         LASSERT(inode != NULL);
3352
3353         if (id == NULL) {
3354                 OBD_ALLOC(id, sizeof(*id));
3355                 if (id == NULL)
3356                         RETURN(-ENOMEM);
3357                 alloc = 1;
3358         }
3359         id_group(id) = mds->mds_num;
3360         id_fid(id) = fid;
3361         id_ino(id) = inode->i_ino;
3362         id_gen(id) = inode->i_generation;
3363         id_type(id) = (S_IFMT & inode->i_mode);
3364
3365         rc = mds_update_inode_sid(obd, inode, handle, id);
3366         if (rc) {
3367                 CERROR("Can't update inode FID EA, "
3368                        "rc = %d\n", rc);
3369         }
3370
3371         if (alloc)
3372                 OBD_FREE(id, sizeof(*id));
3373         RETURN(rc);
3374 }
3375
3376 /*
3377  * reads inode self id from inode EA. Probably later this should be replaced by
3378  * caching inode self id to avoid raeding it every time it is needed.
3379  */
3380 int mds_read_inode_sid(struct obd_device *obd, struct inode *inode,
3381                        struct lustre_id *id)
3382 {
3383         int rc;
3384         ENTRY;
3385
3386         LASSERT(id != NULL);
3387         LASSERT(obd != NULL);
3388         LASSERT(inode != NULL);
3389
3390         rc = fsfilt_get_md(obd, inode, &id->li_fid,
3391                            sizeof(id->li_fid), EA_SID);
3392         if (rc < 0) {
3393                 CERROR("fsfilt_get_md() failed, "
3394                        "rc = %d\n", rc);
3395                 RETURN(rc);
3396         } else if (!rc) {
3397                 rc = -ENODATA;
3398                 RETURN(rc);
3399         } else {
3400                 rc = 0;
3401         }
3402
3403         RETURN(rc);
3404 }
3405
3406 /* updates inode self id in EA. */
3407 int mds_update_inode_sid(struct obd_device *obd, struct inode *inode,
3408                          void *handle, struct lustre_id *id)
3409 {
3410         int rc = 0;
3411         ENTRY;
3412
3413         LASSERT(id != NULL);
3414         LASSERT(obd != NULL);
3415         LASSERT(inode != NULL);
3416         
3417         rc = fsfilt_set_md(obd, inode, handle, &id->li_fid,
3418                            sizeof(id->li_fid), EA_SID);
3419         if (rc) {
3420                 CERROR("fsfilt_set_md() failed, rc = %d\n", rc);
3421                 RETURN(rc);
3422         }
3423
3424         RETURN(rc);
3425 }
3426
3427 /* 
3428  * reads inode id on master MDS. This is usualy done by CMOBD to update requests
3429  * to master MDS by correct store cookie, needed to find inode on master MDS
3430  * quickly.
3431  */
3432 int mds_read_inode_mid(struct obd_device *obd, struct inode *inode,
3433                        struct lustre_id *id)
3434 {
3435         int rc;
3436         ENTRY;
3437
3438         LASSERT(id != NULL);
3439         LASSERT(obd != NULL);
3440         LASSERT(inode != NULL);
3441
3442         rc = fsfilt_get_md(obd, inode, id, sizeof(*id), EA_MID);
3443         if (rc < 0) {
3444                 CERROR("fsfilt_get_md() failed, rc = %d\n", rc);
3445                 RETURN(rc);
3446         } else if (!rc) {
3447                 rc = -ENODATA;
3448                 RETURN(rc);
3449         } else {
3450                 rc = 0;
3451         }
3452
3453         RETURN(rc);
3454 }
3455
3456 /*
3457  * updates master inode id. Usualy this is done by CMOBD after an inode is
3458  * created and relationship between cache MDS and master one should be
3459  * established.
3460  */
3461 int mds_update_inode_mid(struct obd_device *obd, struct inode *inode,
3462                          void *handle, struct lustre_id *id)
3463 {
3464         int rc = 0;
3465         ENTRY;
3466
3467         LASSERT(id != NULL);
3468         LASSERT(obd != NULL);
3469         LASSERT(inode != NULL);
3470         
3471         rc = fsfilt_set_md(obd, inode, handle, id,
3472                            sizeof(*id), EA_MID);
3473         if (rc) {
3474                 CERROR("fsfilt_set_md() failed, "
3475                        "rc = %d\n", rc);
3476                 RETURN(rc);
3477         }
3478
3479         RETURN(rc);
3480 }
3481
3482 /* mount the file system (secretly) */
3483 static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
3484 {
3485         struct lustre_cfg* lcfg = buf;
3486         struct mds_obd *mds = &obd->u.mds;
3487         struct lvfs_obd_ctxt *lvfs_ctxt = NULL;
3488         char *options = NULL;
3489         struct vfsmount *mnt;
3490         char ns_name[48];
3491         unsigned long page;
3492         int rc = 0;
3493         ENTRY;
3494
3495         if (lcfg->lcfg_bufcount < 3)
3496                 RETURN(rc = -EINVAL);
3497
3498         if (LUSTRE_CFG_BUFLEN(lcfg, 1) == 0 || LUSTRE_CFG_BUFLEN(lcfg, 2) == 0)
3499                 RETURN(rc = -EINVAL);
3500
3501         obd->obd_fsops = fsfilt_get_ops(lustre_cfg_string(lcfg, 2));
3502         if (IS_ERR(obd->obd_fsops))
3503                 RETURN(rc = PTR_ERR(obd->obd_fsops));
3504
3505         mds->mds_max_mdsize = sizeof(struct lov_mds_md);
3506
3507         page = get_zeroed_page(GFP_KERNEL);
3508         if (!page)
3509                 RETURN(-ENOMEM);
3510
3511         options = (char *)page;
3512
3513         /*
3514          * here we use "iopen_nopriv" hardcoded, because it affects MDS utility
3515          * and the rest of options are passed by mount options. Probably this
3516          * should be moved to somewhere else like startup scripts or lconf. */
3517         sprintf(options, "iopen_nopriv");
3518         
3519         if (LUSTRE_CFG_BUFLEN(lcfg, 4) > 0 && lustre_cfg_buf(lcfg, 4))
3520                 sprintf(options + strlen(options), ",%s",
3521                         lustre_cfg_string(lcfg, 4));
3522
3523         /* we have to know mdsnum before touching underlying fs -bzzz */
3524         atomic_set(&mds->mds_open_count, 0);
3525         sema_init(&mds->mds_md_sem, 1);
3526         sema_init(&mds->mds_create_sem, 1);
3527         mds->mds_md_connected = 0;
3528         mds->mds_md_name = NULL;
3529
3530         if (LUSTRE_CFG_BUFLEN(lcfg, 5) > 0 && lustre_cfg_buf(lcfg, 5) &&
3531             strncmp(lustre_cfg_string(lcfg, 5), "dumb", LUSTRE_CFG_BUFLEN(lcfg, 5))) {
3532                 class_uuid_t uuid;
3533
3534                 generate_random_uuid(uuid);
3535                 class_uuid_unparse(uuid, &mds->mds_md_uuid);
3536
3537                 OBD_ALLOC(mds->mds_md_name, LUSTRE_CFG_BUFLEN(lcfg, 5));
3538                 if (mds->mds_md_name == NULL) 
3539                         RETURN(rc = -ENOMEM);
3540
3541                 memcpy(mds->mds_md_name, lustre_cfg_buf(lcfg, 5),
3542                        LUSTRE_CFG_BUFLEN(lcfg, 5));
3543                 
3544                 CDEBUG(D_OTHER, "MDS: %s is master for %s\n",
3545                        obd->obd_name, mds->mds_md_name);
3546
3547                 rc = mds_md_connect(obd, mds->mds_md_name);
3548                 if (rc) {
3549                         OBD_FREE(mds->mds_md_name, LUSTRE_CFG_BUFLEN(lcfg, 5));
3550                         GOTO(err_ops, rc);
3551                 }
3552         }
3553
3554         mds->mds_obd_type = MDS_MASTER_OBD;
3555
3556         if (LUSTRE_CFG_BUFLEN(lcfg, 6) > 0 && lustre_cfg_buf(lcfg, 6) &&
3557             strncmp(lustre_cfg_string(lcfg, 6), "dumb", 
3558                     LUSTRE_CFG_BUFLEN(lcfg, 6))) {
3559                 if (!memcmp(lustre_cfg_string(lcfg, 6), "master", 
3560                             strlen("master"))) {
3561                         mds->mds_obd_type = MDS_MASTER_OBD;
3562                 } else if (!memcmp(lustre_cfg_string(lcfg, 6), "cache", 
3563                                    strlen("cache"))) {
3564                         mds->mds_obd_type = MDS_CACHE_OBD;
3565                 }     
3566         }
3567
3568         rc = lvfs_mount_fs(lustre_cfg_string(lcfg, 1), 
3569                            lustre_cfg_string(lcfg, 2),
3570                            options, 0, &lvfs_ctxt);
3571
3572         free_page(page);
3573
3574         if (rc || !lvfs_ctxt) {
3575                 CERROR("lvfs_mount_fs failed: rc = %d\n", rc);
3576                 GOTO(err_ops, rc);
3577         }
3578
3579         mnt = lvfs_ctxt->loc_mnt;
3580         mds->mds_lvfs_ctxt = lvfs_ctxt;
3581         ll_clear_rdonly(ll_sbdev(mnt->mnt_sb));
3582
3583         CDEBUG(D_SUPER, "%s: mnt = %p\n", lustre_cfg_string(lcfg, 1), mnt);
3584
3585         sema_init(&mds->mds_epoch_sem, 1);
3586         atomic_set(&mds->mds_real_clients, 0);
3587         spin_lock_init(&mds->mds_transno_lock);
3588         spin_lock_init(&mds->mds_last_fid_lock);
3589         sema_init(&mds->mds_orphan_recovery_sem, 1);
3590         mds->mds_max_cookiesize = sizeof(struct llog_cookie);
3591
3592         sprintf(ns_name, "mds-%s", obd->obd_uuid.uuid);
3593         obd->obd_namespace = ldlm_namespace_new(ns_name, LDLM_NAMESPACE_SERVER);
3594
3595         if (obd->obd_namespace == NULL) {
3596                 mds_cleanup(obd, 0);
3597                 GOTO(err_put, rc = -ENOMEM);
3598         }
3599         ldlm_register_intent(obd->obd_namespace, mds_intent_policy);
3600
3601         rc = mds_fs_setup(obd, mnt);
3602         if (rc) {
3603                 CERROR("%s: MDS filesystem method init failed: rc = %d\n",
3604                        obd->obd_name, rc);
3605                 GOTO(err_ns, rc);
3606         }
3607
3608         rc = llog_start_commit_thread();
3609         if (rc < 0)
3610
3611                 GOTO(err_fs, rc);
3612
3613
3614         if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0 && lustre_cfg_buf(lcfg, 3) &&
3615             strncmp(lustre_cfg_string(lcfg, 3), "dumb", 
3616                     LUSTRE_CFG_BUFLEN(lcfg, 3))) {
3617                 class_uuid_t uuid;
3618
3619                 generate_random_uuid(uuid);
3620                 class_uuid_unparse(uuid, &mds->mds_dt_uuid);
3621
3622                 OBD_ALLOC(mds->mds_profile, LUSTRE_CFG_BUFLEN(lcfg, 3));
3623                 if (mds->mds_profile == NULL)
3624                         GOTO(err_fs, rc = -ENOMEM);
3625
3626                 strncpy(mds->mds_profile, lustre_cfg_string(lcfg, 3),
3627                         LUSTRE_CFG_BUFLEN(lcfg, 3));
3628         }
3629
3630         /* 
3631          * setup root dir and files ID dir if lmv already connected, or there is
3632          * not lmv at all.
3633          */
3634         if (mds->mds_md_exp || (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0 && 
3635                                 lustre_cfg_buf(lcfg, 3) &&
3636                                 strncmp(lustre_cfg_string(lcfg, 3), "dumb", 
3637                                         LUSTRE_CFG_BUFLEN(lcfg, 3)))) {
3638                 rc = mds_fs_setup_rootid(obd);
3639                 if (rc)
3640                         GOTO(err_fs, rc);
3641
3642                 rc = mds_fs_setup_virtid(obd);
3643                 if (rc)
3644                         GOTO(err_fs, rc);
3645         }
3646
3647         ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
3648                            "mds_ldlm_client", &obd->obd_ldlm_client);
3649         obd->obd_replayable = 1;
3650         
3651         mds->mds_crypto_type = NO_CRYPTO;
3652         
3653         rc = mds_postsetup(obd);
3654         if (rc)
3655                 GOTO(err_fs, rc);
3656
3657         RETURN(0);
3658
3659 err_fs:
3660         /* No extra cleanup needed for llog_init_commit_thread() */
3661         mds_fs_cleanup(obd, 0);
3662 err_ns:
3663         ldlm_namespace_free(obd->obd_namespace, 0);
3664         obd->obd_namespace = NULL;
3665 err_put:
3666         unlock_kernel();
3667         lvfs_umount_fs(mds->mds_lvfs_ctxt);
3668         mds->mds_sb = 0;
3669         lock_kernel();
3670 err_ops:
3671         fsfilt_put_ops(obd->obd_fsops);
3672         return rc;
3673 }
3674
3675 static int mds_fs_post_setup(struct obd_device *obd)
3676 {
3677         struct mds_obd *mds = &obd->u.mds;
3678         struct dentry *dentry;
3679         int rc = 0;
3680         ENTRY;
3681        
3682         dentry = mds_id2dentry(obd, &mds->mds_rootid, NULL);
3683         if (IS_ERR(dentry)) {
3684                 CERROR("Can't find ROOT, err = %d\n",
3685                        (int)PTR_ERR(dentry));
3686                 RETURN(PTR_ERR(dentry));
3687         }
3688         
3689         rc = fsfilt_post_setup(obd, dentry);
3690
3691         l_dput(dentry);
3692         RETURN(rc); 
3693 }
3694
3695 static int mds_postsetup(struct obd_device *obd)
3696 {
3697         struct mds_obd *mds = &obd->u.mds;
3698         int rc = 0;
3699         ENTRY;
3700
3701         rc = obd_llog_setup(obd, &obd->obd_llogs, LLOG_CONFIG_ORIG_CTXT, 
3702                             obd, 0, NULL, &llog_lvfs_ops);
3703         if (rc)
3704                 RETURN(rc);
3705
3706         if (mds->mds_profile) {
3707                 struct llog_ctxt *lgctxt;
3708                 struct lvfs_run_ctxt saved;
3709                 struct lustre_profile *lprof;
3710                 struct config_llog_instance cfg;
3711
3712                 cfg.cfg_instance = NULL;
3713                 cfg.cfg_uuid = mds->mds_dt_uuid;
3714                 push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
3715
3716                 lgctxt = llog_get_context(&obd->obd_llogs, LLOG_CONFIG_ORIG_CTXT);
3717                 if (!lgctxt) {
3718                         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
3719                         GOTO(err_llog, rc = -EINVAL);
3720                 }
3721                 
3722                 rc = class_config_process_llog(lgctxt, mds->mds_profile, &cfg);
3723                 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
3724
3725                 if (rc)
3726                         GOTO(err_llog, rc);
3727
3728                 lprof = class_get_profile(mds->mds_profile);
3729                 if (lprof == NULL) {
3730                         CERROR("No profile found: %s\n", mds->mds_profile);
3731                         GOTO(err_cleanup, rc = -ENOENT);
3732                 }
3733                 rc = mds_dt_connect(obd, lprof->lp_lov);
3734                 if (rc)
3735                         GOTO(err_cleanup, rc);
3736
3737                 rc = mds_md_postsetup(obd);
3738                 if (rc)
3739                         GOTO(err_cleanup, rc);
3740         }
3741         rc = mds_fs_post_setup(obd);
3742         if (rc)
3743                 CERROR("can not post setup fsfilt\n");        
3744         RETURN(rc);
3745 err_cleanup:
3746         mds_dt_clean(obd);
3747 err_llog:
3748         obd_llog_cleanup(llog_get_context(&obd->obd_llogs,
3749                                           LLOG_CONFIG_ORIG_CTXT));
3750         return rc;
3751 }
3752
3753 int mds_postrecov_common(struct obd_device *obd)
3754 {
3755         struct mds_obd *mds = &obd->u.mds;
3756         struct llog_ctxt *ctxt;
3757         int rc, item = 0, valsize;
3758          __u32 group;
3759         ENTRY;
3760
3761         LASSERT(!obd->obd_recovering);
3762         ctxt = llog_get_context(&obd->obd_llogs, LLOG_UNLINK_ORIG_CTXT);
3763         LASSERT(ctxt != NULL);
3764
3765         /* clean PENDING dir */
3766         rc = mds_cleanup_orphans(obd);
3767         if (rc < 0)
3768                 GOTO(out, rc);
3769         item = rc;
3770
3771         group = FILTER_GROUP_FIRST_MDS + mds->mds_num;
3772         valsize = sizeof(group);
3773         rc = obd_set_info(mds->mds_dt_exp, strlen("mds_conn"),
3774                           "mds_conn", valsize, &group);
3775         if (rc)
3776                 GOTO(out, rc);
3777
3778         rc = llog_connect(ctxt, obd->u.mds.mds_dt_desc.ld_tgt_count,
3779                           NULL, NULL, NULL);
3780         if (rc) {
3781                 CERROR("%s: failed at llog_origin_connect: %d\n", 
3782                        obd->obd_name, rc);
3783                 GOTO(out, rc);
3784         }
3785
3786         /* remove the orphaned precreated objects */
3787         rc = mds_dt_clear_orphans(mds, NULL /* all OSTs */);
3788         if (rc)
3789                 GOTO(err_llog, rc);
3790
3791 out:
3792         RETURN(rc < 0 ? rc : item);
3793
3794 err_llog:
3795         /* cleanup all llogging subsystems */
3796         rc = obd_llog_finish(obd, &obd->obd_llogs,
3797                              mds->mds_dt_desc.ld_tgt_count);
3798         if (rc)
3799                 CERROR("%s: failed to cleanup llogging subsystems\n",
3800                         obd->obd_name);
3801         goto out;
3802 }
3803
3804 int mds_postrecov(struct obd_device *obd)
3805 {
3806         int rc;
3807         ENTRY;
3808         rc = mds_postrecov_common(obd);
3809         if (rc == 0)
3810                 rc = mds_md_reconnect(obd);
3811         RETURN(rc);
3812 }
3813
3814 int mds_dt_clean(struct obd_device *obd)
3815 {
3816         struct mds_obd *mds = &obd->u.mds;
3817         ENTRY;
3818
3819         if (mds->mds_profile) {
3820                 char * cln_prof;
3821                 struct llog_ctxt *llctx;
3822                 struct lvfs_run_ctxt saved;
3823                 struct config_llog_instance cfg;
3824                 int len = strlen(mds->mds_profile) + sizeof("-clean") + 1;
3825
3826                 OBD_ALLOC(cln_prof, len);
3827                 sprintf(cln_prof, "%s-clean", mds->mds_profile);
3828
3829                 cfg.cfg_instance = NULL;
3830                 cfg.cfg_uuid = mds->mds_dt_uuid;
3831
3832                 push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
3833                 llctx = llog_get_context(&obd->obd_llogs,
3834                                          LLOG_CONFIG_ORIG_CTXT);
3835                 class_config_process_llog(llctx, cln_prof, &cfg);
3836                 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
3837
3838                 OBD_FREE(cln_prof, len);
3839                 OBD_FREE(mds->mds_profile, strlen(mds->mds_profile) + 1);
3840                 mds->mds_profile = NULL;
3841         }
3842         RETURN(0);
3843 }
3844
3845 int mds_md_clean(struct obd_device *obd)
3846 {
3847         struct mds_obd *mds = &obd->u.mds;
3848         ENTRY;
3849
3850         if (mds->mds_md_name) {
3851                 OBD_FREE(mds->mds_md_name, strlen(mds->mds_md_name) + 1);
3852                 mds->mds_md_name = NULL;
3853         }
3854         RETURN(0);
3855 }
3856
3857 static int mds_precleanup(struct obd_device *obd, int flags)
3858 {
3859         int rc = 0;
3860         ENTRY;
3861
3862         mds_md_clean(obd);
3863         mds_dt_disconnect(obd, flags);
3864         mds_dt_clean(obd);
3865         obd_llog_cleanup(llog_get_context(&obd->obd_llogs, LLOG_CONFIG_ORIG_CTXT));
3866         RETURN(rc);
3867 }
3868
3869 extern void lgss_svc_cache_purge_all(void);
3870 static int mds_cleanup(struct obd_device *obd, int flags)
3871 {
3872         struct mds_obd *mds = &obd->u.mds;
3873         ENTRY;
3874
3875         if (mds->mds_sb == NULL)
3876                 RETURN(0);
3877
3878         mds_update_server_data(obd, 1);
3879         mds_update_last_fid(obd, NULL, 1);
3880         
3881         if (mds->mds_dt_objids != NULL) {
3882                 int size = mds->mds_dt_desc.ld_tgt_count *
3883                         sizeof(obd_id);
3884                 OBD_FREE(mds->mds_dt_objids, size);
3885         }
3886         mds_fs_cleanup(obd, flags);
3887
3888         unlock_kernel();
3889
3890         /* 2 seems normal on mds, (may_umount() also expects 2
3891           fwiw), but we only see 1 at this point in obdfilter. */
3892         lvfs_umount_fs(mds->mds_lvfs_ctxt);
3893
3894         mds->mds_sb = 0;
3895
3896         ldlm_namespace_free(obd->obd_namespace, flags & OBD_OPT_FORCE);
3897
3898         spin_lock_bh(&obd->obd_processing_task_lock);
3899         if (obd->obd_recovering) {
3900                 target_cancel_recovery_timer(obd);
3901                 obd->obd_recovering = 0;
3902         }
3903         spin_unlock_bh(&obd->obd_processing_task_lock);
3904
3905         lock_kernel();
3906         fsfilt_put_ops(obd->obd_fsops);
3907
3908 #ifdef ENABLE_GSS
3909         /* XXX */
3910         lgss_svc_cache_purge_all();
3911 #endif
3912
3913         spin_lock(&mds->mds_denylist_lock);
3914         while (!list_empty( &mds->mds_denylist ) ) {
3915                 deny_sec_t *p_deny_sec = list_entry(mds->mds_denylist.next,
3916                                                     deny_sec_t, list);
3917                 list_del(&p_deny_sec->list);
3918                 OBD_FREE(p_deny_sec, sizeof(*p_deny_sec));
3919         }
3920         spin_unlock(&mds->mds_denylist_lock);
3921
3922         RETURN(0);
3923 }
3924
/*
 * Map a security flavor name onto its canonical static string.
 *
 * \param value  flavor name to look up: "null", "krb5i" or "krb5p"
 * \param sec    on success, set to point at the matching static string
 *
 * \retval 0        on success
 * \retval -EINVAL  if @value is not a recognized flavor (*sec is untouched)
 */
static int set_security(const char *value, char **sec)
{
        static char *const flavors[] = { "null", "krb5i", "krb5p" };
        unsigned int i;

        for (i = 0; i < sizeof(flavors) / sizeof(flavors[0]); i++) {
                if (strcmp(value, flavors[i]) == 0) {
                        *sec = flavors[i];
                        return 0;
                }
        }

        CERROR("Unrecognized security flavor %s\n", value);
        return -EINVAL;
}
3940
3941 static int mds_process_config(struct obd_device *obd, obd_count len, void *buf)
3942 {
3943         struct lustre_cfg *lcfg = buf;
3944         struct mds_obd *mds = &obd->u.mds;
3945         int rc = 0;
3946         ENTRY;
3947
3948         switch(lcfg->lcfg_command) {
3949         case LCFG_SET_SECURITY: {
3950                 if ((LUSTRE_CFG_BUFLEN(lcfg, 1) == 0) ||
3951                     (LUSTRE_CFG_BUFLEN(lcfg, 2) == 0))
3952                         GOTO(out, rc = -EINVAL);
3953
3954                 if (!strcmp(lustre_cfg_string(lcfg, 1), "mds_sec"))
3955                         rc = set_security(lustre_cfg_string(lcfg, 2),
3956                                           &mds->mds_mds_sec);
3957                 else if (!strcmp(lustre_cfg_string(lcfg, 1), "oss_sec"))
3958                         rc = set_security(lustre_cfg_string(lcfg, 2),
3959                                           &mds->mds_ost_sec);
3960                 else if (!strcmp(lustre_cfg_string(lcfg, 1), "deny_sec")){
3961                         spin_lock(&mds->mds_denylist_lock);
3962                         rc = add_deny_security(lustre_cfg_string(lcfg, 2),
3963                                                &mds->mds_denylist);
3964                         spin_unlock(&mds->mds_denylist_lock);
3965                 } else {
3966                         CERROR("Unrecognized key\n");
3967                         rc = -EINVAL;
3968                 }
3969                 break;
3970         }
3971         default:
3972                 CERROR("Unknown command: %d\n", lcfg->lcfg_command);
3973                 GOTO(out, rc = -EINVAL);
3974         }
3975 out:
3976         RETURN(rc);
3977 }
3978
3979 static void fixup_handle_for_resent_req(struct ptlrpc_request *req,
3980                                         int offset,
3981                                         struct ldlm_lock *new_lock,
3982                                         struct ldlm_lock **old_lock,
3983                                         struct lustre_handle *lockh)
3984 {
3985         struct obd_export *exp = req->rq_export;
3986         struct obd_device *obd = exp->exp_obd;
3987         struct ldlm_request *dlmreq =
3988                 lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*dlmreq));
3989         struct lustre_handle remote_hdl = dlmreq->lock_handle1;
3990         struct list_head *iter;
3991
3992         if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))
3993                 return;
3994
3995         spin_lock(&obd->obd_namespace->ns_hash_lock);
3996         list_for_each(iter, &exp->exp_ldlm_data.led_held_locks) {
3997                 struct ldlm_lock *lock;
3998                 lock = list_entry(iter, struct ldlm_lock, l_export_chain);
3999                 if (lock == new_lock)
4000                         continue;
4001                 if (lock->l_remote_handle.cookie == remote_hdl.cookie) {
4002                         lockh->cookie = lock->l_handle.h_cookie;
4003                         LDLM_DEBUG(lock, "restoring lock cookie");
4004                         DEBUG_REQ(D_HA, req, "restoring lock cookie "LPX64,
4005                                   lockh->cookie);
4006                         if (old_lock)
4007                                 *old_lock = LDLM_LOCK_GET(lock);
4008                         spin_unlock(&obd->obd_namespace->ns_hash_lock);
4009                         return;
4010                 }
4011         }
4012         spin_unlock(&obd->obd_namespace->ns_hash_lock);
4013
4014         /* If the xid matches, then we know this is a resent request,
4015          * and allow it. (It's probably an OPEN, for which we don't
4016          * send a lock */
4017         if (req->rq_xid == 
4018             le64_to_cpu(exp->exp_mds_data.med_mcd->mcd_last_xid))
4019                 return;
4020
4021         if (req->rq_xid == 
4022             le64_to_cpu(exp->exp_mds_data.med_mcd->mcd_last_close_xid))
4023                 return;
4024
4025         /* This remote handle isn't enqueued, so we never received or
4026          * processed this request.  Clear MSG_RESENT, because it can
4027          * be handled like any normal request now. */
4028
4029         lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT);
4030
4031         DEBUG_REQ(D_HA, req, "no existing lock with rhandle "LPX64,
4032                   remote_hdl.cookie);
4033 }
4034
4035 int intent_disposition(struct ldlm_reply *rep, int flag)
4036 {
4037         if (!rep)
4038                 return 0;
4039         return (rep->lock_policy_res1 & flag);
4040 }
4041
4042 void intent_set_disposition(struct ldlm_reply *rep, int flag)
4043 {
4044         if (!rep)
4045                 return;
4046         rep->lock_policy_res1 |= flag;
4047 }
4048
4049 static int mds_intent_prepare_reply_buffers(struct ptlrpc_request *req, 
4050                                             struct ldlm_intent *it)
4051 {
4052         struct mds_obd *mds = &req->rq_export->exp_obd->u.mds;
4053         int rc, reply_buffers;
4054         int repsize[5] = {sizeof(struct ldlm_reply),
4055                           sizeof(struct mds_body),
4056                           mds->mds_max_mdsize};
4057         ENTRY;       
4058  
4059         reply_buffers = 3;
4060         if (it->opc & ( IT_OPEN | IT_GETATTR | IT_LOOKUP | IT_CHDIR )) {
4061                 if (req->rq_export->exp_mds_data.med_remote) {
4062                         repsize[reply_buffers++] = 
4063                                 sizeof(struct mds_remote_perm);
4064                 } else {
4065                         repsize[reply_buffers++] = sizeof(int);
4066                         repsize[reply_buffers++] = 
4067                                 xattr_acl_size(LL_ACL_MAX_ENTRIES);
4068                 }
4069                 /*FIXME: ugly here, should be optimize for there 
4070                  * is no crypto key*/
4071                 repsize[reply_buffers++] = sizeof(int);
4072                 repsize[reply_buffers++] = sizeof(struct crypto_key); 
4073         }
4074
4075         rc = lustre_pack_reply(req, reply_buffers, repsize, NULL);
4076
4077         RETURN(rc);
4078 }
4079
4080 static int mds_intent_policy(struct ldlm_namespace *ns,
4081                              struct ldlm_lock **lockp, void *req_cookie,
4082                              ldlm_mode_t mode, int flags, void *data)
4083 {
4084         struct ptlrpc_request *req = req_cookie;
4085         struct ldlm_lock *lock = *lockp;
4086         struct ldlm_intent *it;
4087         struct ldlm_reply *rep;
4088         struct lustre_handle lockh[2] = {{0}, {0}};
4089         struct ldlm_lock *new_lock = NULL;
4090         int getattr_part = MDS_INODELOCK_UPDATE;
4091         int rc;
4092
4093         int offset = MDS_REQ_INTENT_REC_OFF; 
4094         ENTRY;
4095
4096         LASSERT(req != NULL);
4097         MD_COUNTER_INCREMENT(req->rq_export->exp_obd, intent_lock);
4098
4099         if (req->rq_reqmsg->bufcount <= MDS_REQ_INTENT_IT_OFF) {
4100                 /* No intent was provided */
4101                 int size = sizeof(struct ldlm_reply);
4102                 rc = lustre_pack_reply(req, 1, &size, NULL);
4103                 LASSERT(rc == 0);
4104                 RETURN(0);
4105         }
4106
4107         it = lustre_swab_reqbuf(req, MDS_REQ_INTENT_IT_OFF, sizeof(*it),
4108                                 lustre_swab_ldlm_intent);
4109         if (it == NULL) {
4110                 CERROR("Intent missing\n");
4111                 RETURN(req->rq_status = -EFAULT);
4112         }
4113
4114         LDLM_DEBUG(lock, "intent policy, opc: %s", ldlm_it2str(it->opc));
4115
4116         rc = mds_intent_prepare_reply_buffers(req, it);
4117
4118         if (rc)
4119                 RETURN(req->rq_status = rc);
4120
4121         rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*rep));
4122         LASSERT(rep != NULL);
4123
4124         intent_set_disposition(rep, DISP_IT_EXECD);
4125
4126         /* execute policy */
4127         switch ((long)it->opc) {
4128         case IT_OPEN:
4129         case IT_CREAT|IT_OPEN:
4130                 fixup_handle_for_resent_req(req, MDS_REQ_INTENT_LOCKREQ_OFF,
4131                                             lock, NULL, lockh);
4132                 /* XXX swab here to assert that an mds_open reint
4133                  * packet is following */
4134                 fixup_handle_for_resent_req(req, MDS_REQ_INTENT_LOCKREQ_OFF, 
4135                                             lock, NULL, lockh);
4136                 rep->lock_policy_res2 = mds_reint(req, offset, lockh);
4137
4138                 if (rep->lock_policy_res2) {
4139                         /* 
4140                          * mds_open() returns ENOLCK where it should return
4141                          * zero, but it has no lock to return.
4142                          */
4143                         if (rep->lock_policy_res2 == ENOLCK)
4144                                 rep->lock_policy_res2 = 0;
4145
4146                         RETURN(ELDLM_LOCK_ABORTED);
4147                 }
4148                 
4149                 /*
4150                  * IT_OPEN may return lock on cross-node dentry that we want to
4151                  * hold during attr retrival -bzzz
4152                  */
4153                 if (lockh[0].cookie == 0)
4154                         RETURN(ELDLM_LOCK_ABORTED);
4155                 
4156                 break;
4157         case IT_LOOKUP:
4158                 getattr_part = MDS_INODELOCK_LOOKUP;
4159         case IT_CHDIR:
4160         case IT_GETATTR:
4161                 getattr_part |= MDS_INODELOCK_LOOKUP;
4162         case IT_READDIR:
4163                 fixup_handle_for_resent_req(req, MDS_REQ_INTENT_LOCKREQ_OFF, 
4164                                             lock, &new_lock, lockh);
4165                 rep->lock_policy_res2 = mds_getattr_lock(req, offset, lockh,
4166                                                          getattr_part);
4167                 /* FIXME: LDLM can set req->rq_status. MDS sets
4168                    policy_res{1,2} with disposition and status.
4169                    - replay: returns 0 & req->status is old status
4170                    - otherwise: returns req->status */
4171                 if (intent_disposition(rep, DISP_LOOKUP_NEG))
4172                         rep->lock_policy_res2 = 0;
4173                 if (!intent_disposition(rep, DISP_LOOKUP_POS) ||
4174                     rep->lock_policy_res2)
4175                         RETURN(ELDLM_LOCK_ABORTED);
4176                 if (req->rq_status != 0) {
4177                         LBUG();
4178                         rep->lock_policy_res2 = req->rq_status;
4179                         RETURN(ELDLM_LOCK_ABORTED);
4180                 }
4181                 break;
4182         case IT_UNLINK:
4183                 rc = mds_lock_and_check_slave(offset, req, lockh);
4184                 if ((rep->lock_policy_res2 = rc)) {
4185                         if (rc == ENOLCK)
4186                                 rep->lock_policy_res2 = 0;
4187                         RETURN(ELDLM_LOCK_ABORTED);
4188                 }
4189                 break;
4190         default:
4191                 CERROR("Unhandled intent "LPD64"\n", it->opc);
4192                 LBUG();
4193         }
4194
4195         /* By this point, whatever function we called above must have either
4196          * filled in 'lockh', been an intent replay, or returned an error.  We
4197          * want to allow replayed RPCs to not get a lock, since we would just
4198          * drop it below anyways because lock replay is done separately by the
4199          * client afterwards.  For regular RPCs we want to give the new lock to
4200          * the client instead of whatever lock it was about to get. */
4201         if (new_lock == NULL)
4202                 new_lock = ldlm_handle2lock(&lockh[0]);
4203         if (new_lock == NULL && (flags & LDLM_FL_INTENT_ONLY))
4204                 RETURN(0);
4205
4206         LASSERTF(new_lock != NULL, "op "LPX64" lockh "LPX64"\n",
4207                  it->opc, lockh[0].cookie);
4208
4209         /* If we've already given this lock to a client once, then we should
4210          * have no readers or writers.  Otherwise, we should have one reader
4211          * _or_ writer ref (which will be zeroed below) before returning the
4212          * lock to a client. */
4213         if (new_lock->l_export == req->rq_export) {
4214                 LASSERT(new_lock->l_readers + new_lock->l_writers == 0);
4215         } else {
4216                 LASSERT(new_lock->l_export == NULL);
4217                 LASSERT(new_lock->l_readers + new_lock->l_writers == 1);
4218         }
4219
4220         *lockp = new_lock;
4221
4222         if (new_lock->l_export == req->rq_export) {
4223                 /* Already gave this to the client, which means that we
4224                  * reconstructed a reply. */
4225                 LASSERT(lustre_msg_get_flags(req->rq_reqmsg) &
4226                         MSG_RESENT);
4227                 RETURN(ELDLM_LOCK_REPLACED);
4228         }
4229
4230         /* Fixup the lock to be given to the client */
4231         lock_res_and_lock(new_lock);
4232         new_lock->l_readers = 0;
4233         new_lock->l_writers = 0;
4234
4235         new_lock->l_export = class_export_get(req->rq_export);
4236
4237         spin_lock(&new_lock->l_export->exp_ldlm_data.led_lock);
4238         list_add(&new_lock->l_export_chain,
4239                  &new_lock->l_export->exp_ldlm_data.led_held_locks);
4240         spin_unlock(&new_lock->l_export->exp_ldlm_data.led_lock);
4241
4242         new_lock->l_blocking_ast = lock->l_blocking_ast;
4243         new_lock->l_completion_ast = lock->l_completion_ast;
4244
4245         memcpy(&new_lock->l_remote_handle, &lock->l_remote_handle,
4246                sizeof(lock->l_remote_handle));
4247
4248         new_lock->l_flags &= ~LDLM_FL_LOCAL;
4249
4250         unlock_res_and_lock(new_lock);
4251         LDLM_LOCK_PUT(new_lock);
4252
4253         RETURN(ELDLM_LOCK_REPLACED);
4254 }
4255
4256 int mds_attach(struct obd_device *dev, obd_count len, void *data)
4257 {
4258         struct lprocfs_static_vars lvars;
4259         int rc = 0;
4260         struct mds_obd *mds = &dev->u.mds;
4261
4262         spin_lock_init(&mds->mds_denylist_lock);
4263         INIT_LIST_HEAD(&mds->mds_denylist);
4264
4265         lprocfs_init_multi_vars(0, &lvars);
4266
4267         rc = lprocfs_obd_attach(dev, lvars.obd_vars);
4268         if (rc)
4269                 return rc;
4270
4271         return lprocfs_alloc_md_stats(dev, 0);
4272 }
4273
/*
 * Detach hook for the MDS device type: frees the metadata stats and then
 * detaches the lprocfs entries, returning the latter's result.
 */
int mds_detach(struct obd_device *dev)
{
        int rc;

        lprocfs_free_md_stats(dev);
        rc = lprocfs_obd_detach(dev);

        return rc;
}
4279
4280 int mdt_attach(struct obd_device *dev, obd_count len, void *data)
4281 {
4282         struct lprocfs_static_vars lvars;
4283
4284         lprocfs_init_multi_vars(1, &lvars);
4285         return lprocfs_obd_attach(dev, lvars.obd_vars);
4286 }
4287
/*
 * Detach hook for the MDT device type; returns the result of
 * lprocfs_obd_detach().
 */
int mdt_detach(struct obd_device *dev)
{
        int rc = lprocfs_obd_detach(dev);

        return rc;
}
4292
4293 static int mdt_setup(struct obd_device *obd, obd_count len, void *buf)
4294 {
4295         struct mds_obd *mds = &obd->u.mds;
4296         int rc = 0;
4297         ENTRY;
4298
4299         mds->mds_service =
4300                 ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
4301                                 MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL,
4302                                 MDS_SERVICE_WATCHDOG_TIMEOUT,
4303                                 mds_handle, "mds", obd->obd_proc_entry);
4304
4305         if (!mds->mds_service) {
4306                 CERROR("failed to start service\n");
4307                 RETURN(-ENOMEM);
4308         }
4309
4310         rc = ptlrpc_start_n_threads(obd, mds->mds_service, MDT_NUM_THREADS,
4311                                     "ll_mdt");
4312         if (rc)
4313                 GOTO(err_thread, rc);
4314
4315         mds->mds_setattr_service =
4316                 ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
4317                                 MDS_SETATTR_PORTAL, MDC_REPLY_PORTAL,
4318                                 MDS_SERVICE_WATCHDOG_TIMEOUT,
4319                                 mds_handle, "mds_setattr",
4320                                 obd->obd_proc_entry);
4321         if (!mds->mds_setattr_service) {
4322                 CERROR("failed to start getattr service\n");
4323                 GOTO(err_thread, rc = -ENOMEM);
4324         }
4325
4326         rc = ptlrpc_start_n_threads(obd, mds->mds_setattr_service,
4327                                     MDT_NUM_THREADS, "ll_mdt_attr");
4328         if (rc)
4329                 GOTO(err_thread2, rc);
4330
4331         mds->mds_readpage_service =
4332                 ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
4333                                 MDS_READPAGE_PORTAL, MDC_REPLY_PORTAL,
4334                                 MDS_SERVICE_WATCHDOG_TIMEOUT,
4335                                 mds_handle, "mds_readpage",
4336                                 obd->obd_proc_entry);
4337         if (!mds->mds_readpage_service) {
4338                 CERROR("failed to start readpage service\n");
4339                 GOTO(err_thread2, rc = -ENOMEM);
4340         }
4341
4342         rc = ptlrpc_start_n_threads(obd, mds->mds_readpage_service,
4343                                     MDT_NUM_THREADS, "ll_mdt_rdpg");
4344
4345         if (rc)
4346                 GOTO(err_thread3, rc);
4347
4348         RETURN(0);
4349
4350 err_thread3:
4351         ptlrpc_unregister_service(mds->mds_readpage_service);
4352 err_thread2:
4353         ptlrpc_unregister_service(mds->mds_setattr_service);
4354 err_thread:
4355         ptlrpc_unregister_service(mds->mds_service);
4356         return rc;
4357 }
4358
4359 static int mdt_cleanup(struct obd_device *obd, int flags)
4360 {
4361         struct mds_obd *mds = &obd->u.mds;
4362         ENTRY;
4363
4364         ptlrpc_stop_all_threads(mds->mds_readpage_service);
4365         ptlrpc_unregister_service(mds->mds_readpage_service);
4366
4367         ptlrpc_stop_all_threads(mds->mds_setattr_service);
4368         ptlrpc_unregister_service(mds->mds_setattr_service);
4369
4370         ptlrpc_stop_all_threads(mds->mds_service);
4371         ptlrpc_unregister_service(mds->mds_service);
4372
4373         RETURN(0);
4374 }
4375
4376 static struct dentry *mds_lvfs_id2dentry(__u64 ino, __u32 gen,
4377                                          __u64 gr, void *data)
4378 {
4379         struct lustre_id id;
4380         struct obd_device *obd = data;
4381         
4382         id_ino(&id) = ino;
4383         id_gen(&id) = gen;
4384         return mds_id2dentry(obd, &id, NULL);
4385 }
4386
4387 static int mds_get_info(struct obd_export *exp, __u32 keylen,
4388                         void *key, __u32 *valsize, void *val)
4389 {
4390         struct obd_device *obd;
4391         struct mds_obd *mds;
4392         ENTRY;
4393
4394         obd = class_exp2obd(exp);
4395         mds = &obd->u.mds;
4396         
4397         if (obd == NULL) {
4398                 CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n",
4399                        exp->exp_handle.h_cookie);
4400                 RETURN(-EINVAL);
4401         }
4402
4403         if (keylen >= strlen("reint_log") && memcmp(key, "reint_log", 9) == 0) {
4404                 /* get log_context handle. */
4405                 unsigned long *llh_handle = val;
4406                 *valsize = sizeof(unsigned long);
4407                 *llh_handle = (unsigned long)obd->obd_llog_ctxt[LLOG_REINT_ORIG_CTXT];
4408                 RETURN(0);
4409         }
4410         if (keylen >= strlen("cache_sb") && memcmp(key, "cache_sb", 8) == 0) {
4411                 /* get log_context handle. */
4412                 unsigned long *sb = val;
4413                 *valsize = sizeof(unsigned long);
4414                 *sb = (unsigned long)obd->u.mds.mds_sb;
4415                 RETURN(0);
4416         }
4417
4418         if (keylen >= strlen("mdsize") && memcmp(key, "mdsize", keylen) == 0) {
4419                 __u32 *mdsize = val;
4420                 *valsize = sizeof(*mdsize);
4421                 *mdsize = mds->mds_max_mdsize;
4422                 RETURN(0);
4423         }
4424
4425         if (keylen >= strlen("mdsnum") && strcmp(key, "mdsnum") == 0) {
4426                 __u32 *mdsnum = val;
4427                 *valsize = sizeof(*mdsnum);
4428                 *mdsnum = mds->mds_num;
4429                 RETURN(0);
4430         }
4431
4432         if (keylen >= strlen("rootid") && strcmp(key, "rootid") == 0) {
4433                 struct lustre_id *rootid = val;
4434                 *valsize = sizeof(*rootid);
4435                 *rootid = mds->mds_rootid;
4436                 RETURN(0);
4437         }
4438
4439         if (keylen >= strlen("lovdesc") && strcmp(key, "lovdesc") == 0) {
4440                 struct lov_desc *desc = val;
4441                 *valsize = sizeof(*desc);
4442                 *desc = mds->mds_dt_desc;
4443                 RETURN(0);
4444         }
4445
4446         CDEBUG(D_IOCTL, "invalid key\n");
4447         RETURN(-EINVAL);
4448
4449 }
4450 struct lvfs_callback_ops mds_lvfs_ops = {
4451         l_id2dentry:     mds_lvfs_id2dentry,
4452 };
4453
/*
 * Forward declarations for the bulk I/O hooks installed in mds_obd_ops
 * below; their definitions are not in this part of the file.
 */
int mds_preprw(int cmd, struct obd_export *exp, struct obdo *oa,
               int objcount, struct obd_ioobj *obj,
               int niocount, struct niobuf_remote *nb,
               struct niobuf_local *res, struct obd_trans_info *oti);

int mds_commitrw(int cmd, struct obd_export *exp, struct obdo *oa,
                 int objcount, struct obd_ioobj *obj,
                 int niocount, struct niobuf_local *res,
                 struct obd_trans_info *oti, int rc);
4464
4465 /* use obd ops to offer management infrastructure */
4466 static struct obd_ops mds_obd_ops = {
4467         .o_owner           = THIS_MODULE,
4468         .o_attach          = mds_attach,
4469         .o_detach          = mds_detach,
4470         .o_connect         = mds_connect,
4471         .o_connect_post    = mds_connect_post,
4472         .o_init_export     = mds_init_export,
4473         .o_destroy_export  = mds_destroy_export,
4474         .o_disconnect      = mds_disconnect,
4475         .o_setup           = mds_setup,
4476         .o_precleanup      = mds_precleanup,
4477         .o_cleanup         = mds_cleanup,
4478         .o_process_config  = mds_process_config,
4479         .o_postrecov       = mds_postrecov,
4480         .o_statfs          = mds_obd_statfs,
4481         .o_iocontrol       = mds_iocontrol,
4482         .o_create          = mds_obd_create,
4483         .o_destroy         = mds_obd_destroy,
4484         .o_llog_init       = mds_llog_init,
4485         .o_llog_finish     = mds_llog_finish,
4486         .o_notify          = mds_notify,
4487         .o_get_info        = mds_get_info,
4488         .o_set_info        = mds_set_info,
4489         .o_preprw          = mds_preprw, 
4490         .o_commitrw        = mds_commitrw,
4491 };
4492
4493 static struct obd_ops mdt_obd_ops = {
4494         .o_owner           = THIS_MODULE,
4495         .o_attach          = mdt_attach,
4496         .o_detach          = mdt_detach,
4497         .o_setup           = mdt_setup,
4498         .o_cleanup         = mdt_cleanup,
4499 };
4500
4501 static int __init mds_init(void)
4502 {
4503         struct lprocfs_static_vars lvars;
4504
4505         mds_init_lsd_cache();
4506         mds_init_rmtacl_upcall_cache();
4507
4508         lprocfs_init_multi_vars(0, &lvars);
4509         class_register_type(&mds_obd_ops, NULL, lvars.module_vars,
4510                             OBD_MDS_DEVICENAME);
4511         lprocfs_init_multi_vars(1, &lvars);
4512         class_register_type(&mdt_obd_ops, NULL, lvars.module_vars,
4513                             OBD_MDT_DEVICENAME);
4514
4515         return 0;
4516 }
4517
4518 static void /*__exit*/ mds_exit(void)
4519 {
4520         mds_cleanup_rmtacl_upcall_cache();
4521         mds_cleanup_lsd_cache();
4522
4523         class_unregister_type(OBD_MDS_DEVICENAME);
4524         class_unregister_type(OBD_MDT_DEVICENAME);
4525 }
4526
4527 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
4528 MODULE_DESCRIPTION("Lustre Metadata Server (MDS)");
4529 MODULE_LICENSE("GPL");
4530
4531 module_init(mds_init);
4532 module_exit(mds_exit);