Whamcloud - gitweb
LU-3494 libcfs: Add relocation function to libcfs heap
[fs/lustre-release.git] / lustre / mdt / mdt_open.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2013, Intel Corporation.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/mdt/mdt_open.c
37  *
38  * Lustre Metadata Target (mdt) open/close file handling
39  *
40  * Author: Huang Hua <huanghua@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_MDS
44
45 #include <lustre_acl.h>
46 #include <lustre_mds.h>
47 #include "mdt_internal.h"
48
/*
 * "addref" callback for mdt_file_data handles.
 * Deliberately empty: an mfd has no reference count at present.
 */
static void mdt_mfd_get(void *mfdp)
{
}
53
54 static struct portals_handle_ops mfd_handle_ops = {
55         .hop_addref = mdt_mfd_get,
56         .hop_free   = NULL,
57 };
58
59 /* Create a new mdt_file_data struct, initialize it,
60  * and insert it to global hash table */
61 struct mdt_file_data *mdt_mfd_new(const struct mdt_export_data *med)
62 {
63         struct mdt_file_data *mfd;
64         ENTRY;
65
66         OBD_ALLOC_PTR(mfd);
67         if (mfd != NULL) {
68                 CFS_INIT_LIST_HEAD(&mfd->mfd_handle.h_link);
69                 mfd->mfd_handle.h_owner = med;
70                 CFS_INIT_LIST_HEAD(&mfd->mfd_list);
71                 class_handle_hash(&mfd->mfd_handle, &mfd_handle_ops);
72         }
73
74         RETURN(mfd);
75 }
76
77 /*
78  * Find the mfd pointed to by handle in global hash table.
79  * In case of replay the handle is obsoleted
80  * but mfd can be found in mfd list by that handle
81  */
82 struct mdt_file_data *mdt_handle2mfd(struct mdt_export_data *med,
83                                      const struct lustre_handle *handle,
84                                      bool is_replay_or_resent)
85 {
86         struct mdt_file_data   *mfd;
87         ENTRY;
88
89         LASSERT(handle != NULL);
90         mfd = class_handle2object(handle->cookie, med);
91         /* during dw/setattr replay the mfd can be found by old handle */
92         if (mfd == NULL && is_replay_or_resent) {
93                 cfs_list_for_each_entry(mfd, &med->med_open_head, mfd_list) {
94                         if (mfd->mfd_old_handle.cookie == handle->cookie)
95                                 RETURN(mfd);
96                 }
97                 mfd = NULL;
98         }
99
100         RETURN(mfd);
101 }
102
103 /* free mfd */
104 void mdt_mfd_free(struct mdt_file_data *mfd)
105 {
106         LASSERT(cfs_list_empty(&mfd->mfd_list));
107         OBD_FREE_RCU(mfd, sizeof *mfd, &mfd->mfd_handle);
108 }
109
110 static int mdt_create_data(struct mdt_thread_info *info,
111                            struct mdt_object *p, struct mdt_object *o)
112 {
113         struct md_op_spec     *spec = &info->mti_spec;
114         struct md_attr        *ma   = &info->mti_attr;
115         int                    rc   = 0;
116         ENTRY;
117
118         if (!md_should_create(spec->sp_cr_flags))
119                 RETURN(0);
120
121         ma->ma_need = MA_INODE | MA_LOV;
122         ma->ma_valid = 0;
123         mutex_lock(&o->mot_lov_mutex);
124         if (!(o->mot_flags & MOF_LOV_CREATED)) {
125                 if (p != NULL && (fid_is_obf(mdt_object_fid(p)) ||
126                                   fid_is_dot_lustre(mdt_object_fid(p))))
127                         GOTO(unlock, rc = -EPERM);
128
129                 rc = mdo_create_data(info->mti_env,
130                                      p ? mdt_object_child(p) : NULL,
131                                      mdt_object_child(o), spec, ma);
132                 if (rc == 0)
133                         rc = mdt_attr_get_complex(info, o, ma);
134
135                 if (rc == 0 && ma->ma_valid & MA_LOV)
136                         o->mot_flags |= MOF_LOV_CREATED;
137         }
138 unlock:
139         mutex_unlock(&o->mot_lov_mutex);
140         RETURN(rc);
141 }
142
143 static int mdt_ioepoch_opened(struct mdt_object *mo)
144 {
145         return mo->mot_ioepoch_count;
146 }
147
148 int mdt_object_is_som_enabled(struct mdt_object *mo)
149 {
150         return !mo->mot_ioepoch;
151 }
152
153 /**
154  * Re-enable Size-on-MDS.
155  * Call under ->mot_ioepoch_mutex.
156  */
157 static void mdt_object_som_enable(struct mdt_object *mo, __u64 ioepoch)
158 {
159         if (ioepoch == mo->mot_ioepoch) {
160                 LASSERT(!mdt_ioepoch_opened(mo));
161                 mo->mot_ioepoch = 0;
162                 mo->mot_flags = 0;
163         }
164 }
165
166 /**
167  * Open the IOEpoch. It is allowed if @writecount is not negative.
168  * The epoch and writecount handling is performed under the mot_ioepoch_mutex.
169  */
170 int mdt_ioepoch_open(struct mdt_thread_info *info, struct mdt_object *o,
171                      int created)
172 {
173         struct mdt_device *mdt = info->mti_mdt;
174         int cancel = 0;
175         int rc = 0;
176         ENTRY;
177
178         if (!(mdt_conn_flags(info) & OBD_CONNECT_SOM) ||
179             !S_ISREG(lu_object_attr(&o->mot_obj)))
180                 RETURN(0);
181
182         mutex_lock(&o->mot_ioepoch_mutex);
183         if (mdt_ioepoch_opened(o)) {
184                 /* Epoch continues even if there is no writers yet. */
185                 CDEBUG(D_INODE, "continue epoch "LPU64" for "DFID"\n",
186                        o->mot_ioepoch, PFID(mdt_object_fid(o)));
187         } else {
188                 /* XXX: ->mdt_ioepoch is not initialized at the mount */
189                 spin_lock(&mdt->mdt_ioepoch_lock);
190                 if (mdt->mdt_ioepoch < info->mti_replayepoch)
191                         mdt->mdt_ioepoch = info->mti_replayepoch;
192
193                 if (info->mti_replayepoch)
194                         o->mot_ioepoch = info->mti_replayepoch;
195                 else if (++mdt->mdt_ioepoch == IOEPOCH_INVAL)
196                         o->mot_ioepoch = ++mdt->mdt_ioepoch;
197                 else
198                         o->mot_ioepoch = mdt->mdt_ioepoch;
199
200                 spin_unlock(&mdt->mdt_ioepoch_lock);
201
202                 CDEBUG(D_INODE, "starting epoch "LPU64" for "DFID"\n",
203                        o->mot_ioepoch, PFID(mdt_object_fid(o)));
204                 if (created)
205                         o->mot_flags |= MOF_SOM_CREATED;
206                 cancel = 1;
207         }
208         o->mot_ioepoch_count++;
209         mutex_unlock(&o->mot_ioepoch_mutex);
210
211         /* Cancel Size-on-MDS attributes cached on clients for the open case.
212          * In the truncate case, see mdt_reint_setattr(). */
213         if (cancel && (info->mti_rr.rr_fid1 != NULL)) {
214                 struct mdt_lock_handle  *lh = &info->mti_lh[MDT_LH_CHILD];
215                 mdt_lock_reg_init(lh, LCK_EX);
216                 rc = mdt_object_lock(info, o, lh, MDS_INODELOCK_UPDATE,
217                                      MDT_LOCAL_LOCK);
218                 if (rc == 0)
219                         mdt_object_unlock(info, o, lh, 1);
220         }
221         RETURN(rc);
222 }
223
224 /**
225  * Update SOM on-disk attributes.
226  * If enabling, write update inodes and lustre-ea with the proper IOEpoch,
227  * mountid and attributes. If disabling, clean SOM xattr.
228  * Call under ->mot_ioepoch_mutex.
229  */
230 static int mdt_som_attr_set(struct mdt_thread_info *info,
231                             struct mdt_object *obj, __u64 ioepoch, bool enable)
232 {
233         struct md_object        *next = mdt_object_child(obj);
234         int                      rc;
235         ENTRY;
236
237         CDEBUG(D_INODE, "Size-on-MDS attribute %s for epoch "LPU64
238                " on "DFID".\n", enable ? "update" : "disabling",
239                ioepoch, PFID(mdt_object_fid(obj)));
240
241         if (enable) {
242                 struct lu_buf           *buf = &info->mti_buf;
243                 struct som_attrs        *attrs;
244                 struct md_attr          *ma = &info->mti_attr;
245                 struct lu_attr          *la = &ma->ma_attr;
246                 struct obd_device       *obd = info->mti_mdt->mdt_lut.lut_obd;
247
248                 attrs = (struct som_attrs *)info->mti_xattr_buf;
249                 CLASSERT(sizeof(info->mti_xattr_buf) >= sizeof(*attrs));
250
251                 /* pack SOM attributes */
252                 memset(attrs, 0, sizeof(*attrs));
253                 attrs->som_ioepoch = ioepoch;
254                 attrs->som_mountid = obd->u.obt.obt_mount_count;
255                 if ((la->la_valid & LA_SIZE) != 0)
256                         attrs->som_size = la->la_size;
257                 if ((la->la_valid & LA_BLOCKS) != 0)
258                         attrs->som_blocks = la->la_blocks;
259                 lustre_som_swab(attrs);
260
261                 /* update SOM attributes */
262                 buf->lb_buf = attrs;
263                 buf->lb_len = sizeof(*attrs);
264                 rc = mo_xattr_set(info->mti_env, next, buf, XATTR_NAME_SOM, 0);
265         } else {
266                 /* delete SOM attributes */
267                 rc = mo_xattr_del(info->mti_env, next, XATTR_NAME_SOM);
268         }
269
270         RETURN(rc);
271 }
272
273 /** Perform the eviction specific actions on ioepoch close. */
274 static inline int mdt_ioepoch_close_on_eviction(struct mdt_thread_info *info,
275                                                 struct mdt_object *o)
276 {
277         int rc = 0;
278
279         mutex_lock(&o->mot_ioepoch_mutex);
280         CDEBUG(D_INODE, "Eviction. Closing IOepoch "LPU64" on "DFID". "
281                "Count %d\n", o->mot_ioepoch, PFID(mdt_object_fid(o)),
282                o->mot_ioepoch_count);
283         o->mot_ioepoch_count--;
284
285         /* If eviction occured set MOF_SOM_RECOV,
286          * if no other epoch holders, disable SOM on disk. */
287         o->mot_flags |= MOF_SOM_CHANGE | MOF_SOM_RECOV;
288         if (!mdt_ioepoch_opened(o)) {
289                 rc = mdt_som_attr_set(info, o, o->mot_ioepoch, MDT_SOM_DISABLE);
290                 mdt_object_som_enable(o, o->mot_ioepoch);
291         }
292         mutex_unlock(&o->mot_ioepoch_mutex);
293         RETURN(rc);
294 }
295
296 /**
297  * Perform the replay specific actions on ioepoch close.
298  * Skip SOM attribute update if obtained and just forget about the inode state
299  * for the last ioepoch holder. The SOM cache is invalidated on MDS failure.
300  */
301 static inline int mdt_ioepoch_close_on_replay(struct mdt_thread_info *info,
302                                               struct mdt_object *o)
303 {
304         int rc = MDT_IOEPOCH_CLOSED;
305         ENTRY;
306
307         mutex_lock(&o->mot_ioepoch_mutex);
308         CDEBUG(D_INODE, "Replay. Closing epoch "LPU64" on "DFID". Count %d\n",
309                o->mot_ioepoch, PFID(mdt_object_fid(o)), o->mot_ioepoch_count);
310         o->mot_ioepoch_count--;
311
312         /* Get an info from the replayed request if client is supposed
313          * to send an Attibute Update, reconstruct @rc if so */
314         if (info->mti_ioepoch->flags & MF_SOM_AU)
315                 rc = MDT_IOEPOCH_GETATTR;
316
317         if (!mdt_ioepoch_opened(o))
318                 mdt_object_som_enable(o, info->mti_ioepoch->ioepoch);
319         mutex_unlock(&o->mot_ioepoch_mutex);
320
321         RETURN(rc);
322 }
323
324 /**
325  * Regular file IOepoch close.
326  * Closes the ioepoch, checks the object state, apply obtained attributes and
327  * re-enable SOM on the object, if possible. Also checks if the recovery is
328  * needed and packs OBD_MD_FLGETATTRLOCK flag into the reply to force the client
329  * to obtain SOM attributes under the server-side OST locks.
330  *
331  * Return value:
332  * MDT_IOEPOCH_CLOSED if ioepoch is closed.
333  * MDT_IOEPOCH_GETATTR if ioepoch is closed but another SOM update is needed.
334  */
335 static inline int mdt_ioepoch_close_reg(struct mdt_thread_info *info,
336                                         struct mdt_object *o)
337 {
338         struct md_attr *tmp_ma;
339         struct lu_attr *la;
340         int achange, opened;
341         int recovery = 0;
342         int rc = 0, ret = MDT_IOEPOCH_CLOSED;
343         ENTRY;
344
345         la = &info->mti_attr.ma_attr;
346         achange = (info->mti_ioepoch->flags & MF_SOM_CHANGE);
347
348         mutex_lock(&o->mot_ioepoch_mutex);
349         o->mot_ioepoch_count--;
350
351         tmp_ma = &info->mti_u.som.attr;
352         tmp_ma->ma_lmm = info->mti_attr.ma_lmm;
353         tmp_ma->ma_lmm_size = info->mti_attr.ma_lmm_size;
354         tmp_ma->ma_som = &info->mti_u.som.data;
355         tmp_ma->ma_need = MA_INODE | MA_LOV | MA_SOM;
356         tmp_ma->ma_valid = 0;
357         rc = mdt_attr_get_complex(info, o, tmp_ma);
358         if (rc)
359                 GOTO(error_up, rc);
360
361         /* Check the on-disk SOM state. */
362         if (o->mot_flags & MOF_SOM_RECOV)
363                 recovery = 1;
364         else if (!(o->mot_flags & MOF_SOM_CREATED) &&
365                  !(tmp_ma->ma_valid & MA_SOM))
366                 recovery = 1;
367
368         CDEBUG(D_INODE, "Closing epoch "LPU64" on "DFID". Count %d\n",
369                o->mot_ioepoch, PFID(mdt_object_fid(o)), o->mot_ioepoch_count);
370
371         opened = mdt_ioepoch_opened(o);
372         /**
373          * If IOEpoch is not opened, check if a Size-on-MDS update is needed.
374          * Skip the check for file with no LOV  or for unlink files.
375          */
376         if (!opened && tmp_ma->ma_valid & MA_LOV &&
377             !(tmp_ma->ma_valid & MA_INODE && tmp_ma->ma_attr.la_nlink == 0)) {
378                 if (recovery)
379                         /* If some previous writer was evicted, re-ask the
380                          * client for attributes. Even if attributes are
381                          * provided, we cannot believe in them.
382                          * Another use case is that there is no SOM cache on
383                          * disk -- first access with SOM or there was an MDS
384                          * failure. */
385                         ret = MDT_IOEPOCH_GETATTR;
386                 else if (o->mot_flags & MOF_SOM_CHANGE)
387                         /* Some previous writer changed the attribute.
388                          * Do not believe to the current Size-on-MDS
389                          * update, re-ask client. */
390                         ret = MDT_IOEPOCH_GETATTR;
391                 else if (!(la->la_valid & LA_SIZE) && achange)
392                         /* Attributes were changed by the last writer
393                          * only but no Size-on-MDS update is received.*/
394                         ret = MDT_IOEPOCH_GETATTR;
395         }
396
397         if (achange || ret == MDT_IOEPOCH_GETATTR)
398                 o->mot_flags |= MOF_SOM_CHANGE;
399
400         /* If epoch ends and relable SOM attributes are obtained, update them.
401          * Create SOM ea for new files even if there is no attributes obtained
402          * (0-length file). */
403         if (ret == MDT_IOEPOCH_CLOSED && !opened) {
404                 if (achange || o->mot_flags & MOF_SOM_CREATED) {
405                         LASSERT(achange || !(la->la_valid & LA_SIZE));
406                         rc = mdt_som_attr_set(info, o, o->mot_ioepoch,
407                                               MDT_SOM_ENABLE);
408                         /* Avoid the following setattrs of these attributes,
409                          * e.g. for atime update. */
410                         info->mti_attr.ma_valid = 0;
411                 }
412                 mdt_object_som_enable(o, o->mot_ioepoch);
413         }
414
415         mutex_unlock(&o->mot_ioepoch_mutex);
416         /* If recovery is needed, tell the client to perform GETATTR under
417          * the lock. */
418         if (ret == MDT_IOEPOCH_GETATTR && recovery) {
419                 struct mdt_body *rep;
420                 rep = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
421                 rep->valid |= OBD_MD_FLGETATTRLOCK;
422         }
423
424         RETURN(rc ? : ret);
425
426 error_up:
427         mutex_unlock(&o->mot_ioepoch_mutex);
428         return rc;
429 }
430
431 /**
432  * Close IOEpoch (opened file or MDS_FMODE_EPOCH state). It happens if:
433  * - a client closes the IOEpoch;
434  * - a client eviction occured.
435  * Return values:
436  * MDT_IOEPOCH_OPENED if the client does not close IOEpoch.
437  * MDT_IOEPOCH_CLOSED if the client closes IOEpoch.
438  * MDT_IOEPOCH_GETATTR if the client closes IOEpoch but another SOM attribute
439  * update is needed.
440  */
441 static int mdt_ioepoch_close(struct mdt_thread_info *info, struct mdt_object *o)
442 {
443         struct ptlrpc_request *req = mdt_info_req(info);
444         ENTRY;
445
446         if (!(mdt_conn_flags(info) & OBD_CONNECT_SOM) ||
447             !S_ISREG(lu_object_attr(&o->mot_obj)))
448                 RETURN(0);
449
450         LASSERT(o->mot_ioepoch_count);
451         LASSERT(info->mti_ioepoch == NULL ||
452                 info->mti_ioepoch->ioepoch == o->mot_ioepoch);
453
454         /* IOEpoch is closed only if client tells about it or eviction occures.
455          * In the replay case, always close the epoch. */
456         if (req == NULL)
457                 RETURN(mdt_ioepoch_close_on_eviction(info, o));
458         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
459                 RETURN(mdt_ioepoch_close_on_replay(info, o));
460         if (info->mti_ioepoch->flags & MF_EPOCH_CLOSE)
461                 RETURN(mdt_ioepoch_close_reg(info, o));
462         /* IO epoch is not closed. */
463         RETURN(MDT_IOEPOCH_OPENED);
464 }
465
466 /**
467  * Close MDS_FMODE_SOM state, when IOEpoch is already closed and we are waiting
468  * for attribute update. It happens if:
469  * - SOM Attribute Update is obtained;
470  * - the client failed to obtain it and informs MDS about it;
471  * - a client eviction occured.
472  * Apply obtained attributes for the 1st case, wipe out the on-disk SOM
473  * cache otherwise.
474  */
475 int mdt_som_au_close(struct mdt_thread_info *info, struct mdt_object *o)
476 {
477         struct ptlrpc_request   *req = mdt_info_req(info);
478         __u64                    ioepoch = 0;
479         int                      act = MDT_SOM_ENABLE;
480         int                      rc = 0;
481         ENTRY;
482
483         LASSERT(!req || info->mti_ioepoch);
484         if (!(mdt_conn_flags(info) & OBD_CONNECT_SOM) ||
485             !S_ISREG(lu_object_attr(&o->mot_obj)))
486                 RETURN(0);
487
488         /* No size whereas MF_SOM_CHANGE is set means client failed to
489          * obtain ost attributes, drop the SOM cache on disk if so. */
490         if (!req ||
491             (info->mti_ioepoch &&
492              info->mti_ioepoch->flags & MF_SOM_CHANGE &&
493              !(info->mti_attr.ma_attr.la_valid & LA_SIZE)))
494                 act = MDT_SOM_DISABLE;
495
496         mutex_lock(&o->mot_ioepoch_mutex);
497         /* Mark the object it is the recovery state if we failed to obtain
498          * SOM attributes. */
499         if (act == MDT_SOM_DISABLE)
500                 o->mot_flags |= MOF_SOM_RECOV;
501
502         if (!mdt_ioepoch_opened(o)) {
503                 ioepoch =  info->mti_ioepoch ?
504                         info->mti_ioepoch->ioepoch : o->mot_ioepoch;
505
506                 if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY))
507                         rc = mdt_som_attr_set(info, o, ioepoch, act);
508                 mdt_object_som_enable(o, ioepoch);
509         }
510         mutex_unlock(&o->mot_ioepoch_mutex);
511         RETURN(rc);
512 }
513
514 int mdt_write_read(struct mdt_object *o)
515 {
516         int rc = 0;
517         ENTRY;
518         mutex_lock(&o->mot_ioepoch_mutex);
519         rc = o->mot_writecount;
520         mutex_unlock(&o->mot_ioepoch_mutex);
521         RETURN(rc);
522 }
523
524 int mdt_write_get(struct mdt_object *o)
525 {
526         int rc = 0;
527         ENTRY;
528         mutex_lock(&o->mot_ioepoch_mutex);
529         if (o->mot_writecount < 0)
530                 rc = -ETXTBSY;
531         else
532                 o->mot_writecount++;
533         mutex_unlock(&o->mot_ioepoch_mutex);
534         RETURN(rc);
535 }
536
537 void mdt_write_put(struct mdt_object *o)
538 {
539         ENTRY;
540         mutex_lock(&o->mot_ioepoch_mutex);
541         o->mot_writecount--;
542         mutex_unlock(&o->mot_ioepoch_mutex);
543         EXIT;
544 }
545
546 static int mdt_write_deny(struct mdt_object *o)
547 {
548         int rc = 0;
549         ENTRY;
550         mutex_lock(&o->mot_ioepoch_mutex);
551         if (o->mot_writecount > 0)
552                 rc = -ETXTBSY;
553         else
554                 o->mot_writecount--;
555         mutex_unlock(&o->mot_ioepoch_mutex);
556         RETURN(rc);
557 }
558
559 static void mdt_write_allow(struct mdt_object *o)
560 {
561         ENTRY;
562         mutex_lock(&o->mot_ioepoch_mutex);
563         o->mot_writecount++;
564         mutex_unlock(&o->mot_ioepoch_mutex);
565         EXIT;
566 }
567
568 /* there can be no real transaction so prepare the fake one */
569 static void mdt_empty_transno(struct mdt_thread_info *info, int rc)
570 {
571         struct mdt_device      *mdt = info->mti_mdt;
572         struct ptlrpc_request  *req = mdt_info_req(info);
573         struct tg_export_data  *ted;
574         struct lsd_client_data *lcd;
575
576         ENTRY;
577         /* transaction has occurred already */
578         if (lustre_msg_get_transno(req->rq_repmsg) != 0)
579                 RETURN_EXIT;
580
581         spin_lock(&mdt->mdt_lut.lut_translock);
582         if (rc != 0) {
583                 if (info->mti_transno != 0) {
584                         struct obd_export *exp = req->rq_export;
585
586                         CERROR("%s: replay trans "LPU64" NID %s: rc = %d\n",
587                                mdt_obd_name(mdt), info->mti_transno,
588                                libcfs_nid2str(exp->exp_connection->c_peer.nid),
589                                rc);
590                         spin_unlock(&mdt->mdt_lut.lut_translock);
591                         RETURN_EXIT;
592                 }
593         } else if (info->mti_transno == 0) {
594                 info->mti_transno = ++mdt->mdt_lut.lut_last_transno;
595         } else {
596                 /* should be replay */
597                 if (info->mti_transno > mdt->mdt_lut.lut_last_transno)
598                         mdt->mdt_lut.lut_last_transno = info->mti_transno;
599         }
600         spin_unlock(&mdt->mdt_lut.lut_translock);
601
602         CDEBUG(D_INODE, "transno = "LPU64", last_committed = "LPU64"\n",
603                info->mti_transno,
604                req->rq_export->exp_obd->obd_last_committed);
605
606         req->rq_transno = info->mti_transno;
607         lustre_msg_set_transno(req->rq_repmsg, info->mti_transno);
608
609         /* update lcd in memory only for resent cases */
610         ted = &req->rq_export->exp_target_data;
611         LASSERT(ted);
612         mutex_lock(&ted->ted_lcd_lock);
613         lcd = ted->ted_lcd;
614         if (info->mti_transno < lcd->lcd_last_transno &&
615             info->mti_transno != 0) {
616                 /* This should happen during replay. Do not update
617                  * last rcvd info if replay req transno < last transno,
618                  * otherwise the following resend(after replay) can not
619                  * be checked correctly by xid */
620                 mutex_unlock(&ted->ted_lcd_lock);
621                 CDEBUG(D_HA, "%s: transno = "LPU64" < last_transno = "LPU64"\n",
622                        mdt_obd_name(mdt), info->mti_transno,
623                        lcd->lcd_last_transno);
624                 RETURN_EXIT;
625         }
626
627         if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE ||
628             lustre_msg_get_opc(req->rq_reqmsg) == MDS_DONE_WRITING) {
629                 if (info->mti_transno != 0)
630                         lcd->lcd_last_close_transno = info->mti_transno;
631                 lcd->lcd_last_close_xid = req->rq_xid;
632                 lcd->lcd_last_close_result = rc;
633         } else {
634                 /* VBR: save versions in last_rcvd for reconstruct. */
635                 __u64 *pre_versions = lustre_msg_get_versions(req->rq_repmsg);
636                 if (pre_versions) {
637                         lcd->lcd_pre_versions[0] = pre_versions[0];
638                         lcd->lcd_pre_versions[1] = pre_versions[1];
639                         lcd->lcd_pre_versions[2] = pre_versions[2];
640                         lcd->lcd_pre_versions[3] = pre_versions[3];
641                 }
642                 if (info->mti_transno != 0)
643                         lcd->lcd_last_transno = info->mti_transno;
644
645                 lcd->lcd_last_xid = req->rq_xid;
646                 lcd->lcd_last_result = rc;
647                 lcd->lcd_last_data = info->mti_opdata;
648         }
649         mutex_unlock(&ted->ted_lcd_lock);
650
651         EXIT;
652 }
653
654 void mdt_mfd_set_mode(struct mdt_file_data *mfd, __u64 mode)
655 {
656         LASSERT(mfd != NULL);
657
658         CDEBUG(D_HA, DFID " Change mfd mode "LPO64" -> "LPO64".\n",
659                PFID(mdt_object_fid(mfd->mfd_object)), mfd->mfd_mode, mode);
660
661         mfd->mfd_mode = mode;
662 }
663
664 static int mdt_mfd_open(struct mdt_thread_info *info, struct mdt_object *p,
665                         struct mdt_object *o, __u64 flags, int created)
666 {
667         struct ptlrpc_request   *req = mdt_info_req(info);
668         struct mdt_export_data  *med = &req->rq_export->exp_mdt_data;
669         struct mdt_file_data    *mfd;
670         struct md_attr          *ma  = &info->mti_attr;
671         struct lu_attr          *la  = &ma->ma_attr;
672         struct mdt_body         *repbody;
673         int                      rc = 0, isdir, isreg;
674         ENTRY;
675
676         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
677
678         isreg = S_ISREG(la->la_mode);
679         isdir = S_ISDIR(la->la_mode);
680         if (isreg && !(ma->ma_valid & MA_LOV) && !(flags & MDS_OPEN_RELEASE)) {
681                 /*
682                  * No EA, check whether it is will set regEA and dirEA since in
683                  * above attr get, these size might be zero, so reset it, to
684                  * retrieve the MD after create obj.
685                  */
686                 ma->ma_lmm_size = req_capsule_get_size(info->mti_pill,
687                                                        &RMF_MDT_MD,
688                                                        RCL_SERVER);
689                 /* in replay case, p == NULL */
690                 rc = mdt_create_data(info, p, o);
691                 if (rc)
692                         RETURN(rc);
693         }
694
695         CDEBUG(D_INODE, "after open, ma_valid bit = "LPX64" lmm_size = %d\n",
696                ma->ma_valid, ma->ma_lmm_size);
697
698         if (ma->ma_valid & MA_LOV) {
699                 LASSERT(ma->ma_lmm_size != 0);
700                 repbody->eadatasize = ma->ma_lmm_size;
701                 if (isdir)
702                         repbody->valid |= OBD_MD_FLDIREA;
703                 else
704                         repbody->valid |= OBD_MD_FLEASIZE;
705         }
706
707         if (flags & FMODE_WRITE) {
708                 rc = mdt_write_get(o);
709                 if (rc == 0) {
710                         mdt_ioepoch_open(info, o, created);
711                         repbody->ioepoch = o->mot_ioepoch;
712                 }
713         } else if (flags & MDS_FMODE_EXEC) {
714                 /* if file is released, we can't deny write because we must
715                  * restore (write) it to access it.*/
716                 if ((ma->ma_valid & MA_HSM) &&
717                     (ma->ma_hsm.mh_flags & HS_RELEASED))
718                         rc = 0;
719                 else
720                         rc = mdt_write_deny(o);
721         }
722         if (rc)
723                 RETURN(rc);
724
725         rc = mo_open(info->mti_env, mdt_object_child(o),
726                      created ? flags | MDS_OPEN_CREATED : flags);
727         if (rc)
728                 GOTO(err_out, rc);
729
730         mfd = mdt_mfd_new(med);
731         if (mfd == NULL)
732                 GOTO(err_out, rc = -ENOMEM);
733
734         /*
735          * Keep a reference on this object for this open, and is
736          * released by mdt_mfd_close().
737          */
738         mdt_object_get(info->mti_env, o);
739         mfd->mfd_object = o;
740         mfd->mfd_xid = req->rq_xid;
741
742         /*
743          * @flags is always not zero. At least it should be FMODE_READ,
744          * FMODE_WRITE or MDS_FMODE_EXEC.
745          */
746         LASSERT(flags != 0);
747
748         /* Open handling. */
749         mdt_mfd_set_mode(mfd, flags);
750
751         atomic_inc(&o->mot_open_count);
752         if (flags & MDS_OPEN_LEASE)
753                 atomic_inc(&o->mot_lease_count);
754
755         /* replay handle */
756         if (req_is_replay(req)) {
757                 struct mdt_file_data *old_mfd;
758                 /* Check wheather old cookie already exist in
759                  * the list, becasue when do recovery, client
760                  * might be disconnected from server, and
761                  * restart replay, so there maybe some orphan
762                  * mfd here, we should remove them */
763                 LASSERT(info->mti_rr.rr_handle != NULL);
764                 old_mfd = mdt_handle2mfd(med, info->mti_rr.rr_handle, true);
765                 if (old_mfd != NULL) {
766                         CDEBUG(D_HA, "delete orphan mfd = %p, fid = "DFID", "
767                                "cookie = "LPX64"\n", mfd,
768                                PFID(mdt_object_fid(mfd->mfd_object)),
769                                info->mti_rr.rr_handle->cookie);
770                         spin_lock(&med->med_open_lock);
771                         class_handle_unhash(&old_mfd->mfd_handle);
772                         cfs_list_del_init(&old_mfd->mfd_list);
773                         spin_unlock(&med->med_open_lock);
774                         /* no attr update for that close */
775                         la->la_valid = 0;
776                         ma->ma_valid |= MA_FLAGS;
777                         ma->ma_attr_flags |= MDS_RECOV_OPEN;
778                         mdt_mfd_close(info, old_mfd);
779                         ma->ma_attr_flags &= ~MDS_RECOV_OPEN;
780                         ma->ma_valid &= ~MA_FLAGS;
781                 }
782
783                 CDEBUG(D_HA, "Store old cookie "LPX64" in new mfd\n",
784                        info->mti_rr.rr_handle->cookie);
785
786                 mfd->mfd_old_handle.cookie = info->mti_rr.rr_handle->cookie;
787         }
788
789         repbody->handle.cookie = mfd->mfd_handle.h_cookie;
790
791         if (req->rq_export->exp_disconnected) {
792                 spin_lock(&med->med_open_lock);
793                 class_handle_unhash(&mfd->mfd_handle);
794                 cfs_list_del_init(&mfd->mfd_list);
795                 spin_unlock(&med->med_open_lock);
796                 mdt_mfd_close(info, mfd);
797         } else {
798                 spin_lock(&med->med_open_lock);
799                 cfs_list_add(&mfd->mfd_list, &med->med_open_head);
800                 spin_unlock(&med->med_open_lock);
801         }
802
803         mdt_empty_transno(info, rc);
804
805         RETURN(rc);
806
807 err_out:
808         if (flags & FMODE_WRITE)
809                         /* XXX We also need to close io epoch here.
810                          * See LU-1220 - green */
811                 mdt_write_put(o);
812         else if (flags & FMODE_EXEC)
813                 mdt_write_allow(o);
814         return rc;
815 }
816
/**
 * Complete an open of object \a o: fill in the reply body and, unless the
 * request ends early, obtain an open file handle for the client.
 *
 * The reply body (attributes, ACL / remote permission, capabilities) is
 * packed first, so that it is already populated when one of the later
 * checks returns an error.
 *
 * \param info     per-thread request state; the request, reply capsule and
 *                 attributes (info->mti_attr, which must have MA_INODE
 *                 valid) are taken from it
 * \param p        parent object, only forwarded to mdt_mfd_open()
 * \param o        object being opened
 * \param flags    open flags (MDS_OPEN_* / FMODE_* bits)
 * \param created  non-zero if \a o was created by this request; used for
 *                 the O_EXCL check and forwarded to mdt_mfd_open()
 * \param rep      ldlm reply in which disposition bits are recorded
 *
 * \retval 0 on success; this includes symlinks and (for clients with
 *         OBD_CONNECT_NODEVOH) special nodes, for which no handle is opened
 * \retval negative errno on failure (-ENOENT from fault injection,
 *         -EEXIST, -EISDIR, -ENOTDIR, -EAGAIN from fault injection, or an
 *         error from mo_capa_get() / mdt_mfd_open())
 */
817 int mdt_finish_open(struct mdt_thread_info *info,
818                     struct mdt_object *p, struct mdt_object *o,
819                     __u64 flags, int created, struct ldlm_reply *rep)
820 {
821         struct ptlrpc_request   *req = mdt_info_req(info);
822         struct obd_export       *exp = req->rq_export;
823         struct mdt_export_data  *med = &req->rq_export->exp_mdt_data;
824         struct md_attr          *ma  = &info->mti_attr;
825         struct lu_attr          *la  = &ma->ma_attr;
826         struct mdt_file_data    *mfd;
827         struct mdt_body         *repbody;
828         int                      rc = 0;
829         int                      isreg, isdir, islnk;
830         cfs_list_t              *t;
831         ENTRY;
832
833         LASSERT(ma->ma_valid & MA_INODE);
834
835         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
836
837         isreg = S_ISREG(la->la_mode);
838         isdir = S_ISDIR(la->la_mode);
839         islnk = S_ISLNK(la->la_mode);
840         mdt_pack_attr2body(info, repbody, la, mdt_object_fid(o));
841
842         /* LU-2275, simulate broken behaviour (esp. prevalent in
843          * pre-2.4 servers) where a very strange reply is sent on error
844          * that looks like it was actually almost successful and a failure
845          * at the same time */
846         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_NEGATIVE_POSITIVE)) {
847                 mdt_set_disposition(info, rep, DISP_OPEN_OPEN |
848                                                DISP_LOOKUP_NEG |
849                                                DISP_LOOKUP_POS);
850
851                 if (flags & MDS_OPEN_LOCK)
852                         mdt_set_disposition(info, rep, DISP_OPEN_LOCK);
853
854                 RETURN(-ENOENT);
855         }
856
    /* Pack either the remote permission or the POSIX access ACL into the
     * RMF_ACL reply buffer, depending on the export's connect flags. */
857         if (exp_connect_rmtclient(exp)) {
858                 void *buf = req_capsule_server_get(info->mti_pill, &RMF_ACL);
859
860                 rc = mdt_pack_remote_perm(info, o, buf);
861                 if (rc) {
862                         repbody->valid &= ~OBD_MD_FLRMTPERM;
863                         repbody->aclsize = 0;
864                 } else {
865                         repbody->valid |= OBD_MD_FLRMTPERM;
866                         repbody->aclsize = sizeof(struct mdt_remote_perm);
867                 }
868         }
869 #ifdef CONFIG_FS_POSIX_ACL
870         else if (exp_connect_flags(exp) & OBD_CONNECT_ACL) {
871                 const struct lu_env *env = info->mti_env;
872                 struct md_object *next = mdt_object_child(o);
873                 struct lu_buf *buf = &info->mti_buf;
874
875                 buf->lb_buf = req_capsule_server_get(info->mti_pill, &RMF_ACL);
876                 buf->lb_len = req_capsule_get_size(info->mti_pill, &RMF_ACL,
877                                                    RCL_SERVER);
878                 if (buf->lb_len > 0) {
879                         rc = mo_xattr_get(env, next, buf,
880                                           XATTR_NAME_ACL_ACCESS);
881                         if (rc < 0) {
882                                 if (rc == -ENODATA) {
883                                         repbody->aclsize = 0;
884                                         repbody->valid |= OBD_MD_FLACL;
885                                         rc = 0;
886                                 } else if (rc == -EOPNOTSUPP) {
887                                         rc = 0;
888                                 } else {
                                    /* Any other xattr error is only logged:
                                     * every later return in this function
                                     * either uses a constant or reassigns
                                     * rc first, so this value is never
                                     * returned to the caller. */
889                                         CERROR("got acl size: %d\n", rc);
890                                 }
891                         } else {
892                                 repbody->aclsize = rc;
893                                 repbody->valid |= OBD_MD_FLACL;
894                                 rc = 0;
895                         }
896                 }
897         }
898 #endif
899
    /* MDS capability, only when enabled on the server and negotiated by
     * the client. */
900         if (info->mti_mdt->mdt_opts.mo_mds_capa &&
901             exp_connect_flags(exp) & OBD_CONNECT_MDS_CAPA) {
902                 struct lustre_capa *capa;
903
904                 capa = req_capsule_server_get(info->mti_pill, &RMF_CAPA1);
905                 LASSERT(capa);
906                 capa->lc_opc = CAPA_OPC_MDS_DEFAULT;
907                 rc = mo_capa_get(info->mti_env, mdt_object_child(o), capa, 0);
908                 if (rc)
909                         RETURN(rc);
910                 repbody->valid |= OBD_MD_FLMDSCAPA;
911         }
912
    /* OSS capability, additionally restricted to regular files. */
913         if (info->mti_mdt->mdt_opts.mo_oss_capa &&
914             exp_connect_flags(exp) & OBD_CONNECT_OSS_CAPA &&
915             S_ISREG(lu_object_attr(&o->mot_obj))) {
916                 struct lustre_capa *capa;
917
918                 capa = req_capsule_server_get(info->mti_pill, &RMF_CAPA2);
919                 LASSERT(capa);
920                 capa->lc_opc = CAPA_OPC_OSS_DEFAULT | capa_open_opc(flags);
921                 rc = mo_capa_get(info->mti_env, mdt_object_child(o), capa, 0);
922                 if (rc)
923                         RETURN(rc);
924                 repbody->valid |= OBD_MD_FLOSSCAPA;
925         }
926
927         /*
928          * If we are following a symlink, don't open; and do not return open
929          * handle for special nodes as client required.
930          */
931         if (islnk || (!isreg && !isdir &&
932             (exp_connect_flags(req->rq_export) & OBD_CONNECT_NODEVOH))) {
933                 lustre_msg_set_transno(req->rq_repmsg, 0);
934                 RETURN(0);
935         }
936
937         /*
938          * We need to return the existing object's fid back, so it is done here,
939          * after preparing the reply.
940          */
941         if (!created && (flags & MDS_OPEN_EXCL) && (flags & MDS_OPEN_CREAT))
942                 RETURN(-EEXIST);
943
944         /* This can't be done earlier, we need to return reply body */
945         if (isdir) {
946                 if (flags & (MDS_OPEN_CREAT | FMODE_WRITE)) {
947                         /* We are trying to create or write an existing dir. */
948                         RETURN(-EISDIR);
949                 }
950         } else if (flags & MDS_OPEN_DIRECTORY)
951                 RETURN(-ENOTDIR);
952
953         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_MDS_OPEN_CREATE,
954                                  OBD_FAIL_LDLM_REPLY | OBD_FAIL_ONCE)) {
955                 RETURN(-EAGAIN);
956         }
957
    /* For a resent request, look for a handle this export already opened
     * with the same xid and reuse its cookie instead of opening again.
     * mfd is reset to NULL after each non-matching entry so that it is
     * NULL when the loop ends without a match. */
958         mfd = NULL;
959         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
960                 spin_lock(&med->med_open_lock);
961                 cfs_list_for_each(t, &med->med_open_head) {
962                         mfd = cfs_list_entry(t, struct mdt_file_data, mfd_list);
963                         if (mfd->mfd_xid == req->rq_xid)
964                                 break;
965                         mfd = NULL;
966                 }
967                 spin_unlock(&med->med_open_lock);
968
969                 if (mfd != NULL) {
970                         repbody->handle.cookie = mfd->mfd_handle.h_cookie;
971                         /* set repbody->eadatasize for the resent case */
972                         if (ma->ma_valid & MA_LOV) {
973                                 LASSERT(ma->ma_lmm_size != 0);
974                                 repbody->eadatasize = ma->ma_lmm_size;
975                                 if (isdir)
976                                         repbody->valid |= OBD_MD_FLDIREA;
977                                 else
978                                         repbody->valid |= OBD_MD_FLEASIZE;
979                         }
980                         mdt_set_disposition(info, rep, DISP_OPEN_OPEN);
981                         RETURN(0);
982                 }
983         }
984
    /* No existing handle: create a new one. */
985         rc = mdt_mfd_open(info, p, o, flags, created);
986         if (!rc)
987                 mdt_set_disposition(info, rep, DISP_OPEN_OPEN);
988
989         RETURN(rc);
990 }
991
992 extern void mdt_req_from_lcd(struct ptlrpc_request *req,
993                              struct lsd_client_data *lcd);
994
995 void mdt_reconstruct_open(struct mdt_thread_info *info,
996                           struct mdt_lock_handle *lhc)
997 {
998         const struct lu_env *env = info->mti_env;
999         struct mdt_device       *mdt  = info->mti_mdt;
1000         struct req_capsule      *pill = info->mti_pill;
1001         struct ptlrpc_request   *req  = mdt_info_req(info);
1002         struct tg_export_data   *ted  = &req->rq_export->exp_target_data;
1003         struct lsd_client_data  *lcd  = ted->ted_lcd;
1004         struct md_attr          *ma   = &info->mti_attr;
1005         struct mdt_reint_record *rr   = &info->mti_rr;
1006         __u32                   flags = info->mti_spec.sp_cr_flags;
1007         struct ldlm_reply       *ldlm_rep;
1008         struct mdt_object       *parent;
1009         struct mdt_object       *child;
1010         struct mdt_body         *repbody;
1011         int                      rc;
1012         ENTRY;
1013
1014         LASSERT(pill->rc_fmt == &RQF_LDLM_INTENT_OPEN);
1015         ldlm_rep = req_capsule_server_get(pill, &RMF_DLM_REP);
1016         repbody = req_capsule_server_get(pill, &RMF_MDT_BODY);
1017
1018         ma->ma_lmm = req_capsule_server_get(pill, &RMF_MDT_MD);
1019         ma->ma_lmm_size = req_capsule_get_size(pill, &RMF_MDT_MD,
1020                                                RCL_SERVER);
1021         ma->ma_need = MA_INODE | MA_HSM;
1022         if (ma->ma_lmm_size > 0)
1023                 ma->ma_need |= MA_LOV;
1024
1025         ma->ma_valid = 0;
1026
1027         mdt_req_from_lcd(req, lcd);
1028         mdt_set_disposition(info, ldlm_rep, lcd->lcd_last_data);
1029
1030         CDEBUG(D_INODE, "This is reconstruct open: disp="LPX64", result=%d\n",
1031                ldlm_rep->lock_policy_res1, req->rq_status);
1032
1033         if (mdt_get_disposition(ldlm_rep, DISP_OPEN_CREATE) &&
1034             req->rq_status != 0)
1035                 /* We did not create successfully, return error to client. */
1036                 GOTO(out, rc = req->rq_status);
1037
1038         if (mdt_get_disposition(ldlm_rep, DISP_OPEN_CREATE)) {
1039                 struct obd_export *exp = req->rq_export;
1040                 /*
1041                  * We failed after creation, but we do not know in which step
1042                  * we failed. So try to check the child object.
1043                  */
1044                 parent = mdt_object_find(env, mdt, rr->rr_fid1);
1045                 if (IS_ERR(parent)) {
1046                         rc = PTR_ERR(parent);
1047                         LCONSOLE_WARN("Parent "DFID" lookup error %d."
1048                                       " Evicting client %s with export %s.\n",
1049                                       PFID(rr->rr_fid1), rc,
1050                                       obd_uuid2str(&exp->exp_client_uuid),
1051                                       obd_export_nid2str(exp));
1052                         mdt_export_evict(exp);
1053                         RETURN_EXIT;
1054                 }
1055                 child = mdt_object_find(env, mdt, rr->rr_fid2);
1056                 if (IS_ERR(child)) {
1057                         rc = PTR_ERR(child);
1058                         LCONSOLE_WARN("Child "DFID" lookup error %d."
1059                                       " Evicting client %s with export %s.\n",
1060                                       PFID(mdt_object_fid(child)), rc,
1061                                       obd_uuid2str(&exp->exp_client_uuid),
1062                                       obd_export_nid2str(exp));
1063                         mdt_object_put(env, parent);
1064                         mdt_export_evict(exp);
1065                         RETURN_EXIT;
1066                 }
1067
1068                 if (unlikely(mdt_object_remote(child))) {
1069                         /* the child object was created on remote server */
1070                         if (!mdt_is_dne_client(exp)) {
1071                                 /* Return -EIO for old client */
1072                                 mdt_object_put(env, parent);
1073                                 mdt_object_put(env, child);
1074                                 GOTO(out, rc = -EIO);
1075                         }
1076                         repbody->fid1 = *rr->rr_fid2;
1077                         repbody->valid |= (OBD_MD_FLID | OBD_MD_MDS);
1078                         rc = 0;
1079                 } else {
1080                         if (mdt_object_exists(child)) {
1081                                 mdt_set_capainfo(info, 1, rr->rr_fid2,
1082                                                  BYPASS_CAPA);
1083                                 rc = mdt_attr_get_complex(info, child, ma);
1084                                 if (rc == 0)
1085                                         rc = mdt_finish_open(info, parent,
1086                                                              child, flags,
1087                                                              1, ldlm_rep);
1088                         } else {
1089                                 /* the child does not exist, we should do
1090                                  * regular open */
1091                                 mdt_object_put(env, parent);
1092                                 mdt_object_put(env, child);
1093                                 GOTO(regular_open, 0);
1094                         }
1095                 }
1096                 mdt_object_put(env, parent);
1097                 mdt_object_put(env, child);
1098                 GOTO(out, rc);
1099         } else {
1100 regular_open:
1101                 /* We did not try to create, so we are a pure open */
1102                 rc = mdt_reint_open(info, lhc);
1103         }
1104
1105         EXIT;
1106 out:
1107         req->rq_status = rc;
1108         lustre_msg_set_status(req->rq_repmsg, req->rq_status);
1109         LASSERT(ergo(rc < 0, lustre_msg_get_transno(req->rq_repmsg) == 0));
1110 }
1111
1112 int mdt_open_by_fid(struct mdt_thread_info* info,
1113                     struct ldlm_reply *rep)
1114 {
1115         __u32                    flags = info->mti_spec.sp_cr_flags;
1116         struct mdt_reint_record *rr = &info->mti_rr;
1117         struct md_attr          *ma = &info->mti_attr;
1118         struct mdt_object       *o;
1119         int                      rc;
1120         ENTRY;
1121
1122         o = mdt_object_find(info->mti_env, info->mti_mdt, rr->rr_fid2);
1123         if (IS_ERR(o))
1124                 RETURN(rc = PTR_ERR(o));
1125
1126         if (unlikely(mdt_object_remote(o))) {
1127                 /* the child object was created on remote server */
1128                 struct mdt_body *repbody;
1129
1130                 mdt_set_disposition(info, rep, (DISP_IT_EXECD |
1131                                                 DISP_LOOKUP_EXECD |
1132                                                 DISP_LOOKUP_POS));
1133                 repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
1134                 repbody->fid1 = *rr->rr_fid2;
1135                 repbody->valid |= (OBD_MD_FLID | OBD_MD_MDS);
1136                 rc = 0;
1137         } else {
1138                 if (mdt_object_exists(o)) {
1139                         mdt_set_disposition(info, rep, (DISP_IT_EXECD |
1140                                                         DISP_LOOKUP_EXECD |
1141                                                         DISP_LOOKUP_POS));
1142
1143                         rc = mdt_attr_get_complex(info, o, ma);
1144                         if (rc == 0)
1145                                 rc = mdt_finish_open(info, NULL, o, flags, 0,
1146                                                      rep);
1147                 } else {
1148                         rc = -ENOENT;
1149                 }
1150         }
1151
1152         mdt_object_put(info->mti_env, o);
1153         RETURN(rc);
1154 }
1155
1156 /* lock object for open */
1157 static int mdt_object_open_lock(struct mdt_thread_info *info,
1158                                 struct mdt_object *obj,
1159                                 struct mdt_lock_handle *lhc,
1160                                 __u64 *ibits)
1161 {
1162         struct md_attr  *ma = &info->mti_attr;
1163         __u64            open_flags = info->mti_spec.sp_cr_flags;
1164         ldlm_mode_t      lm = LCK_CR;
1165         bool             acq_lease = !!(open_flags & MDS_OPEN_LEASE);
1166         bool             try_layout = false;
1167         bool             create_layout = false;
1168         int              rc = 0;
1169         ENTRY;
1170
1171         *ibits = 0;
1172         mdt_lock_handle_init(lhc);
1173
1174         if (req_is_replay(mdt_info_req(info)))
1175                 RETURN(0);
1176
1177         if (S_ISREG(lu_object_attr(&obj->mot_obj))) {
1178                 if (ma->ma_need & MA_LOV && !(ma->ma_valid & MA_LOV) &&
1179                     md_should_create(open_flags))
1180                         create_layout = true;
1181                 if (exp_connect_layout(info->mti_exp) && !create_layout &&
1182                     ma->ma_need & MA_LOV)
1183                         try_layout = true;
1184         }
1185
1186         if (acq_lease) {
1187                 /* lease open, acquire write mode of open sem */
1188                 down_write(&obj->mot_open_sem);
1189
1190                 /* Lease exists and ask for new lease */
1191                 if (atomic_read(&obj->mot_lease_count) > 0) {
1192                         /* only exclusive open is supported, so lease
1193                          * are conflicted to each other */
1194                         GOTO(out, rc = -EBUSY);
1195                 }
1196
1197                 /* Lease must be with open lock */
1198                 if (!(open_flags & MDS_OPEN_LOCK)) {
1199                         CERROR("Request lease for file:"DFID ", but open lock "
1200                                 "is missed, open_flags = "LPO64".\n",
1201                                 PFID(mdt_object_fid(obj)), open_flags);
1202                         GOTO(out, rc = -EPROTO);
1203                 }
1204
1205                 /* XXX: only exclusive open is supported. */
1206                 lm = LCK_EX;
1207                 *ibits = MDS_INODELOCK_OPEN;
1208
1209                 /* never grant LCK_EX layout lock to client */
1210                 try_layout = false;
1211         } else { /* normal open */
1212                 /* normal open holds read mode of open sem */
1213                 down_read(&obj->mot_open_sem);
1214
1215                 if (open_flags & MDS_OPEN_LOCK) {
1216                         if (open_flags & FMODE_WRITE)
1217                                 lm = LCK_CW;
1218                         /* if file is released, we can't deny write because we must
1219                          * restore (write) it to access it. */
1220                         else if ((open_flags & MDS_FMODE_EXEC) &&
1221                                  !((ma->ma_valid & MA_HSM) &&
1222                                    (ma->ma_hsm.mh_flags & HS_RELEASED)))
1223                                 lm = LCK_PR;
1224                         else
1225                                 lm = LCK_CR;
1226
1227                         *ibits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_OPEN;
1228                 } else if (atomic_read(&obj->mot_lease_count) > 0) {
1229                         if (open_flags & FMODE_WRITE)
1230                                 lm = LCK_CW;
1231                         else
1232                                 lm = LCK_CR;
1233
1234                         /* revoke lease */
1235                         *ibits = MDS_INODELOCK_OPEN;
1236                         try_layout = false;
1237
1238                         lhc = &info->mti_lh[MDT_LH_LOCAL];
1239                 }
1240                 CDEBUG(D_INODE, "normal open:"DFID" lease count: %d, lm: %d\n",
1241                         PFID(mdt_object_fid(obj)),
1242                         atomic_read(&obj->mot_open_count), lm);
1243         }
1244
1245         mdt_lock_reg_init(lhc, lm);
1246
1247         /* one problem to return layout lock on open is that it may result
1248          * in too many layout locks cached on the client side. */
1249         if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_NO_LL_OPEN) && try_layout) {
1250                 /* return lookup lock to validate inode at the client side,
1251                  * this is pretty important otherwise mdt will return layout
1252                  * lock for each open.
1253                  * However this is a double-edged sword because changing
1254                  * permission will revoke huge # of LOOKUP locks. */
1255                 *ibits |= MDS_INODELOCK_LAYOUT | MDS_INODELOCK_LOOKUP;
1256                 if (!mdt_object_lock_try(info, obj, lhc, *ibits,
1257                                          MDT_CROSS_LOCK)) {
1258                         *ibits &= ~(MDS_INODELOCK_LAYOUT|MDS_INODELOCK_LOOKUP);
1259                         if (*ibits != 0)
1260                                 rc = mdt_object_lock(info, obj, lhc, *ibits,
1261                                                 MDT_CROSS_LOCK);
1262                 }
1263         } else if (*ibits != 0) {
1264                 rc = mdt_object_lock(info, obj, lhc, *ibits, MDT_CROSS_LOCK);
1265         }
1266
1267         CDEBUG(D_INODE, "Requested bits lock:"DFID ", ibits = "LPX64
1268                 ", open_flags = "LPO64", try_layout = %d, rc = %d\n",
1269                 PFID(mdt_object_fid(obj)), *ibits, open_flags, try_layout, rc);
1270
1271         /* will change layout, revoke layout locks by enqueuing EX lock. */
1272         if (rc == 0 && create_layout) {
1273                 struct mdt_lock_handle *ll = &info->mti_lh[MDT_LH_LAYOUT];
1274
1275                 CDEBUG(D_INODE, "Will create layout, get EX layout lock:"DFID
1276                         ", open_flags = "LPO64"\n",
1277                         PFID(mdt_object_fid(obj)), open_flags);
1278
1279                 LASSERT(!try_layout);
1280                 mdt_lock_handle_init(ll);
1281                 mdt_lock_reg_init(ll, LCK_EX);
1282                 rc = mdt_object_lock(info, obj, ll, MDS_INODELOCK_LAYOUT,
1283                                         MDT_LOCAL_LOCK);
1284
1285                 OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_LL_BLOCK, 2);
1286         }
1287
1288         /* Check if there is any other open handles after acquiring
1289          * open lock. At this point, caching open handles have been revoked
1290          * by open lock.
1291          * XXX: Now only exclusive open is supported. Need to check the
1292          * type of open for generic lease support. */
1293         if (rc == 0 && acq_lease) {
1294                 struct ptlrpc_request *req = mdt_info_req(info);
1295                 struct mdt_export_data *med = &req->rq_export->exp_mdt_data;
1296                 struct mdt_file_data *mfd;
1297                 bool is_replay_or_resent;
1298                 int open_count = 0;
1299
1300                 /* For lease: application can open a file and then apply lease,
1301                  * @handle contains original open handle in that case.
1302                  * In recovery, open REQ will be replayed and the lease REQ may
1303                  * be resent that means the open handle is already stale, so we
1304                  * need to fix it up here by finding new handle. */
1305                 is_replay_or_resent = req_is_replay(req) ||
1306                         lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT;
1307
1308                 /* if the request is _not_ a replay request, rr_handle
1309                  * may be used to hold an openhandle which is issuing the
1310                  * lease request, so that this openhandle doesn't count. */
1311                 mfd = mdt_handle2mfd(med, info->mti_rr.rr_handle,
1312                                      is_replay_or_resent);
1313                 if (mfd != NULL)
1314                         ++open_count;
1315
1316                 CDEBUG(D_INODE, "acq_lease "DFID": openers: %d, want: %d\n",
1317                         PFID(mdt_object_fid(obj)),
1318                         atomic_read(&obj->mot_open_count), open_count);
1319
1320                 if (atomic_read(&obj->mot_open_count) > open_count)
1321                         GOTO(out, rc = -EBUSY);
1322         }
1323         GOTO(out, rc);
1324
1325 out:
1326         RETURN(rc);
1327 }
1328
1329 static void mdt_object_open_unlock(struct mdt_thread_info *info,
1330                                    struct mdt_object *obj,
1331                                    struct mdt_lock_handle *lhc,
1332                                    __u64 ibits, int rc)
1333 {
1334         __u64 open_flags = info->mti_spec.sp_cr_flags;
1335         struct mdt_lock_handle *ll = &info->mti_lh[MDT_LH_LOCAL];
1336         ENTRY;
1337
1338         if (req_is_replay(mdt_info_req(info)))
1339                 RETURN_EXIT;
1340
1341         /* Release local lock - the lock put in MDT_LH_LOCAL will never
1342          * return to client side. */
1343         if (lustre_handle_is_used(&ll->mlh_reg_lh))
1344                 mdt_object_unlock(info, obj, ll, 1);
1345
1346         ll = &info->mti_lh[MDT_LH_LAYOUT];
1347         /* Release local layout lock, layout was created */
1348         if (lustre_handle_is_used(&ll->mlh_reg_lh)) {
1349                 LASSERT(!(ibits & MDS_INODELOCK_LAYOUT));
1350                 mdt_object_unlock(info, obj, ll, 1);
1351         }
1352
1353         if (open_flags & MDS_OPEN_LEASE)
1354                 up_write(&obj->mot_open_sem);
1355         else
1356                 up_read(&obj->mot_open_sem);
1357
1358         /* Cross-ref case, the lock should be returned to the client */
1359         if (ibits == 0 || rc == -EREMOTE)
1360                 RETURN_EXIT;
1361
1362         if (!(open_flags & MDS_OPEN_LOCK) && !(ibits & MDS_INODELOCK_LAYOUT)) {
1363                 /* for the open request, the lock will only return to client
1364                  * if open or layout lock is granted. */
1365                 rc = 1;
1366         }
1367
1368         if (rc != 0) {
1369                 struct ldlm_reply       *ldlm_rep;
1370
1371                 ldlm_rep = req_capsule_server_get(info->mti_pill, &RMF_DLM_REP);
1372                 mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_LOCK);
1373                 mdt_object_unlock(info, obj, lhc, 1);
1374         }
1375         RETURN_EXIT;
1376 }
1377
1378 /**
1379  * Check release is permitted for the current HSM flags.
1380  */
1381 static bool mdt_hsm_release_allow(struct md_attr *ma)
1382 {
1383         if (!(ma->ma_valid & MA_HSM))
1384                 return false;
1385
1386         if (ma->ma_hsm.mh_flags & (HS_DIRTY|HS_NORELEASE|HS_LOST))
1387                 return false;
1388
1389         if (!(ma->ma_hsm.mh_flags & HS_ARCHIVED))
1390                 return false;
1391
1392         return true;
1393 }
1394
1395 int mdt_open_by_fid_lock(struct mdt_thread_info *info, struct ldlm_reply *rep,
1396                          struct mdt_lock_handle *lhc)
1397 {
1398         const struct lu_env     *env   = info->mti_env;
1399         struct mdt_device       *mdt   = info->mti_mdt;
1400         __u64                    flags = info->mti_spec.sp_cr_flags;
1401         struct mdt_reint_record *rr    = &info->mti_rr;
1402         struct md_attr          *ma    = &info->mti_attr;
1403         struct mdt_object       *parent= NULL;
1404         struct mdt_object       *o;
1405         int                      rc;
1406         __u64                    ibits = 0;
1407         ENTRY;
1408
1409         if (md_should_create(flags) && !(flags & MDS_OPEN_HAS_EA)) {
1410                 if (!lu_fid_eq(rr->rr_fid1, rr->rr_fid2)) {
1411                         parent = mdt_object_find(env, mdt, rr->rr_fid1);
1412                         if (IS_ERR(parent)) {
1413                                 CDEBUG(D_INODE, "Fail to find parent "DFID
1414                                        " for anonymous created %ld, try to"
1415                                        " use server-side parent.\n",
1416                                        PFID(rr->rr_fid1), PTR_ERR(parent));
1417                                 parent = NULL;
1418                         }
1419                 }
1420                 if (parent == NULL)
1421                         ma->ma_need |= MA_PFID;
1422         }
1423
1424         o = mdt_object_find(env, mdt, rr->rr_fid2);
1425         if (IS_ERR(o))
1426                 RETURN(rc = PTR_ERR(o));
1427
1428         if (mdt_object_remote(o)) {
1429                 CDEBUG(D_INFO, "%s: "DFID" is on remote MDT.\n",
1430                        mdt_obd_name(info->mti_mdt),
1431                        PFID(rr->rr_fid2));
1432                 GOTO(out, rc = -EREMOTE);
1433         } else if (!mdt_object_exists(o)) {
1434                 mdt_set_disposition(info, rep,
1435                                     DISP_IT_EXECD |
1436                                     DISP_LOOKUP_EXECD |
1437                                     DISP_LOOKUP_NEG);
1438                 GOTO(out, rc = -ENOENT);
1439         }
1440
1441         mdt_set_disposition(info, rep, (DISP_IT_EXECD | DISP_LOOKUP_EXECD));
1442
1443         if (flags & MDS_OPEN_RELEASE)
1444                 ma->ma_need |= MA_HSM;
1445         rc = mdt_attr_get_complex(info, o, ma);
1446         if (rc)
1447                 GOTO(out, rc);
1448
1449         /* If a release request, check file flags are fine and ask for an
1450          * exclusive open access. */
1451         if (flags & MDS_OPEN_RELEASE && !mdt_hsm_release_allow(ma))
1452                 GOTO(out, rc = -EPERM);
1453
1454         rc = mdt_object_open_lock(info, o, lhc, &ibits);
1455         if (rc)
1456                 GOTO(out_unlock, rc);
1457
1458         if (ma->ma_valid & MA_PFID) {
1459                 parent = mdt_object_find(env, mdt, &ma->ma_pfid);
1460                 if (IS_ERR(parent)) {
1461                         CDEBUG(D_INODE, "Fail to find parent "DFID
1462                                " for anonymous created %ld, try to"
1463                                " use system default.\n",
1464                                PFID(&ma->ma_pfid), PTR_ERR(parent));
1465                         parent = NULL;
1466                 }
1467         }
1468
1469         rc = mdt_finish_open(info, parent, o, flags, 0, rep);
1470         if (!rc) {
1471                 mdt_set_disposition(info, rep, DISP_LOOKUP_POS);
1472                 if (flags & MDS_OPEN_LOCK)
1473                         mdt_set_disposition(info, rep, DISP_OPEN_LOCK);
1474                 if (flags & MDS_OPEN_LEASE)
1475                         mdt_set_disposition(info, rep, DISP_OPEN_LEASE);
1476         }
1477         GOTO(out_unlock, rc);
1478
1479 out_unlock:
1480         mdt_object_open_unlock(info, o, lhc, ibits, rc);
1481 out:
1482         mdt_object_put(env, o);
1483         if (parent != NULL)
1484                 mdt_object_put(env, parent);
1485         return rc;
1486 }
1487
1488 int mdt_pin(struct mdt_thread_info* info)
1489 {
1490         ENTRY;
1491         RETURN(err_serious(-EOPNOTSUPP));
1492 }
1493
1494 /* Cross-ref request. Currently it can only be a pure open (w/o create) */
1495 static int mdt_cross_open(struct mdt_thread_info *info,
1496                           const struct lu_fid *parent_fid,
1497                           const struct lu_fid *fid,
1498                           struct ldlm_reply *rep, __u32 flags)
1499 {
1500         struct md_attr    *ma = &info->mti_attr;
1501         struct mdt_object *o;
1502         int                rc;
1503         ENTRY;
1504
1505         o = mdt_object_find(info->mti_env, info->mti_mdt, fid);
1506         if (IS_ERR(o))
1507                 RETURN(rc = PTR_ERR(o));
1508
1509         if (mdt_object_remote(o)) {
1510                 /* Something is wrong here, the object is on another MDS! */
1511                 CERROR("%s: "DFID" isn't on this server!: rc = %d\n",
1512                        mdt_obd_name(info->mti_mdt), PFID(fid), -EFAULT);
1513                 LU_OBJECT_DEBUG(D_WARNING, info->mti_env,
1514                                 &o->mot_obj,
1515                                 "Object isn't on this server! FLD error?\n");
1516                 rc = -EFAULT;
1517         } else {
1518                 if (mdt_object_exists(o)) {
1519                         /* Do permission check for cross-open. */
1520                         rc = mo_permission(info->mti_env, NULL,
1521                                            mdt_object_child(o),
1522                                            NULL, flags | MDS_OPEN_CROSS);
1523                         if (rc)
1524                                 goto out;
1525
1526                         mdt_set_capainfo(info, 0, fid, BYPASS_CAPA);
1527                         rc = mdt_attr_get_complex(info, o, ma);
1528                         if (rc != 0)
1529                                 GOTO(out, rc);
1530
1531                         /* Do not create lov object if the fid is opened
1532                          * under OBF */
1533                         if (S_ISREG(ma->ma_attr.la_mode) &&
1534                             !(ma->ma_valid & MA_LOV) && (flags & FMODE_WRITE) &&
1535                             fid_is_obf(parent_fid))
1536                                 GOTO(out, rc = -EPERM);
1537
1538                         rc = mdt_finish_open(info, NULL, o, flags, 0, rep);
1539                 } else {
1540                         /*
1541                          * Something is wrong here. lookup was positive but
1542                          * there is no object!
1543                          */
1544                         CERROR("%s: "DFID" doesn't exist!: rc = %d\n",
1545                               mdt_obd_name(info->mti_mdt), PFID(fid), -EFAULT);
1546                         rc = -EFAULT;
1547                 }
1548         }
1549 out:
1550         mdt_object_put(info->mti_env, o);
1551         RETURN(rc);
1552 }
1553
1554 int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc)
1555 {
1556         struct mdt_device       *mdt = info->mti_mdt;
1557         struct ptlrpc_request   *req = mdt_info_req(info);
1558         struct mdt_object       *parent;
1559         struct mdt_object       *child;
1560         struct mdt_lock_handle  *lh;
1561         struct ldlm_reply       *ldlm_rep;
1562         struct mdt_body         *repbody;
1563         struct lu_fid           *child_fid = &info->mti_tmp_fid1;
1564         struct md_attr          *ma = &info->mti_attr;
1565         __u64                    create_flags = info->mti_spec.sp_cr_flags;
1566         __u64                    ibits;
1567         struct mdt_reint_record *rr = &info->mti_rr;
1568         struct lu_name          *lname;
1569         int                      result, rc;
1570         int                      created = 0;
1571         __u32                    msg_flags;
1572         ENTRY;
1573
1574         OBD_FAIL_TIMEOUT_ORSET(OBD_FAIL_MDS_PAUSE_OPEN, OBD_FAIL_ONCE,
1575                                (obd_timeout + 1) / 4);
1576
1577         mdt_counter_incr(req, LPROC_MDT_OPEN);
1578         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
1579
1580         ma->ma_lmm = req_capsule_server_get(info->mti_pill, &RMF_MDT_MD);
1581         ma->ma_lmm_size = req_capsule_get_size(info->mti_pill, &RMF_MDT_MD,
1582                                                RCL_SERVER);
1583         ma->ma_need = MA_INODE;
1584         if (ma->ma_lmm_size > 0)
1585                 ma->ma_need |= MA_LOV;
1586
1587         ma->ma_valid = 0;
1588
1589         LASSERT(info->mti_pill->rc_fmt == &RQF_LDLM_INTENT_OPEN);
1590         ldlm_rep = req_capsule_server_get(info->mti_pill, &RMF_DLM_REP);
1591
1592         if (unlikely(create_flags & MDS_OPEN_JOIN_FILE)) {
1593                 CERROR("file join is not supported anymore.\n");
1594                 GOTO(out, result = err_serious(-EOPNOTSUPP));
1595         }
1596         msg_flags = lustre_msg_get_flags(req->rq_reqmsg);
1597
1598         if ((create_flags & (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS)) &&
1599             info->mti_spec.u.sp_ea.eadata == NULL)
1600                 GOTO(out, result = err_serious(-EINVAL));
1601
1602         CDEBUG(D_INODE, "I am going to open "DFID"/(%s->"DFID") "
1603                "cr_flag="LPO64" mode=0%06o msg_flag=0x%x\n",
1604                PFID(rr->rr_fid1), rr->rr_name,
1605                PFID(rr->rr_fid2), create_flags,
1606                ma->ma_attr.la_mode, msg_flags);
1607         if (info->mti_cross_ref) {
1608                 /* This is cross-ref open */
1609                 mdt_set_disposition(info, ldlm_rep,
1610                             (DISP_IT_EXECD | DISP_LOOKUP_EXECD |
1611                              DISP_LOOKUP_POS));
1612                 result = mdt_cross_open(info, rr->rr_fid2, rr->rr_fid1,
1613                                         ldlm_rep, create_flags);
1614                 GOTO(out, result);
1615         } else if (req_is_replay(req) ||
1616             (req->rq_export->exp_libclient && create_flags & MDS_OPEN_HAS_EA)) {
1617                 /* This is a replay request or from liblustre with ea. */
1618                 result = mdt_open_by_fid(info, ldlm_rep);
1619
1620                 if (result != -ENOENT) {
1621                         if (req->rq_export->exp_libclient &&
1622                             create_flags & MDS_OPEN_HAS_EA)
1623                                 GOTO(out, result = 0);
1624                         GOTO(out, result);
1625                 }
1626                 /* We didn't find the correct object, so we need to re-create it
1627                  * via a regular replay. */
1628                 if (!(create_flags & MDS_OPEN_CREAT)) {
1629                         DEBUG_REQ(D_ERROR, req,
1630                                   "OPEN & CREAT not in open replay/by_fid.");
1631                         GOTO(out, result = -EFAULT);
1632                 }
1633                 CDEBUG(D_INFO, "No object(1), continue as regular open.\n");
1634         } else if ((rr->rr_namelen == 0 && create_flags & MDS_OPEN_LOCK) ||
1635                    (create_flags & MDS_OPEN_BY_FID)) {
1636                 result = mdt_open_by_fid_lock(info, ldlm_rep, lhc);
1637                 /* If result is 0 then open by FID has found the file
1638                  * and there is nothing left for us to do here.  More
1639                  * generally if it is anything other than -ENOENT or
1640                  * -EREMOTE then we return that now.  If -ENOENT and
1641                  * MDS_OPEN_CREAT is set then we must create the file
1642                  * below.  If -EREMOTE then we need to return a LOOKUP
1643                  * lock to the client, which we do below.  Hence this
1644                  * odd looking condition.  See LU-2523. */
1645                 if (!(result == -ENOENT && (create_flags & MDS_OPEN_CREAT)) &&
1646                     result != -EREMOTE)
1647                         GOTO(out, result);
1648
1649                 if (unlikely(rr->rr_namelen == 0))
1650                         GOTO(out, result = -EINVAL);
1651
1652                 CDEBUG(D_INFO, "No object(2), continue as regular open.\n");
1653         }
1654
1655         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OPEN_PACK))
1656                 GOTO(out, result = err_serious(-ENOMEM));
1657
1658         mdt_set_disposition(info, ldlm_rep,
1659                             (DISP_IT_EXECD | DISP_LOOKUP_EXECD));
1660
1661         lh = &info->mti_lh[MDT_LH_PARENT];
1662         mdt_lock_pdo_init(lh, (create_flags & MDS_OPEN_CREAT) ?
1663                           LCK_PW : LCK_PR, rr->rr_name, rr->rr_namelen);
1664
1665         parent = mdt_object_find_lock(info, rr->rr_fid1, lh,
1666                                       MDS_INODELOCK_UPDATE);
1667         if (IS_ERR(parent))
1668                 GOTO(out, result = PTR_ERR(parent));
1669
1670         /* get and check version of parent */
1671         result = mdt_version_get_check(info, parent, 0);
1672         if (result)
1673                 GOTO(out_parent, result);
1674
1675         fid_zero(child_fid);
1676
1677         lname = mdt_name(info->mti_env, (char *)rr->rr_name, rr->rr_namelen);
1678         result = mdo_lookup(info->mti_env, mdt_object_child(parent),
1679                             lname, child_fid, &info->mti_spec);
1680         LASSERTF(ergo(result == 0, fid_is_sane(child_fid)),
1681                  "looking for "DFID"/%s, result fid="DFID"\n",
1682                  PFID(mdt_object_fid(parent)), rr->rr_name, PFID(child_fid));
1683
1684         if (result != 0 && result != -ENOENT && result != -ESTALE)
1685                 GOTO(out_parent, result);
1686
1687         if (result == -ENOENT || result == -ESTALE) {
1688                 mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_NEG);
1689                 if (result == -ESTALE) {
1690                         /*
1691                          * -ESTALE means the parent is a dead(unlinked) dir, so
1692                          * it should return -ENOENT to in accordance with the
1693                          * original mds implementaion.
1694                          */
1695                         GOTO(out_parent, result = -ENOENT);
1696                 }
1697                 if (!(create_flags & MDS_OPEN_CREAT))
1698                         GOTO(out_parent, result);
1699                 *child_fid = *info->mti_rr.rr_fid2;
1700                 LASSERTF(fid_is_sane(child_fid), "fid="DFID"\n",
1701                          PFID(child_fid));
1702                 /* In the function below, .hs_keycmp resolves to
1703                  * lu_obj_hop_keycmp() */
1704                 /* coverity[overrun-buffer-val] */
1705                 child = mdt_object_new(info->mti_env, mdt, child_fid);
1706         } else {
1707                 /*
1708                  * Check for O_EXCL is moved to the mdt_finish_open(), we need to
1709                  * return FID back in that case.
1710                  */
1711                 mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_POS);
1712                 child = mdt_object_find(info->mti_env, mdt, child_fid);
1713         }
1714         if (IS_ERR(child))
1715                 GOTO(out_parent, result = PTR_ERR(child));
1716
1717         /** check version of child  */
1718         rc = mdt_version_get_check(info, child, 1);
1719         if (rc)
1720                 GOTO(out_child, result = rc);
1721
1722         mdt_set_capainfo(info, 1, child_fid, BYPASS_CAPA);
1723         if (result == -ENOENT) {
1724                 /* Create under OBF and .lustre is not permitted */
1725                 if (fid_is_obf(rr->rr_fid1) || fid_is_dot_lustre(rr->rr_fid1))
1726                         GOTO(out_child, result = -EPERM);
1727
1728                 /* save versions in reply */
1729                 mdt_version_get_save(info, parent, 0);
1730                 mdt_version_get_save(info, child, 1);
1731
1732                 /* version of child will be changed */
1733                 info->mti_mos = child;
1734
1735                 /* Not found and with MDS_OPEN_CREAT: let's create it. */
1736                 mdt_set_disposition(info, ldlm_rep, DISP_OPEN_CREATE);
1737
1738                 /* Let lower layers know what is lock mode on directory. */
1739                 info->mti_spec.sp_cr_mode =
1740                         mdt_dlm_mode2mdl_mode(lh->mlh_pdo_mode);
1741
1742                 /*
1743                  * Do not perform lookup sanity check. We know that name does
1744                  * not exist.
1745                  */
1746                 info->mti_spec.sp_cr_lookup = 0;
1747                 info->mti_spec.sp_feat = &dt_directory_features;
1748
1749                 result = mdo_create(info->mti_env,
1750                                     mdt_object_child(parent),
1751                                     lname,
1752                                     mdt_object_child(child),
1753                                     &info->mti_spec,
1754                                     &info->mti_attr);
1755                 if (result == -ERESTART) {
1756                         mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_CREATE);
1757                         GOTO(out_child, result);
1758                 } else {
1759
1760                         /* XXX: we should call this once, see few lines below */
1761                         if (result == 0)
1762                                 result = mdt_attr_get_complex(info, child, ma);
1763
1764                         if (result != 0)
1765                                 GOTO(out_child, result);
1766                 }
1767                 created = 1;
1768         } else {
1769                 /*
1770                  * The object is on remote node, return its FID for remote open.
1771                  */
1772                 if (mdt_object_remote(child)) {
1773                         /*
1774                          * Check if this lock already was sent to client and
1775                          * this is resent case. For resent case do not take lock
1776                          * again, use what is already granted.
1777                          */
1778                         LASSERT(lhc != NULL);
1779
1780                         if (lustre_handle_is_used(&lhc->mlh_reg_lh)) {
1781                                 struct ldlm_lock *lock;
1782
1783                                 LASSERT(msg_flags & MSG_RESENT);
1784
1785                                 lock = ldlm_handle2lock(&lhc->mlh_reg_lh);
1786                                 if (!lock) {
1787                                         CERROR("Invalid lock handle "LPX64"\n",
1788                                                lhc->mlh_reg_lh.cookie);
1789                                         LBUG();
1790                                 }
1791                                 LASSERT(fid_res_name_eq(mdt_object_fid(child),
1792                                                         &lock->l_resource->lr_name));
1793                                 LDLM_LOCK_PUT(lock);
1794                                 rc = 0;
1795                         } else {
1796                                 mdt_lock_handle_init(lhc);
1797                                 mdt_lock_reg_init(lhc, LCK_PR);
1798
1799                                 rc = mdt_object_lock(info, child, lhc,
1800                                                      MDS_INODELOCK_LOOKUP,
1801                                                      MDT_CROSS_LOCK);
1802                         }
1803                         repbody->fid1 = *mdt_object_fid(child);
1804                         repbody->valid |= (OBD_MD_FLID | OBD_MD_MDS);
1805                         if (rc != 0)
1806                                 result = rc;
1807                         else
1808                                 result = -EREMOTE;
1809                         GOTO(out_child, result);
1810                 } else {
1811                         if (mdt_object_exists(child)) {
1812                                 /* We have to get attr & LOV EA & HSM for this
1813                                  * object */
1814                                 ma->ma_need |= MA_HSM;
1815                                 result = mdt_attr_get_complex(info, child, ma);
1816                         } else {
1817                                 /*object non-exist!!!*/
1818                                 LBUG();
1819                         }
1820                 }
1821         }
1822
1823         LASSERT(!lustre_handle_is_used(&lhc->mlh_reg_lh));
1824
1825         /* get openlock if this is not replay and if a client requested it */
1826         if (!req_is_replay(req)) {
1827                 rc = mdt_object_open_lock(info, child, lhc, &ibits);
1828                 if (rc != 0)
1829                         GOTO(out_child_unlock, result = rc);
1830                 else if (create_flags & MDS_OPEN_LOCK)
1831                         mdt_set_disposition(info, ldlm_rep, DISP_OPEN_LOCK);
1832         }
1833
1834         /* Try to open it now. */
1835         rc = mdt_finish_open(info, parent, child, create_flags,
1836                              created, ldlm_rep);
1837         if (rc) {
1838                 result = rc;
1839                 /* openlock will be released if mdt_finish_open failed */
1840                 mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_LOCK);
1841
1842                 if (created && create_flags & MDS_OPEN_VOLATILE) {
1843                         CERROR("%s: cannot open volatile file "DFID", orphan "
1844                                "file will be left in PENDING directory until "
1845                                "next reboot, rc = %d\n", mdt_obd_name(mdt),
1846                                PFID(mdt_object_fid(child)), rc);
1847                         GOTO(out_child_unlock, result);
1848                 }
1849
1850                 if (created) {
1851                         ma->ma_need = 0;
1852                         ma->ma_valid = 0;
1853                         ma->ma_cookie_size = 0;
1854                         rc = mdo_unlink(info->mti_env,
1855                                         mdt_object_child(parent),
1856                                         mdt_object_child(child),
1857                                         lname,
1858                                         &info->mti_attr, 0);
1859                         if (rc != 0)
1860                                 CERROR("%s: "DFID" cleanup of open: rc = %d\n",
1861                                        mdt_obd_name(info->mti_mdt),
1862                                        PFID(mdt_object_fid(child)), rc);
1863                         mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_CREATE);
1864                 }
1865         }
1866         EXIT;
1867 out_child_unlock:
1868         mdt_object_open_unlock(info, child, lhc, ibits, result);
1869 out_child:
1870         mdt_object_put(info->mti_env, child);
1871 out_parent:
1872         mdt_object_unlock_put(info, parent, lh, result || !created);
1873 out:
1874         if (result)
1875                 lustre_msg_set_transno(req->rq_repmsg, 0);
1876         return result;
1877 }
1878
1879 /**
1880  * Create an orphan object use local root.
1881  */
1882 static struct mdt_object *mdt_orphan_open(struct mdt_thread_info *info,
1883                                           struct mdt_device *mdt,
1884                                           const struct lu_fid *fid,
1885                                           struct md_attr *attr, fmode_t fmode)
1886 {
1887         const struct lu_env *env = info->mti_env;
1888         struct md_op_spec *spec = &info->mti_spec;
1889         struct lu_fid *rootfid = &info->mti_tmp_fid1;
1890         struct mdt_object *obj = NULL;
1891         struct mdt_object *local_root;
1892         static const char name[] = "i_am_nobody";
1893         struct lu_name *lname;
1894         int rc;
1895         ENTRY;
1896
1897         rc = dt_root_get(env, mdt->mdt_bottom, rootfid);
1898         if (rc != 0)
1899                 RETURN(ERR_PTR(rc));
1900
1901         local_root = mdt_object_find(env, mdt, rootfid);
1902         if (IS_ERR(local_root))
1903                 RETURN(local_root);
1904
1905         obj = mdt_object_new(env, mdt, fid);
1906         if (IS_ERR(obj))
1907                 GOTO(out, rc = PTR_ERR(obj));
1908
1909         spec->sp_cr_lookup = 0;
1910         spec->sp_feat = &dt_directory_features;
1911         spec->sp_cr_mode = MDL_MINMODE; /* no lock */
1912         spec->sp_cr_flags = MDS_OPEN_VOLATILE | fmode;
1913         if (attr->ma_valid & MA_LOV) {
1914                 spec->u.sp_ea.eadata = attr->ma_lmm;
1915                 spec->u.sp_ea.eadatalen = attr->ma_lmm_size;
1916                 spec->sp_cr_flags |= MDS_OPEN_HAS_EA;
1917         } else {
1918                 spec->sp_cr_flags |= MDS_OPEN_DELAY_CREATE;
1919         }
1920
1921         lname = mdt_name(env, (char *)name, sizeof(name) - 1);
1922         rc = mdo_create(env, mdt_object_child(local_root), lname,
1923                         mdt_object_child(obj), spec, attr);
1924         if (rc == 0) {
1925                 rc = mo_open(env, mdt_object_child(obj), MDS_OPEN_CREATED);
1926                 if (rc < 0)
1927                         CERROR("%s: cannot open volatile file "DFID", orphan "
1928                                "file will be left in PENDING directory until "
1929                                "next reboot, rc = %d\n", mdt_obd_name(mdt),
1930                                PFID(fid), rc);
1931         }
1932         EXIT;
1933
1934 out:
1935         if (rc < 0) {
1936                 if (!IS_ERR(obj))
1937                         mdt_object_put(env, obj);
1938                 obj = ERR_PTR(rc);
1939         }
1940         mdt_object_put(env, local_root);
1941         return obj;
1942 }
1943
1944 static int mdt_hsm_release(struct mdt_thread_info *info, struct mdt_object *o,
1945                            struct md_attr *ma)
1946 {
1947         struct mdt_lock_handle *lh = &info->mti_lh[MDT_LH_LAYOUT];
1948         struct close_data      *data;
1949         struct ldlm_lock       *lease;
1950         struct mdt_object      *orphan;
1951         struct md_attr         *orp_ma;
1952         struct lu_buf          *buf;
1953         bool                    lease_broken;
1954         int                     rc;
1955         int                     rc2;
1956         ENTRY;
1957
1958         data = req_capsule_client_get(info->mti_pill, &RMF_CLOSE_DATA);
1959         if (data == NULL)
1960                 RETURN(-EPROTO);
1961
1962         lease = ldlm_handle2lock(&data->cd_handle);
1963         if (lease == NULL)
1964                 RETURN(-ESTALE);
1965
1966         /* try to hold open_sem so that nobody else can open the file */
1967         if (!down_write_trylock(&o->mot_open_sem)) {
1968                 ldlm_lock_cancel(lease);
1969                 LDLM_LOCK_PUT(lease);
1970                 RETURN(-EBUSY);
1971         }
1972
1973         /* Check if the lease open lease has already canceled */
1974         lock_res_and_lock(lease);
1975         lease_broken = ldlm_is_cancel(lease);
1976         unlock_res_and_lock(lease);
1977
1978         LDLM_DEBUG(lease, DFID " lease broken? %d\n",
1979                    PFID(mdt_object_fid(o)), lease_broken);
1980
1981         /* Cancel server side lease. Client side counterpart should
1982          * have been cancelled. It's okay to cancel it now as we've
1983          * held mot_open_sem. */
1984         ldlm_lock_cancel(lease);
1985         LDLM_LOCK_PUT(lease);
1986
1987         if (lease_broken) /* don't perform release task */
1988                 GOTO(out_unlock, rc = -ESTALE);
1989
1990         if (fid_is_zero(&data->cd_fid) || !fid_is_sane(&data->cd_fid))
1991                 GOTO(out_unlock, rc = -EINVAL);
1992
1993         /* ma_need was set before but it seems fine to change it in order to
1994          * avoid modifying the one from RPC */
1995         ma->ma_need = MA_HSM | MA_LOV;
1996         rc = mdt_attr_get_complex(info, o, ma);
1997         if (rc != 0)
1998                 GOTO(out_unlock, rc);
1999
2000         if (!mdt_hsm_release_allow(ma))
2001                 GOTO(out_unlock, rc = -EPERM);
2002
2003         /* already released? */
2004         if (ma->ma_hsm.mh_flags & HS_RELEASED)
2005                 GOTO(out_unlock, rc = 0);
2006
2007         /* Compare on-disk and packed data_version */
2008         if (data->cd_data_version != ma->ma_hsm.mh_arch_ver) {
2009                 CDEBUG(D_HSM, DFID" data_version mismatches: packed="LPU64
2010                        " and on-disk="LPU64"\n", PFID(mdt_object_fid(o)),
2011                        data->cd_data_version, ma->ma_hsm.mh_arch_ver);
2012                 /* XXX: Enable this line when hsm_archive is operational!
2013                 GOTO(out_unlock, rc = -EPERM);
2014                 */
2015         }
2016
2017         ma->ma_valid = MA_INODE;
2018         ma->ma_attr.la_valid &= LA_SIZE | LA_MTIME | LA_ATIME;
2019         rc = mo_attr_set(info->mti_env, mdt_object_child(o), ma);
2020         if (rc < 0)
2021                 GOTO(out_unlock, rc);
2022
2023         if (!(ma->ma_valid & MA_LOV)) {
2024                 /* Even empty file are released */
2025                 memset(ma->ma_lmm, 0, sizeof(*ma->ma_lmm));
2026                 ma->ma_lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1_DEF);
2027                 ma->ma_lmm->lmm_pattern = cpu_to_le32(LOV_PATTERN_RAID0);
2028                 ma->ma_lmm->lmm_stripe_size = cpu_to_le32(LOV_MIN_STRIPE_SIZE);
2029                 ma->ma_valid |= MA_LOV;
2030         } else {
2031                 /* Magic must be LOV_MAGIC_Vx_DEF otherwise LOD will interpret
2032                  * ma_lmm as lov_user_md, then it will be confused by union of
2033                  * layout_gen and stripe_offset. */
2034                 if (le32_to_cpu(ma->ma_lmm->lmm_magic) == LOV_MAGIC_V1)
2035                         ma->ma_lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1_DEF);
2036                 else if (le32_to_cpu(ma->ma_lmm->lmm_magic) == LOV_MAGIC_V3)
2037                         ma->ma_lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V3_DEF);
2038                 else
2039                         GOTO(out_unlock, rc = -EINVAL);
2040         }
2041
2042         /* Set file as released */
2043         ma->ma_lmm->lmm_pattern |= cpu_to_le32(LOV_PATTERN_F_RELEASED);
2044
2045         /* Hopefully it's not used in this call path */
2046         orp_ma = &info->mti_u.som.attr;
2047         orp_ma->ma_valid = MA_INODE | MA_LOV;
2048         orp_ma->ma_attr.la_mode = S_IFREG;
2049         orp_ma->ma_attr.la_valid = LA_MODE;
2050         orp_ma->ma_lmm = ma->ma_lmm;
2051         orp_ma->ma_lmm_size = ma->ma_lmm_size;
2052         orphan = mdt_orphan_open(info, info->mti_mdt, &data->cd_fid, orp_ma,
2053                                  FMODE_WRITE);
2054         if (IS_ERR(orphan)) {
2055                 CERROR("%s: cannot open orphan file "DFID": rc = %ld\n",
2056                        mdt_obd_name(info->mti_mdt), PFID(&data->cd_fid),
2057                        PTR_ERR(orphan));
2058                 GOTO(out_unlock, rc = PTR_ERR(orphan));
2059         }
2060
2061         /* Set up HSM attribute for orphan object */
2062         CLASSERT(sizeof(struct hsm_attrs) <= sizeof(info->mti_xattr_buf));
2063         buf = &info->mti_buf;
2064         buf->lb_buf = info->mti_xattr_buf;
2065         buf->lb_len = sizeof(struct hsm_attrs);
2066         ma->ma_hsm.mh_flags |= HS_RELEASED;
2067         lustre_hsm2buf(buf->lb_buf, &ma->ma_hsm);
2068         ma->ma_hsm.mh_flags &= ~HS_RELEASED;
2069         rc = mo_xattr_set(info->mti_env, mdt_object_child(orphan), buf,
2070                           XATTR_NAME_HSM, 0);
2071         if (rc < 0)
2072                 GOTO(out_close, rc);
2073
2074         mdt_lock_reg_init(lh, LCK_EX);
2075         rc = mdt_object_lock(info, o, lh, MDS_INODELOCK_LAYOUT, MDT_LOCAL_LOCK);
2076         if (rc == 0) {
2077                 /* Swap layout with orphan object */
2078                 rc = mo_swap_layouts(info->mti_env, mdt_object_child(o),
2079                                      mdt_object_child(orphan),
2080                                      SWAP_LAYOUTS_MDS_HSM);
2081
2082                 /* Release exclusive LL */
2083                 mdt_object_unlock(info, o, lh, 1);
2084         }
2085         EXIT;
2086
2087 out_close:
2088         /* Close orphan object anyway */
2089         rc2 = mo_close(info->mti_env, mdt_object_child(orphan), orp_ma,
2090                        FMODE_WRITE);
2091         if (rc2 < 0)
2092                 CERROR("%s: error closing volatile file "DFID": rc = %d\n",
2093                        mdt_obd_name(info->mti_mdt), PFID(&data->cd_fid), rc2);
2094         LU_OBJECT_DEBUG(D_HSM, info->mti_env, &orphan->mot_obj,
2095                         "object closed\n");
2096         mdt_object_put(info->mti_env, orphan);
2097
2098 out_unlock:
2099         up_write(&o->mot_open_sem);
2100
2101         if (rc == 0) { /* already released */
2102                 struct mdt_body *repbody;
2103                 repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
2104                 LASSERT(repbody != NULL);
2105                 repbody->valid |= OBD_MD_FLRELEASED;
2106         }
2107
2108         ma->ma_valid = 0;
2109         ma->ma_need = 0;
2110         return rc;
2111 }
2112
/* Non-zero when @mode, with the EPOCH, SOM and TRUNC bits masked off,
 * equals MDS_FMODE_CLOSED. */
#define MFD_CLOSED(mode)                                                      \
        (((mode) &                                                            \
          ~(MDS_FMODE_EPOCH | MDS_FMODE_SOM | MDS_FMODE_TRUNC)) ==           \
         MDS_FMODE_CLOSED)
2115
2116 static int mdt_mfd_closed(struct mdt_file_data *mfd)
2117 {
2118         return ((mfd == NULL) || MFD_CLOSED(mfd->mfd_mode));
2119 }
2120
2121 int mdt_mfd_close(struct mdt_thread_info *info, struct mdt_file_data *mfd)
2122 {
2123         struct mdt_object *o = mfd->mfd_object;
2124         struct md_object *next = mdt_object_child(o);
2125         struct md_attr *ma = &info->mti_attr;
2126         int ret = MDT_IOEPOCH_CLOSED;
2127         int rc = 0;
2128         __u64 mode;
2129         ENTRY;
2130
2131         mode = mfd->mfd_mode;
2132
2133         if (ma->ma_attr_flags & MDS_HSM_RELEASE) {
2134                 rc = mdt_hsm_release(info, o, ma);
2135                 if (rc < 0) {
2136                         CDEBUG(D_HSM, "%s: File " DFID " release failed: %d\n",
2137                                 mdt_obd_name(info->mti_mdt),
2138                                 PFID(mdt_object_fid(o)), rc);
2139                         /* continue to close even error occurred. */
2140                 }
2141         }
2142
2143         if ((mode & FMODE_WRITE) || (mode & MDS_FMODE_TRUNC)) {
2144                 mdt_write_put(o);
2145                 ret = mdt_ioepoch_close(info, o);
2146         } else if (mode & MDS_FMODE_EXEC) {
2147                 mdt_write_allow(o);
2148         } else if (mode & MDS_FMODE_EPOCH) {
2149                 ret = mdt_ioepoch_close(info, o);
2150         } else if (mode & MDS_FMODE_SOM) {
2151                 ret = mdt_som_au_close(info, o);
2152         }
2153
2154         /* Update atime on close only. */
2155         if ((mode & MDS_FMODE_EXEC || mode & FMODE_READ || mode & FMODE_WRITE)
2156             && (ma->ma_valid & MA_INODE) && (ma->ma_attr.la_valid & LA_ATIME)) {
2157                 /* Set the atime only. */
2158                 ma->ma_valid = MA_INODE;
2159                 ma->ma_attr.la_valid = LA_ATIME;
2160                 rc = mo_attr_set(info->mti_env, next, ma);
2161         }
2162
2163         /* If file data is modified, add the dirty flag. */
2164         if (ma->ma_attr_flags & MDS_DATA_MODIFIED)
2165                 rc = mdt_add_dirty_flag(info, o, ma);
2166
2167         ma->ma_need |= MA_INODE;
2168         ma->ma_valid &= ~MA_INODE;
2169
2170         if (!MFD_CLOSED(mode))
2171                 rc = mo_close(info->mti_env, next, ma, mode);
2172
2173         if (ret == MDT_IOEPOCH_GETATTR || ret == MDT_IOEPOCH_OPENED) {
2174                 struct mdt_export_data *med;
2175
2176                 /* The IOepoch is still opened or SOM update is needed.
2177                  * Put mfd back into the list. */
2178                 LASSERT(mdt_conn_flags(info) & OBD_CONNECT_SOM);
2179                 mdt_mfd_set_mode(mfd, ret == MDT_IOEPOCH_OPENED ?
2180                                       MDS_FMODE_EPOCH : MDS_FMODE_SOM);
2181
2182                 LASSERT(mdt_info_req(info));
2183                 med = &mdt_info_req(info)->rq_export->exp_mdt_data;
2184                 spin_lock(&med->med_open_lock);
2185                 cfs_list_add(&mfd->mfd_list, &med->med_open_head);
2186                 class_handle_hash_back(&mfd->mfd_handle);
2187                 spin_unlock(&med->med_open_lock);
2188
2189                 if (ret == MDT_IOEPOCH_OPENED) {
2190                         ret = 0;
2191                 } else {
2192                         ret = -EAGAIN;
2193                         CDEBUG(D_INODE, "Size-on-MDS attribute update is "
2194                                "needed on "DFID"\n", PFID(mdt_object_fid(o)));
2195                 }
2196         } else {
2197                 /* adjust open and lease count */
2198                 if (mode & MDS_OPEN_LEASE) {
2199                         LASSERT(atomic_read(&o->mot_lease_count) > 0);
2200                         atomic_dec(&o->mot_lease_count);
2201                 }
2202                 LASSERT(atomic_read(&o->mot_open_count) > 0);
2203                 atomic_dec(&o->mot_open_count);
2204
2205                 mdt_mfd_free(mfd);
2206                 mdt_object_put(info->mti_env, o);
2207         }
2208
2209         RETURN(rc ? rc : ret);
2210 }
2211
2212 int mdt_close(struct mdt_thread_info *info)
2213 {
2214         struct mdt_export_data *med;
2215         struct mdt_file_data   *mfd;
2216         struct mdt_object      *o;
2217         struct md_attr         *ma = &info->mti_attr;
2218         struct mdt_body        *repbody = NULL;
2219         struct ptlrpc_request  *req = mdt_info_req(info);
2220         int rc, ret = 0;
2221         ENTRY;
2222
2223         mdt_counter_incr(req, LPROC_MDT_CLOSE);
2224         /* Close may come with the Size-on-MDS update. Unpack it. */
2225         rc = mdt_close_unpack(info);
2226         if (rc)
2227                 RETURN(err_serious(rc));
2228
2229         LASSERT(info->mti_ioepoch);
2230
2231         req_capsule_set_size(info->mti_pill, &RMF_MDT_MD, RCL_SERVER,
2232                              info->mti_mdt->mdt_max_mdsize);
2233         req_capsule_set_size(info->mti_pill, &RMF_LOGCOOKIES, RCL_SERVER,
2234                              info->mti_mdt->mdt_max_cookiesize);
2235         rc = req_capsule_server_pack(info->mti_pill);
2236         if (mdt_check_resent(info, mdt_reconstruct_generic, NULL)) {
2237                 mdt_client_compatibility(info);
2238                 if (rc == 0)
2239                         mdt_fix_reply(info);
2240                 mdt_exit_ucred(info);
2241                 RETURN(lustre_msg_get_status(req->rq_repmsg));
2242         }
2243
2244         /* Continue to close handle even if we can not pack reply */
2245         if (rc == 0) {
2246                 repbody = req_capsule_server_get(info->mti_pill,
2247                                                  &RMF_MDT_BODY);
2248                 ma->ma_lmm = req_capsule_server_get(info->mti_pill,
2249                                                     &RMF_MDT_MD);
2250                 ma->ma_lmm_size = req_capsule_get_size(info->mti_pill,
2251                                                        &RMF_MDT_MD,
2252                                                        RCL_SERVER);
2253                 ma->ma_cookie = req_capsule_server_get(info->mti_pill,
2254                                                        &RMF_LOGCOOKIES);
2255                 ma->ma_cookie_size = req_capsule_get_size(info->mti_pill,
2256                                                           &RMF_LOGCOOKIES,
2257                                                           RCL_SERVER);
2258                 ma->ma_need = MA_INODE | MA_LOV | MA_COOKIE;
2259                 repbody->eadatasize = 0;
2260                 repbody->aclsize = 0;
2261         } else {
2262                 rc = err_serious(rc);
2263         }
2264
2265         med = &req->rq_export->exp_mdt_data;
2266         spin_lock(&med->med_open_lock);
2267         mfd = mdt_handle2mfd(med, &info->mti_ioepoch->handle,
2268                              req_is_replay(req));
2269         if (mdt_mfd_closed(mfd)) {
2270                 spin_unlock(&med->med_open_lock);
2271                 CDEBUG(D_INODE, "no handle for file close: fid = "DFID
2272                        ": cookie = "LPX64"\n", PFID(info->mti_rr.rr_fid1),
2273                        info->mti_ioepoch->handle.cookie);
2274                 /** not serious error since bug 3633 */
2275                 rc = -ESTALE;
2276         } else {
2277                 class_handle_unhash(&mfd->mfd_handle);
2278                 cfs_list_del_init(&mfd->mfd_list);
2279                 spin_unlock(&med->med_open_lock);
2280
2281                 /* Do not lose object before last unlink. */
2282                 o = mfd->mfd_object;
2283                 mdt_object_get(info->mti_env, o);
2284                 ret = mdt_mfd_close(info, mfd);
2285                 if (repbody != NULL)
2286                         rc = mdt_handle_last_unlink(info, o, ma);
2287                 mdt_empty_transno(info, rc);
2288                 mdt_object_put(info->mti_env, o);
2289         }
2290         if (repbody != NULL) {
2291                 mdt_client_compatibility(info);
2292                 rc = mdt_fix_reply(info);
2293         }
2294
2295         mdt_exit_ucred(info);
2296         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_CLOSE_PACK))
2297                 RETURN(err_serious(-ENOMEM));
2298
2299         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_MDS_CLOSE_NET_REP,
2300                                  OBD_FAIL_MDS_CLOSE_NET_REP))
2301                 info->mti_fail_id = OBD_FAIL_MDS_CLOSE_NET_REP;
2302         RETURN(rc ? rc : ret);
2303 }
2304
2305 /**
2306  * DONE_WRITING rpc handler.
2307  *
2308  * As mfd is not kept after replayed CLOSE (see mdt_ioepoch_close_on_replay()),
2309  * only those DONE_WRITING rpc will be replayed which really wrote smth on disk,
2310  * and got a trasid. Waiting for such DONE_WRITING is not reliable, so just
2311  * skip attributes and reconstruct the reply here.
2312  */
2313 int mdt_done_writing(struct mdt_thread_info *info)
2314 {
2315         struct ptlrpc_request   *req = mdt_info_req(info);
2316         struct mdt_body         *repbody = NULL;
2317         struct mdt_export_data  *med;
2318         struct mdt_file_data    *mfd;
2319         int rc;
2320         ENTRY;
2321
2322         rc = req_capsule_server_pack(info->mti_pill);
2323         if (rc)
2324                 RETURN(err_serious(rc));
2325
2326         repbody = req_capsule_server_get(info->mti_pill,
2327                                          &RMF_MDT_BODY);
2328         repbody->eadatasize = 0;
2329         repbody->aclsize = 0;
2330
2331         /* Done Writing may come with the Size-on-MDS update. Unpack it. */
2332         rc = mdt_close_unpack(info);
2333         if (rc)
2334                 RETURN(err_serious(rc));
2335
2336         if (mdt_check_resent(info, mdt_reconstruct_generic, NULL)) {
2337                 mdt_exit_ucred(info);
2338                 RETURN(lustre_msg_get_status(req->rq_repmsg));
2339         }
2340
2341         med = &info->mti_exp->exp_mdt_data;
2342         spin_lock(&med->med_open_lock);
2343         mfd = mdt_handle2mfd(med, &info->mti_ioepoch->handle,
2344                              req_is_replay(req));
2345         if (mfd == NULL) {
2346                 spin_unlock(&med->med_open_lock);
2347                 CDEBUG(D_INODE, "no handle for done write: fid = "DFID
2348                        ": cookie = "LPX64" ioepoch = "LPU64"\n",
2349                        PFID(info->mti_rr.rr_fid1),
2350                        info->mti_ioepoch->handle.cookie,
2351                        info->mti_ioepoch->ioepoch);
2352                 /* If this is a replay, reconstruct the transno. */
2353                 if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
2354                         rc = info->mti_ioepoch->flags & MF_SOM_AU ?
2355                              -EAGAIN : 0;
2356                         mdt_empty_transno(info, rc);
2357                 } else
2358                         rc = -ESTALE;
2359                 GOTO(error_ucred, rc);
2360         }
2361
2362         LASSERT(mfd->mfd_mode == MDS_FMODE_EPOCH ||
2363                 mfd->mfd_mode == MDS_FMODE_TRUNC);
2364         class_handle_unhash(&mfd->mfd_handle);
2365         cfs_list_del_init(&mfd->mfd_list);
2366         spin_unlock(&med->med_open_lock);
2367
2368         /* Set EPOCH CLOSE flag if not set by client. */
2369         info->mti_ioepoch->flags |= MF_EPOCH_CLOSE;
2370         info->mti_attr.ma_valid = 0;
2371
2372         info->mti_attr.ma_lmm_size = info->mti_mdt->mdt_max_mdsize;
2373         OBD_ALLOC_LARGE(info->mti_attr.ma_lmm, info->mti_mdt->mdt_max_mdsize);
2374         if (info->mti_attr.ma_lmm == NULL)
2375                 GOTO(error_ucred, rc = -ENOMEM);
2376
2377         rc = mdt_mfd_close(info, mfd);
2378
2379         OBD_FREE_LARGE(info->mti_attr.ma_lmm, info->mti_mdt->mdt_max_mdsize);
2380         mdt_empty_transno(info, rc);
2381 error_ucred:
2382         mdt_exit_ucred(info);
2383         RETURN(rc);
2384 }