- change init script to fail more clearly if not run as root (1528)
- fix ns_lock/i_sem lock ordering deadlock for kms update (3477)
- don't do DNS lookups on NIDs too small for IP addresses (3442)
+ - dynamic ptlrpc request buffer allocation (2102)
+ - don't allow unlinking open directory if it isn't empty (2904)
+ - set MDS/OST threads to umask 0 to not clobber client modes (3359)
* miscellania
- servers can dump a log evicting a client - lustre.dump_on_timeout=1
AC_INIT
AC_CANONICAL_SYSTEM
-AM_INIT_AUTOMAKE(lustre, 1.2.2.2)
+AM_INIT_AUTOMAKE(lustre, 1.2.2.3)
# AM_MAINTAINER_MODE
# Four main targets: lustre kernel modules, utilities, tests, and liblustre
#define ur_cap ur_uc.ouc_cap
#define ur_suppgid1 ur_uc.ouc_suppgid1
#define ur_suppgid2 ur_uc.ouc_suppgid2
+#define ur_umask ur_uc.ouc_umask
/* i_attr_flags holds the open count in the inode in 2.4 */
//XXX Alex implement on 2.4 with i_attr_flags and find soln for 2.5 please
* buffers */
#define SVC_BUF_VMALLOC_THRESHOLD (2*PAGE_SIZE)
-/* The following constants determine how much memory is devoted to
- * buffering in the lustre services.
+/* The following constants determine how memory is used to buffer incoming
+ * service requests.
*
- * ?_NEVENTS # event queue entries
- *
- * ?_NBUFS # request buffers
+ * ?_NBUFS # buffers to allocate when growing the pool
* ?_BUFSIZE # bytes in a single request buffer
- * total memory = ?_NBUFS * ?_BUFSIZE
- *
* ?_MAXREQSIZE # maximum request service will receive
- * messages larger than ?_MAXREQSIZE are dropped.
- * request buffers are auto-unlinked when less than ?_MAXREQSIZE
- * is left in them.
+ *
+ * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk
+ * of ?_NBUFS is added to the pool.
+ *
+ * Messages larger than ?_MAXREQSIZE are dropped. Request buffers are
+ * considered full when less than ?_MAXREQSIZE is left in them.
*/
#define LDLM_NUM_THREADS min(smp_num_cpus * smp_num_cpus * 8, 64)
-#define LDLM_NBUF_MAX 512UL
+#define LDLM_NBUFS 64
#define LDLM_BUFSIZE (8 * 1024)
#define LDLM_MAXREQSIZE (5 * 1024)
-#define LDLM_MAXMEM (num_physpages*(PAGE_SIZE/1024))
-#define LDLM_NBUFS min(LDLM_MAXMEM/LDLM_BUFSIZE, LDLM_NBUF_MAX)
#define MDT_MAX_THREADS 32UL
#define MDT_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
MDT_MAX_THREADS), 2UL)
-#define MDS_NBUF_MAX 4096UL
+#define MDS_NBUFS 64
#define MDS_BUFSIZE (8 * 1024)
/* Assume file name length = FNAME_MAX = 256 (true for extN).
* path name length = PATH_MAX = 4096
* except in the open case where there are a large number of OSTs in a LOV.
*/
#define MDS_MAXREQSIZE (5 * 1024)
-#define MDS_MAXMEM (num_physpages*(PAGE_SIZE/128))
-#define MDS_NBUFS min(MDS_MAXMEM/MDS_BUFSIZE, MDS_NBUF_MAX)
#define OST_MAX_THREADS 36UL
#define OST_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
OST_MAX_THREADS), 2UL)
-#define OST_NBUF_MAX 5000UL
+#define OST_NBUFS 64
#define OST_BUFSIZE (8 * 1024)
/* OST_MAXREQSIZE ~= 1640 bytes =
* lustre_msg + obdo + 16 * obd_ioobj + 64 * niobuf_remote
* - OST_MAXREQSIZE must be at least 1 page of cookies plus some spillover
*/
#define OST_MAXREQSIZE (5 * 1024)
-#define OST_MAXMEM (num_physpages*(PAGE_SIZE/128))
-#define OST_NBUFS min(OST_MAXMEM/OST_BUFSIZE, OST_NBUF_MAX)
#define PTLBD_NUM_THREADS 4
-#define PTLBD_NBUFS 20
+#define PTLBD_NBUFS 64
#define PTLBD_BUFSIZE (32 * 1024)
#define PTLBD_MAXREQSIZE 1024
struct list_head srv_list; /* chain thru all services */
int srv_max_req_size; /* biggest request to receive */
int srv_buf_size; /* size of individual buffers */
+ int srv_nbuf_per_group; /* # buffers to allocate in 1 group */
int srv_nbufs; /* total # req buffer descs allocated */
int srv_nthreads; /* # running threads */
int srv_n_difficult_replies; /* # 'difficult' replies */
__u32 ouc_cap;
__u32 ouc_suppgid1;
__u32 ouc_suppgid2;
+ __u32 ouc_umask;
};
struct lvfs_callback_ops {
}
-/* Merge rss if kms == 0
+/* Merge rss if @kms_only == 0
*
* Even when merging RSS, we will take the KMS value if it's larger.
* This prevents getattr from stomping on dirty cached pages which
i++, loi++) {
obd_size lov_size, tmpsize;
- CDEBUG(D_DLMTRACE, "stripe %u ost %u kms "LPU64" rss "LPU64"\n",
- i, loi->loi_ost_idx, loi->loi_kms, loi->loi_rss);
tmpsize = loi->loi_kms;
if (kms_only == 0 && loi->loi_rss > tmpsize)
tmpsize = loi->loi_rss;
save->ouc.ouc_cap = current->cap_effective;
save->ouc.ouc_suppgid1 = current_groups[0];
save->ouc.ouc_suppgid2 = current_groups[1];
+ save->ouc.ouc_umask = current->fs->umask;
current->fsuid = uc->ouc_fsuid;
current->fsgid = uc->ouc_fsgid;
current->cap_effective = uc->ouc_cap;
current_ngroups = 0;
+ current->fs->umask = 0; /* umask already applied on client */
if (uc->ouc_suppgid1 != -1)
current_groups[current_ngroups++] = uc->ouc_suppgid1;
current_ngroups = saved->ngroups;
current_groups[0] = saved->ouc.ouc_suppgid1;
current_groups[1] = saved->ouc.ouc_suppgid2;
+ current->fs->umask = saved->ouc.ouc_umask;
}
/*
if (rc)
CERROR("error on parent setattr: rc = %d\n", rc);
- rc = mds_finish_transno(mds, dchild->d_inode, handle, req, 0,
- rep ? rep->lock_policy_res1 : 0);
+ rc = fsfilt_commit(obd, dchild->d_inode, handle, 0);
handle = NULL;
acc_mode = 0; /* Don't check for permissions */
}
cleanup_phase = 1; /* dchild, dparent, locks */
+ dget(dchild);
child_inode = dchild->d_inode;
if (child_inode == NULL) {
CDEBUG(D_INODE, "child doesn't exist (dir %lu, name %s)\n",
GOTO(cleanup, rc = -EISDIR);
}
- if (child_inode->i_nlink == (S_ISDIR(child_inode->i_mode) ? 2 : 1) &&
- mds_open_orphan_count(child_inode) > 0) {
- rc = mds_open_unlink_rename(rec, obd, dparent, dchild, &handle);
- cleanup_phase = 4; /* transaction */
- GOTO(cleanup, rc);
- }
-
/* Step 4: Do the unlink: we already verified ur_mode above (bug 72) */
switch (child_inode->i_mode & S_IFMT) {
case S_IFDIR:
switch(cleanup_phase) {
case 4:
+ LASSERT(dchild != NULL && dchild->d_inode != NULL);
+ LASSERT(atomic_read(&dchild->d_inode->i_count) > 0);
+ if (rc == 0 && dchild->d_inode->i_nlink == 0 &&
+ mds_open_orphan_count(dchild->d_inode) > 0) {
+ /* the filesystem is really going to destroy the inode;
+ * we have to delay this while the inode is still open -bzzz */
+ mds_open_unlink_rename(rec, obd, dparent, dchild, NULL);
+ }
rc = mds_finish_transno(mds, dparent->d_inode, handle, req,
rc, 0);
if (!rc)
else
ptlrpc_save_lock(req, &parent_lockh, LCK_PW);
l_dput(dchild);
+ l_dput(dchild);
l_dput(dparent);
case 0:
break;
struct dentry *pending_child;
char fidname[LL_FID_NAMELEN];
int fidlen = 0, rc;
+ unsigned mode;
ENTRY;
LASSERT(!mds_inode_is_orphan(dchild->d_inode));
GOTO(out_dput, rc = 0);
}
- *handle = fsfilt_start(obd, pending_dir, FSFILT_OP_RENAME, NULL);
- if (IS_ERR(*handle))
- GOTO(out_dput, rc = PTR_ERR(*handle));
-
- lock_kernel();
- rc = vfs_rename(dparent->d_inode, dchild, pending_dir, pending_child);
- unlock_kernel();
+ /* link() is semantically wrong for S_IFDIR, so we set S_IFREG
+ * for linking and restore the real mode afterwards -bzzz */
+ mode = dchild->d_inode->i_mode;
+ dchild->d_inode->i_mode = S_IFREG;
+ rc = vfs_link(dchild, pending_dir, pending_child);
if (rc)
- CERROR("error renaming orphan %lu/%s to PENDING: rc = %d\n",
- dparent->d_inode->i_ino, rec->ur_name, rc);
+ CERROR("error linking orphan %s to PENDING: rc = %d\n",
+ rec->ur_name, rc);
else
mds_inode_set_orphan(dchild->d_inode);
+
+ /* return mode and correct i_nlink if inode is directory */
+ LASSERT(dchild->d_inode->i_nlink == 1);
+ dchild->d_inode->i_mode = mode;
+ if ((mode & S_IFMT) == S_IFDIR) {
+ dchild->d_inode->i_nlink++;
+ pending_dir->i_nlink++;
+ }
+ mark_inode_dirty(dchild->d_inode);
+
out_dput:
dput(pending_child);
out_lock:
struct proc_dir_entry *svc_procroot;
struct lprocfs_stats *svc_stats;
int i, rc;
- unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX |
+ unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX |
LPROCFS_CNTR_STDDEV;
LASSERT(*procroot_ret == NULL);
svc_counter_config, "req_qdepth", "reqs");
lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR,
svc_counter_config, "req_active", "reqs");
+ lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR,
+ svc_counter_config, "reqbuf_avail", "bufs");
+ lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_TOTAL_CNTR,
+ svc_counter_config, "reqbuf_total", "bufs");
for (i = 0; i < LUSTRE_MAX_OPCODES; i++) {
__u32 opcode = ll_rpc_opcode_table[i].opcode;
lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i,
PTLRPC_REQWAIT_CNTR = 0,
PTLRPC_REQQDEPTH_CNTR,
PTLRPC_REQACTIVE_CNTR,
+ PTLRPC_REQBUF_AVAIL_CNTR,
+ PTLRPC_REQBUF_TOTAL_CNTR,
PTLRPC_LAST_CNTR
};
#include <portals/types.h>
#include "ptlrpc_internal.h"
+/* forward ref */
+static int ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc);
+
static LIST_HEAD (ptlrpc_all_services);
static spinlock_t ptlrpc_all_services_lock = SPIN_LOCK_UNLOCKED;
OBD_FREE (rqbd, sizeof (*rqbd));
}
+int
+ptlrpc_grow_req_bufs(struct ptlrpc_srv_ni *srv_ni)
+{ /* Grow the request buffer pool of one service interface by one group.
+   *
+   * Allocates svc->srv_nbuf_per_group request buffer descriptors for
+   * srv_ni, calling ptlrpc_server_post_idle_rqbds() on the owning
+   * service after each allocation.
+   *
+   * Returns 0 when the whole group was allocated, -ENOMEM when
+   * ptlrpc_alloc_rqbd() returns NULL, and -EAGAIN when
+   * ptlrpc_server_post_idle_rqbds() returns a negative value.
+   * Descriptors allocated before a failure are not released here. */
+ struct ptlrpc_service *svc = srv_ni->sni_service;
+ struct ptlrpc_request_buffer_desc *rqbd;
+ int i;
+
+ for (i = 0; i < svc->srv_nbuf_per_group; i++) {
+ rqbd = ptlrpc_alloc_rqbd(srv_ni);
+
+ if (rqbd == NULL) {
+ CERROR ("%s/%s: Can't allocate request buffer\n",
+ svc->srv_name, srv_ni->sni_ni->pni_name);
+ return (-ENOMEM);
+ }
+
+ if (ptlrpc_server_post_idle_rqbds(svc) < 0) /* a return of 0 is not treated as an error */
+ return (-EAGAIN);
+ }
+
+ return (0);
+}
+
void
ptlrpc_save_lock (struct ptlrpc_request *req,
struct lustre_handle *lock, int mode)
struct ptlrpc_request_buffer_desc *rqbd;
unsigned long flags;
int rc;
+ int posted = 0;
- spin_lock_irqsave(&svc->srv_lock, flags);
- if (list_empty (&svc->srv_idle_rqbds)) {
- spin_unlock_irqrestore(&svc->srv_lock, flags);
- return (0);
- }
+ for (;;) {
+ spin_lock_irqsave(&svc->srv_lock, flags);
- rqbd = list_entry(svc->srv_idle_rqbds.next,
- struct ptlrpc_request_buffer_desc,
- rqbd_list);
- list_del (&rqbd->rqbd_list);
+ if (list_empty (&svc->srv_idle_rqbds)) {
+ spin_unlock_irqrestore(&svc->srv_lock, flags);
+ return (posted);
+ }
- /* assume we will post successfully */
- srv_ni = rqbd->rqbd_srv_ni;
- srv_ni->sni_nrqbd_receiving++;
- list_add (&rqbd->rqbd_list, &srv_ni->sni_active_rqbds);
+ rqbd = list_entry(svc->srv_idle_rqbds.next,
+ struct ptlrpc_request_buffer_desc,
+ rqbd_list);
+ list_del (&rqbd->rqbd_list);
- spin_unlock_irqrestore(&svc->srv_lock, flags);
+ /* assume we will post successfully */
+ srv_ni = rqbd->rqbd_srv_ni;
+ srv_ni->sni_nrqbd_receiving++;
+ list_add (&rqbd->rqbd_list, &srv_ni->sni_active_rqbds);
- rc = ptlrpc_register_rqbd(rqbd);
- if (rc == 0)
- return (1);
+ spin_unlock_irqrestore(&svc->srv_lock, flags);
+
+ rc = ptlrpc_register_rqbd(rqbd);
+ if (rc != 0)
+ break;
+
+ posted = 1;
+ }
spin_lock_irqsave(&svc->srv_lock, flags);
struct proc_dir_entry *proc_entry)
{
int i;
- int j;
+ int rc;
int ssize;
struct ptlrpc_service *service;
struct ptlrpc_srv_ni *srv_ni;
- struct ptlrpc_request_buffer_desc *rqbd;
ENTRY;
LASSERT (ptlrpc_ninterfaces > 0);
INIT_LIST_HEAD(&service->srv_threads);
init_waitqueue_head(&service->srv_waitq);
+ service->srv_nbuf_per_group = nbufs;
service->srv_max_req_size = max_req_size;
service->srv_buf_size = bufsize;
service->srv_rep_portal = rep_portal;
CDEBUG (D_NET, "%s: initialising interface %s\n", name,
srv_ni->sni_ni->pni_name);
- for (j = 0; j < nbufs; j++) {
- rqbd = ptlrpc_alloc_rqbd (srv_ni);
-
- if (rqbd == NULL) {
- CERROR ("%s.%d: Can't allocate request %d "
- "on %s\n", name, i, j,
- srv_ni->sni_ni->pni_name);
- GOTO(failed, NULL);
- }
-
- /* We shouldn't be under memory pressure at
- * startup, so fail if we can't post all our
- * buffers at this time. */
- if (ptlrpc_server_post_idle_rqbds(service) <= 0)
- GOTO(failed, NULL);
- }
+ rc = ptlrpc_grow_req_bufs(srv_ni);
+ /* We shouldn't be under memory pressure at startup, so
+ * fail if we can't post all our buffers at this time. */
+ if (rc != 0)
+ GOTO(failed, NULL);
}
if (proc_entry != NULL)
reparent_to_init();
}
+static void
+ptlrpc_check_rqbd_pools(struct ptlrpc_service *svc)
+{ /* Top up the request buffer pools of svc and record buffer statistics.
+   *
+   * Walks all ptlrpc_ninterfaces interfaces of the service; any interface
+   * whose sni_nrqbd_receiving is at or below half of
+   * svc->srv_nbuf_per_group gets another group of buffers via
+   * ptlrpc_grow_req_bufs().  The sum of sni_nrqbd_receiving (sampled
+   * before any growth on that interface) and svc->srv_nbufs are then
+   * added to the REQBUF_AVAIL and REQBUF_TOTAL counters. */
+ struct ptlrpc_srv_ni *sni;
+ int i, avail = 0;
+ int low_water = svc->srv_nbuf_per_group/2; /* growth threshold: half a group */
+
+ for (i = 0; i < ptlrpc_ninterfaces; i++) {
+ sni = &svc->srv_interfaces[i];
+
+ avail += sni->sni_nrqbd_receiving;
+ /* NB I'm not locking; just looking. */
+ if (sni->sni_nrqbd_receiving <= low_water)
+ ptlrpc_grow_req_bufs(sni); /* return value is ignored; failure is not reported here */
+ }
+ lprocfs_counter_add(svc->srv_stats, PTLRPC_REQBUF_AVAIL_CNTR, avail);
+ lprocfs_counter_add(svc->srv_stats, PTLRPC_REQBUF_TOTAL_CNTR,
+ svc->srv_nbufs);
+}
+
static int
ptlrpc_retry_rqbds(void *arg)
{
(svc->srv_nthreads - 1))),
&lwi);
+ ptlrpc_check_rqbd_pools(svc);
+
if (!list_empty (&svc->srv_reply_queue))
ptlrpc_server_handle_reply (svc);
noinst_PROGRAMS += wantedi statone runas openfile getdents mkdirdeep o_directory
noinst_PROGRAMS += small_write multiop sleeptest ll_sparseness_verify
noinst_PROGRAMS += ll_sparseness_write mrename ll_dirstripe_verify
-noinst_PROGRAMS += rename_many
+noinst_PROGRAMS += openfilleddirunlink rename_many
# noinst_PROGRAMS += ldaptest
bin_PROGRAMS = mcreate munlink mkdirmany iopentest1 iopentest2
endif # TESTS
createtest_SOURCES = createtest.c
open_delay_SOURCES = open_delay.c
opendirunlink_SOURCES = opendirunlink.c
+openfilleddirunlink_SOURCES = openfilleddirunlink.c
opendevunlink_SOURCES = opendevunlink.c
fchdir_test_SOURCES = fchdir_test.c
getdents_SOURCES=getdents.c
touch $EXT2_DEV
mke2fs -j -F $EXT2_DEV 8000 > /dev/null
+umask 022
+
test_0() {
touch $DIR/f
$CHECKSTAT -t file $DIR/f || error
}
run_test 0 "touch .../f ; rm .../f ============================="
+test_0b() { # set mode 0755 on $DIR and check that it reads back
+ chmod 0755 $DIR || error
+ $CHECKSTAT -p 0755 $DIR || error # -p: compare permission bits against 0755
+}
+run_test 0b "chmod 0755 $DIR ============================="
+
test_1a() {
mkdir $DIR/d1
mkdir $DIR/d1/d2
}
run_test 31d "remove of open directory ========================="
+test_31e() { # bug 2904
+ check_kernel_version 34 || return 0 # skip (report success) when the version check fails
+ openfilleddirunlink $DIR/d31e || error # helper program built from openfilleddirunlink.c
+}
+run_test 31e "remove of open non-empty directory ==============="
+
test_32a() {
echo "== more mountpoints and symlinks ================="
[ -e $DIR/d32a ] && rm -fr $DIR/d32a