Bugzilla : 10248
Description: Allow fractional MB tunings for lustre in /proc/ filesystem.
Details : Many of the /proc/ tunables can only be tuned at a megabyte
- granularity. Now, Fractional MB granularity is be supported,
- this is very useful for low memory system.
+             granularity. Fractional MB granularity is now supported,
+             which is very useful for low-memory systems.
Severity : enhancement
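A minimal sketch of what accepting a fractional megabyte value involves, assuming input such as "1.5" with at most one fractional digit; the helper name and limits below are illustrative, not the helper this patch adds:

#include <linux/errno.h>

/* Illustrative helper (not from this patch): parse "N" or "N.D" megabytes
 * into bytes, accepting one fractional digit, e.g. "1.5" -> 1572864. */
static int frac_mb_to_bytes(const char *str, unsigned long *bytes)
{
        unsigned long whole = 0, tenths = 0;
        const char *p = str;

        while (*p >= '0' && *p <= '9')
                whole = whole * 10 + (*p++ - '0');

        if (*p == '.') {
                p++;
                if (*p < '0' || *p > '9')
                        return -EINVAL;
                tenths = *p++ - '0';
        }

        if (*p != '\0' && *p != '\n')
                return -EINVAL;

        *bytes = (whole << 20) + (tenths << 20) / 10;
        return 0;
}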
Bugzilla : 9292
Bugzilla : 10409
Description: i_sem vs transaction deadlock in mds_obd_destroy during unlink.
Details : protect inode from truncation within vfs_unlink() context
- just take a reference before calling vfs_unlink() and release it
- when parent's i_sem is free.
+             just take a reference before calling vfs_unlink() and release it
+             once the parent's i_sem has been released.
+
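A condensed sketch of the pattern described in the entry above, assuming 2.6-era VFS calls (vfs_unlink(), igrab(), i_sem as a semaphore); the function name is hypothetical and error handling is elided:

#include <linux/fs.h>

/* Sketch only: pin the victim inode before vfs_unlink() so the final
 * iput() (and any truncation it triggers) runs after the parent's i_sem
 * is no longer held. */
static int unlink_pinned(struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = igrab(dentry->d_inode);
        int rc;

        down(&dir->i_sem);
        rc = vfs_unlink(dir, dentry);
        up(&dir->i_sem);

        if (inode != NULL)
                iput(inode);    /* last reference dropped with i_sem free */
        return rc;
}
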
+Severity : major
+Frequency : rare
+Bugzilla : 4778
+Description: last_id value checked outside lock on OST caused LASSERT failure
+Details : If there were multiple MDS->OST object precreate requests in
+ flight, it was possible that the OST's last object id was checked
+ outside a lock and incorrectly tripped an assertion. Move checks
+ inside locks, and discard old precreate requests.
+
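A minimal sketch of the locking change described above, with hypothetical structure, field, and lock names; the real OST code uses its own data structures:

#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/* Illustrative only: check and advance the last allocated object id while
 * holding the lock that protects it, and reject stale precreate replies
 * instead of asserting. */
struct obj_counter {
        spinlock_t oc_lock;
        __u64      oc_last_id;
};

static int oc_update_last_id(struct obj_counter *oc, __u64 new_id)
{
        int rc = 0;

        spin_lock(&oc->oc_lock);
        if (new_id < oc->oc_last_id)
                rc = -EALREADY;         /* old precreate request: discard */
        else
                oc->oc_last_id = new_id;
        spin_unlock(&oc->oc_lock);

        return rc;
}
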
+Severity : minor
+Frequency : rare
+Bugzilla : 9387
+Description: import connection selection may be incorrect if timer wraps
+Details : Using a 32-bit jiffies timer with HZ=1000 may cause backup
+ import connections to be ignored if the 32-bit jiffies counter
+ wraps. Use a 64-bit jiffies counter.
+
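For context on the entry above: with HZ=1000 a 32-bit jiffies counter wraps after 2^32 / 1000 s, roughly 49.7 days, after which a stored 32-bit timestamp near the wrap point can keep "now > last + interval" false for a long stretch. A minimal sketch of the 64-bit comparison, with an illustrative interval parameter:

#include <linux/jiffies.h>
#include <linux/types.h>

/* Sketch: compare against the 64-bit jiffies counter so the test keeps
 * working across a 32-bit wrap.  "interval_sec" is illustrative; the code
 * changed below uses RECONNECT_INTERVAL. */
static int attempt_is_stale(__u64 last_attempt_j64, unsigned int interval_sec)
{
        return get_jiffies_64() > last_attempt_j64 +
                                  (__u64)interval_sec * HZ;
}
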
+Severity : minor
+Frequency : very large clusters immediately after boot
+Bugzilla : 10083
+Description: LNET request buffers exhausted under heavy short-term load
+Details : If a large number of client requests are generated on a service
+             that has never previously seen so many requests, it is possible
+ that the request buffer growth cannot keep up with the spike in
+ demand. Instead of dropping incoming requests, they are held in
+ the LND until the RPC service can accept more requests.
+
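A condensed restatement of the low-water decision the hunks near the end of this patch modify; the field names mirror the patch, while the function framing and the refill step itself are simplified away:

/* Illustrative only: with the request portal marked lazy, running out of
 * posted buffers is no longer fatal, so the pool can be kept small under
 * test_req_buffer_pressure and refilled at the low-water mark. */
static int rqbd_pool_needs_refill(int nrqbd_receiving, int nbuf_per_group,
                                  int test_pressure)
{
        int low_water = test_pressure ? 0 : nbuf_per_group / 2;

        return nrqbd_receiving <= low_water;
}
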
+Severity : minor
+Frequency : Sometimes during replay
+Bugzilla : 9314
+Description: Assertion failure in ll_local_open after replay.
+Details : If replay happened on an open request reply before we were able
+             to set the replay handler, the reply would not be swabbed,
+             tripping the assertion in ll_local_open. Now we set the handler
+             right after recognizing an open request.
Severity : minor
Frequency : Sometimes during replay
RETURN (-EPROTO);
}
- /* If this is an successful OPEN request, we need to set
+ /* If this is a successful OPEN request, we need to set
replay handler and data early, so that if replay happens
- immediatelly after swabbing below, new reply is swabbed
+ immediately after swabbing below, new reply is swabbed
by that handler correctly */
if (it_disposition(it, DISP_OPEN_OPEN) &&
!it_open_error(DISP_OPEN_OPEN, it))
if (ev->unlinked) {
service->srv_nrqbd_receiving--;
- if (ev->type != LNET_EVENT_UNLINK &&
- service->srv_nrqbd_receiving == 0) {
- /* This service is off-air because all its request
- * buffers are busy. Portals will start dropping
- * incoming requests until more buffers get posted.
- * NB don't moan if it's because we're tearing down the
- * service. */
- CERROR("All %s request buffers busy\n",
+ CDEBUG(D_RPCTRACE,"Buffer complete: %d buffers still posted\n",
+ service->srv_nrqbd_receiving);
+
+ /* Normally, don't complain about 0 buffers posted; LNET won't
+ * drop incoming reqs since we set the portal lazy */
+ if (test_req_buffer_pressure &&
+ ev->type != LNET_EVENT_UNLINK &&
+ service->srv_nrqbd_receiving == 0)
+ CWARN("All %s request buffers busy\n",
service->srv_name);
- }
+
/* req takes over the network's ref on rqbd */
} else {
/* req takes a ref on rqbd */
imp->imp_obd->obd_name,
libcfs_nid2str(conn->oic_conn->c_peer.nid),
conn->oic_last_attempt);
-
- /* Throttle the reconnect rate to once per RECONNECT_INTERVAL */
if (get_jiffies_64() >
conn->oic_last_attempt + RECONNECT_INTERVAL * HZ) {
-
/* If we have never tried this connection since the
last successful attempt, go with this one */
if (conn->oic_last_attempt <=
}
/* Both of these connections have already been tried
- since the last successful connection, just choose the
+ since the last successful connection; just choose the
least recently used */
if (!imp_conn)
imp_conn = conn;
if (conn->oic_last_attempt <
imp_conn->oic_last_attempt)
imp_conn = conn;
- }
+ }
}
/* if not found, simply choose the current one */
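Since the hunk above is fragmentary, here is a condensed sketch of the selection policy it implements: skip candidates attempted within the last RECONNECT_INTERVAL, prefer one not tried since the last successful connect, otherwise take the least recently attempted, and fall back to the current connection if nothing qualifies. The oic_last_attempt field and RECONNECT_INTERVAL follow the patch; the candidate array, last_success parameter, interval value, and function name are illustrative:

#include <linux/jiffies.h>
#include <linux/types.h>

#define RECONNECT_INTERVAL 10   /* seconds; illustrative stand-in */

struct conn_cand {
        __u64 oic_last_attempt;         /* 64-bit jiffies of last attempt */
};

static struct conn_cand *pick_conn(struct conn_cand *cands, int ncands,
                                   struct conn_cand *current_conn,
                                   __u64 last_success)
{
        struct conn_cand *lru = NULL;
        int i;

        for (i = 0; i < ncands; i++) {
                struct conn_cand *c = &cands[i];

                /* throttle: skip candidates attempted too recently */
                if (get_jiffies_64() <=
                    c->oic_last_attempt + RECONNECT_INTERVAL * HZ)
                        continue;

                /* never tried since the last successful attempt: use it */
                if (c->oic_last_attempt <= last_success)
                        return c;

                /* otherwise remember the least recently attempted one */
                if (lru == NULL ||
                    c->oic_last_attempt < lru->oic_last_attempt)
                        lru = c;
        }

        /* if not found, simply choose the current one */
        return lru != NULL ? lru : current_conn;
}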
struct obd_import;
struct ldlm_res_id;
struct ptlrpc_request_set;
+extern int test_req_buffer_pressure;
void ptlrpc_request_handle_notconn(struct ptlrpc_request *);
void lustre_assert_wire_constants(void);
#include <lnet/types.h>
#include "ptlrpc_internal.h"
+int test_req_buffer_pressure = 0;
+CFS_MODULE_PARM(test_req_buffer_pressure, "i", int, 0444,
+ "set non-zero to put pressure on request buffer pools");
+
/* forward ref */
static int ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc);
list_del(&rqbd->rqbd_list);
list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds);
- if (svc->srv_nrqbd_receiving == 0) {
- /* This service is off-air on this interface because all
- * its request buffers are busy. Portals will have started
- * dropping incoming requests until more buffers get
- * posted */
- CERROR("All %s request buffers busy\n", svc->srv_name);
- }
+ /* Don't complain if no request buffers are posted right now; LNET
+ * won't drop requests because we set the portal lazy! */
spin_unlock(&svc->srv_lock);
CFS_INIT_LIST_HEAD(&service->srv_threads);
cfs_waitq_init(&service->srv_waitq);
- service->srv_nbuf_per_group = nbufs;
+ service->srv_nbuf_per_group = test_req_buffer_pressure ? 1 : nbufs;
service->srv_max_req_size = max_req_size;
service->srv_buf_size = bufsize;
service->srv_rep_portal = rep_portal;
service->srv_request_max_cull_seq = 0;
service->srv_num_threads = num_threads;
+ rc = LNetSetLazyPortal(service->srv_req_portal);
+ LASSERT (rc == 0);
+
CFS_INIT_LIST_HEAD(&service->srv_request_queue);
CFS_INIT_LIST_HEAD(&service->srv_idle_rqbds);
CFS_INIT_LIST_HEAD(&service->srv_active_rqbds);
ptlrpc_check_rqbd_pool(struct ptlrpc_service *svc)
{
int avail = svc->srv_nrqbd_receiving;
- int low_water = svc->srv_nbuf_per_group/2;
+ int low_water = test_req_buffer_pressure ? 0 :
+ svc->srv_nbuf_per_group/2;
/* NB I'm not locking; just looking. */
* for a timeout (unless something else happens)
* before I try again */
svc->srv_rqbd_timeout = cfs_time_seconds(1)/10;
+ CDEBUG(D_RPCTRACE,"Posted buffers: %d\n",
+ svc->srv_nrqbd_receiving);
}
}
CDEBUG(D_NET, "%s: tearing down\n", service->srv_name);
+ rc = LNetClearLazyPortal(service->srv_req_portal);
+ LASSERT (rc == 0);
+
/* Unlink all the request buffers. This forces a 'final' event with
* its 'unlink' flag set for each posted rqbd */
list_for_each(tmp, &service->srv_active_rqbds) {