1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2005 Cluster File Systems, Inc.
5 * Author: PJ Kirner <pjkirner@clusterfs.com>
7 * This file is part of Lustre, http://www.lustre.org.
9 * Lustre is free software; you can redistribute it and/or
10 * modify it under the terms of version 2 of the GNU General Public
11 * License as published by the Free Software Foundation.
13 * Lustre is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with Lustre; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 * include important headers
27 #include <net-snmp/net-snmp-config.h>
28 #include <net-snmp/net-snmp-includes.h>
29 #include <net-snmp/agent/net-snmp-agent-includes.h>
35 #include <sys/types.h>
36 #if defined (__linux__)
44 #include "lustre-snmp-util.h"
46 /**************************************************************************
48 *************************************************************************/
/* Poll period, in seconds, returned by get_poll_interval_seconds() when
 * the environment does not supply a usable value. */
#define DEFAULT_POLL_INTERVAL_SECONDS 60
/* Name of the environment variable that get_poll_interval_seconds() reads
 * to obtain the poll period. */
#define POLL_INTERVAL_ENV_VAR "LSNMP_POLL_INTERVAL"
/* Name of the environment variable read by initilize_trap_handler(); when
 * it is set, health_entry_parser() opens the file it names instead of the
 * default health-check path. */
#define SNMP_HEALTH_CHECK_TEST_FILE "LSNMP_HEALTH_CHECK_TEST_FILE"
54 /**************************************************************************
56 *************************************************************************/
58 static oid objid_snmptrap[] =
59 { 1,3,6,1,6,3,1,1,4,1,0};
60 static oid lustre_portals_trap[] =
61 { 1,3,6,1,4,1,13140,2,1,0,1};
62 static oid lustre_portals_trap_string[]=
63 { 1,3,6,1,4,1,13140,2,1,0,2};
64 static oid lustre_unhealthy_trap[] =
65 { 1,3,6,1,4,1,13140,2,1,0,3};
66 static oid lustre_unhealthy_trap_device_name_string[]=
67 { 1,3,6,1,4,1,13140,2,1,0,4};
68 static oid lustre_unhealthy_trap_reason_string[]=
69 { 1,3,6,1,4,1,13140,2,1,0,5};
71 /**************************************************************************
73 *************************************************************************/
/*
 * One node of the list of obds currently known to be unhealthy.
 *
 * Nodes are allocated with room for the name directly behind the fixed
 * part: malloc(sizeof(entry) + strlen(name) + 1).
 */
typedef struct obd_unhealthy_entry_struct {

        /* 1 if seen as part of the is_unhealthy scan, otherwise 0 */
        int seen;

        /* single linked list pointer */
        struct obd_unhealthy_entry_struct *next;

        /* obd name - variable size, NUL terminated (flexible array member) */
        char name[];

} obd_unhealthy_entry;
88 /**************************************************************************
90 *************************************************************************/
/*
 * Forward declarations for the routines defined further down in this file.
 * Every declaration is a full prototype (note the explicit "void") so the
 * compiler can check argument lists at each call site.
 */
int get_poll_interval_seconds(void);
void health_poll_worker(unsigned int registration_number, void *clientarg);
void send_portals_catastrophe_trap(char *reason_string);
void send_obd_unhealthy_trap(char *obd_name, char *reason_string);
int is_obd_newly_unhealthy(const char *obd_name);
void obd_unhealthy_scan(void);
void health_entry_parser(void);
100 /**************************************************************************
102 *************************************************************************/
104 static int g_sent_portals_catastrophe = 0;
105 static obd_unhealthy_entry* g_obd_unhealthy_list = NULL;
106 static int g_poll_interval_seconds;
107 static unsigned int g_registration_handle;
108 static char *g_health_check_test_file = 0;
110 /*****************************************************************************
111 * Function: initilize_trap_handler
113 * Description: Initlized the trap poll haalder.
117 * Output: Global g_poll_interval_seconds is set.
119 ****************************************************************************/
121 void initilize_trap_handler(void)
123 g_poll_interval_seconds = get_poll_interval_seconds();
125 g_registration_handle = snmp_alarm_register(g_poll_interval_seconds, 0, health_poll_worker, NULL);
126 if (g_registration_handle == 0)
127 report("%s %s: line %d %s", __FILE__, __FUNCTION__, __LINE__,
128 "snmp_alarm_register failed");
130 DEBUGMSGTL(("lsnmpd","lsnmp alarm registered poll interval = %d seconds\n",g_poll_interval_seconds));
132 g_health_check_test_file = getenv(SNMP_HEALTH_CHECK_TEST_FILE);
133 if(g_health_check_test_file != 0)
134 DEBUGMSGTL(("lsnmpd","lsnmp health check test file set to \'%s\'\n",g_health_check_test_file));
137 /*****************************************************************************
138 * Function: terminate_trap_handler
140 * Description: Terminate the trap poll haalder.
144 * Output: Global g_poll_interval_seconds is set.
146 ****************************************************************************/
148 void terminate_trap_handler(void)
150 snmp_alarm_unregister(g_registration_handle);
153 /*****************************************************************************
154 * Function: get_poll_interval_seconds
156 * Description: This function used to get the poll period for timer, which
157 * is used to read throughput values periodically.
159 * Output: Alarm period, default value(if env var not set) otherwise.
160 ****************************************************************************/
162 int get_poll_interval_seconds()
165 int ret_val = DEFAULT_POLL_INTERVAL_SECONDS;
167 /* Get Alarm period for reading the Lustre client table. */
169 alarm_period = getenv(POLL_INTERVAL_ENV_VAR);
170 if (alarm_period != NULL) {
171 char *ptr = alarm_period;
172 while(isdigit(*ptr)) ptr++;
174 /* if we have only digits then conver it*/
176 int time = atoi(alarm_period);
178 ret_val = time; /* Alarm period in seconds */
184 /*****************************************************************************
185 * Function: health_poll_worker
187 * Description: This is the routine registered to system timer for updating
188 * the throughput values for all the clients and its respective osc(s).
190 * Input: 'registration_number` value obtained during the alarm registration
191 * 'clientarg' pointing to user defined data type.
193 *****************************************************************************/
195 void health_poll_worker(unsigned int registration_number, void *clientarg)
197 health_entry_parser();
199 /* Register the function again to call after lustre_alarm_period */
200 if (!snmp_alarm_register(g_poll_interval_seconds, 0, health_poll_worker, NULL)) {
201 report("%s %s:line %d %s", __FILE__, __FUNCTION__, __LINE__,
202 "snmp_alarm_register failed");
206 /*****************************************************************************
207 * Function: health_entry_parser
209 * Description: This routine is called to parse the health_check entry
213 *****************************************************************************/
215 void health_entry_parser(void)
218 char string[MAX_LINE_SIZE];
219 int b_seen_portals_catastrophe = 0;
220 const char *filename = g_health_check_test_file == 0 ?
221 LUSTRE_PATH FILENAME_SYSHEALTHCHECK :
222 g_health_check_test_file;
224 /*DEBUGMSGTL(("lsnmpd","health_entry_parser(%s)\n",filename));*/
226 /* Open the file. Use the test file env variable if
228 fptr = fopen(filename,"r");
230 /* If the path is not found do nothing */
234 while( NULL != fgets(string, sizeof(string), fptr)){
236 /*DEBUGMSGTL(("lsnmpd","health_entry_parser() looking at = \'%s\'\n",string));*/
239 * First handle the portals catastrophe
240 * Look for the string "LBUG"
242 if(0 == strncmp(string,"LBUG",4)){
244 * If we haven't sent the catastrophe message yet
245 * send it now. And keep track that we've sent it
247 if(!g_sent_portals_catastrophe){
248 send_portals_catastrophe_trap("LBUG");
249 g_sent_portals_catastrophe = 1;
251 b_seen_portals_catastrophe = 1;
255 * Now handle any of the OBD object failures
256 * look for "device <OBDNAME> reported unhealthy"
258 else if(0 == strncmp(string,"device ",7)){
259 char *obd_name = string+7;
260 char *space_after_obd_name;
263 * Now find the space after the obd name
264 * Again if there is no space we're in trouble
266 space_after_obd_name = strchr(obd_name,' ');
267 if(space_after_obd_name == 0)
271 * Null terminate the obd_name
273 *space_after_obd_name = 0;
275 DEBUGMSGTL(("lsnmpd","Looking at obd=%s\n",obd_name));
278 * If we haven't sent a trap for this one
281 if(is_obd_newly_unhealthy(obd_name))
282 send_obd_unhealthy_trap(obd_name,"unhealthy");
286 /* If we don't find it reset the catastrope flag*/
287 if(!b_seen_portals_catastrophe && g_sent_portals_catastrophe)
289 DEBUGMSGTL(("lsnmpd","LBUG has been cleared\n"));
290 g_sent_portals_catastrophe = 0;
294 * Any <OBDNAMES> that weren't queried above are now unhealthy.
295 * Scan through and cleanup the newly healthy obds
297 obd_unhealthy_scan();
302 /*****************************************************************************
303 * Function: send_portals_catastrophe_trap
305 * Description: Send the SNMP V2 trap
307 * Input: 'reason_string' the reason for the catastrope.
310 *****************************************************************************/
312 void send_portals_catastrophe_trap(char *reason_string)
315 * Setup the trap variables.
316 * It's a linked list of netsnmp_variable_list items.
318 netsnmp_variable_list var_trap[2];
320 DEBUGMSGTL(("lsnmpd","Sending portals catastrophe trap reason=%s\n",reason_string));
323 * Setup the first variable in the trap data.
324 * Have it chain to another variable.
326 var_trap[0].next_variable = &var_trap[1];
328 /*The "name" must be the standard snmp "trap" OID.*/
329 var_trap[0].name = objid_snmptrap;
330 var_trap[0].name_length = sizeof(objid_snmptrap) / sizeof(oid);
332 /*But the data contained in this variable, is an OID that is the trap OID.*/
333 var_trap[0].type = ASN_OBJECT_ID;
334 var_trap[0].val.objid = lustre_portals_trap;
335 var_trap[0].val_len = sizeof(lustre_portals_trap);
338 * Setup the second variable in the trap data.
339 * It is the last in the chain so set next to NULL
341 var_trap[1].next_variable = NULL;
343 /*The "name" is the OID of the portals trap reason strong*/
344 var_trap[1].name = lustre_portals_trap_string;
345 var_trap[1].name_length = sizeof(lustre_portals_trap_string) / sizeof(oid);
347 /*And the data is a octet string, that contains the actually reason string*/
348 var_trap[1].type = ASN_OCTET_STR;
349 var_trap[1].val.string = reason_string;
350 var_trap[1].val_len = strlen(reason_string);
352 /*And now send off the trap*/
353 send_v2trap(var_trap);
357 /*****************************************************************************
358 * Function: send_obd_unhealthy_trap
360 * Description: Send the SNMP V2 trap
362 * Input: 'obd_name' the name of the obd
363 * 'reason_string' the reason for the catastrope.
365 *****************************************************************************/
367 void send_obd_unhealthy_trap(char *obd_name,char *reason_string)
370 * Setup the trap variables.
371 * It's a linked list of netsnmp_variable_list items.
373 netsnmp_variable_list var_trap[3];
375 DEBUGMSGTL(("lsnmpd","Sending OBD unhealthy trap obd=%s reason=%s\n",obd_name,reason_string));
378 * Setup the first variable in the trap data.
379 * Have it chain to another variable.
381 var_trap[0].next_variable = &var_trap[1];
383 /*The "name" must be the standard snmp "trap" OID.*/
384 var_trap[0].name = objid_snmptrap;
385 var_trap[0].name_length = sizeof(objid_snmptrap) / sizeof(oid);
387 /*But the data contained in this variable, is an OID that is the trap OID.*/
388 var_trap[0].type = ASN_OBJECT_ID;
389 var_trap[0].val.objid = lustre_unhealthy_trap;
390 var_trap[0].val_len = sizeof(lustre_unhealthy_trap);
393 * Setup the second variable in the trap data.
394 * Have it chain to another variable.
396 var_trap[1].next_variable = &var_trap[2];;
398 /*The "name" is the OID of the portals trap reason strong*/
399 var_trap[1].name = lustre_unhealthy_trap_device_name_string;
400 var_trap[1].name_length = sizeof(lustre_unhealthy_trap_device_name_string) / sizeof(oid);
402 /*And the data is a octet string, that contains the actually reason strong*/
403 var_trap[1].type = ASN_OCTET_STR;
404 var_trap[1].val.string = obd_name;
405 var_trap[1].val_len = strlen(obd_name);
408 * Setup the third variable in the trap data.
409 * It is the last in the chain so set next to NULL
411 var_trap[2].next_variable = NULL;
413 /*The "name" is the OID of the portals trap reason strong*/
414 var_trap[2].name = lustre_unhealthy_trap_reason_string;
415 var_trap[2].name_length = sizeof(lustre_unhealthy_trap_reason_string) / sizeof(oid);
417 /*And the data is a octet string, that contains the actually reason strong*/
418 var_trap[2].type = ASN_OCTET_STR;
419 var_trap[2].val.string = reason_string;
420 var_trap[2].val_len = strlen(reason_string);
422 /*And now send off the trap*/
423 send_v2trap(var_trap);
427 /*****************************************************************************
428 * Function: is_obd_newly_unhealthy
430 * Description: Deterime if the obd is going from health->unhealth
431 * Also mark all unhealhy (new and old) as seen.
433 * Input: 'obd_name' the name of the obd
435 * Output: 1 if newly unhealthy 0 if previolsy unhealthy
436 *****************************************************************************/
438 int is_obd_newly_unhealthy(const char* obd_name)
440 /*for all elements in g_obd_unhealthy_list*/
441 obd_unhealthy_entry* walker;
442 obd_unhealthy_entry* entry;
445 for(walker = g_obd_unhealthy_list; walker != 0; walker = walker->next)
447 /*If the names match*/
448 if(0 == strcmp (walker->name,obd_name))
450 /* Commented out because it was just to noisy!
451 * DEBUGMSGTL(("lsnmpd","obd %s was already unhealthy\n",obd_name));
454 /*Mark the entry as seen, and return that it was previously unhealthy*/
460 DEBUGMSGTL(("lsnmpd","obd %s is now unhealthy\n",obd_name));
462 /*We didn't find an entry so we need to create a new one. */
463 /*Calculate the obd_name length*/
464 name_len = strlen(obd_name)+1;
466 /*Allocate a new entry*/
467 entry = malloc(sizeof(*entry) + name_len);
469 /*Put this element at the front of the list*/
470 entry->next = g_obd_unhealthy_list;
471 g_obd_unhealthy_list = entry;
473 /*Mark it initially as seen*/
476 /*And copy the entry name*/
477 memcpy(entry->name,obd_name,name_len);
479 /*return this obd as newly unhealthy.*/
484 /*****************************************************************************
485 * Function: obd_unhealthy_scan
487 * Description: Deterime if any obd is going from unhealthy->healthy
488 * Any of the obds that weren't "seen" by the
489 * is_obd_newly_unhealthy() pass are now health so
490 * remove them from the lists
491 * Also clear all "seen" flags.
495 *****************************************************************************/
497 void obd_unhealthy_scan(void)
499 /*fore all elements in g_obd_unhealthy_list*/
500 obd_unhealthy_entry* walker = g_obd_unhealthy_list;
501 obd_unhealthy_entry* prev = 0;
504 /*remove any that was not seen as unhealthy the last time*/
505 if(walker->seen == 0)
507 /*Remove element from the list, but first fix up the walker pointer*/
508 obd_unhealthy_entry* temp = walker;
510 DEBUGMSGTL(("lsnmpd","obd %s is now healthy\n",walker->name));
512 walker = walker->next;
514 /*Now adjust the pointers to effectively remove this entry*/
516 g_obd_unhealthy_list = walker;
520 /*And free the pointer. */
522 /*walker and prev are correctly setup so we can go around the loop again.*/
525 /*Mark all other entries as NOT seen for next pass through*/
529 /*Go onto the next entry*/
531 walker = walker->next;