4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
27 * This file is part of Lustre, http://www.lustre.org/
29 * snmp/lustre-snmp-trap.c
31 * Author: PJ Kirner <pjkirner@clusterfs.com>
35 * include important headers
#include <net-snmp/net-snmp-config.h>
#include <net-snmp/net-snmp-includes.h>
#include <net-snmp/agent/net-snmp-agent-includes.h>

#include <sys/types.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>

#include "lustre-snmp-util.h"
56 /**************************************************************************
58 *************************************************************************/
/* Poll period, in seconds, used when the environment gives no usable value. */
#define DEFAULT_POLL_INTERVAL_SECONDS   60

/* Name of the environment variable that overrides the poll period. */
#define POLL_INTERVAL_ENV_VAR           "LSNMP_POLL_INTERVAL"

/* Name of the environment variable holding a health-check test file path. */
#define SNMP_HEALTH_CHECK_TEST_FILE     "LSNMP_HEALTH_CHECK_TEST_FILE"
64 /**************************************************************************
66 *************************************************************************/
68 static oid objid_snmptrap[] =
69 { 1,3,6,1,6,3,1,1,4,1,0};
70 static oid lustre_portals_trap[] =
71 { 1,3,6,1,4,1,13140,2,1,0,1};
72 static oid lustre_portals_trap_string[]=
73 { 1,3,6,1,4,1,13140,2,1,0,2};
74 static oid lustre_unhealthy_trap[] =
75 { 1,3,6,1,4,1,13140,2,1,0,3};
76 static oid lustre_unhealthy_trap_device_name_string[]=
77 { 1,3,6,1,4,1,13140,2,1,0,4};
78 static oid lustre_unhealthy_trap_reason_string[]=
79 { 1,3,6,1,4,1,13140,2,1,0,5};
81 /**************************************************************************
83 *************************************************************************/
/*
 * One node of the singly linked list of OBDs that have been reported
 * unhealthy.  Nodes are allocated with room for the device name directly
 * after the fixed members, so 'name' must stay the last member.
 */
typedef struct obd_unhealthy_entry_struct {

    /* 1 if seen as part of the is_unhealthy scan, otherwise 0 */
    int seen;

    /* single linked list pointer */
    struct obd_unhealthy_entry_struct *next;

    /* obdname - variable size, NUL terminated */
    char name[];

} obd_unhealthy_entry;
98 /**************************************************************************
100 *************************************************************************/
/* Poll period in seconds: environment override or the built-in default. */
int get_poll_interval_seconds();

/* Alarm callback: runs one health check pass and registers the alarm again. */
void health_poll_worker(unsigned int registration_number, void *clientarg);

/* Trap senders. */
void send_portals_catastrophe_trap(char *reason_string);
void send_obd_unhealthy_trap(char *obd_name, char *reason_string);

/* Bookkeeping for the list of OBDs already reported unhealthy. */
int is_obd_newly_unhealthy(const char *obd_name);
void obd_unhealthy_scan(void);

/* Reads the health_check file and triggers the traps. */
void health_entry_parser(void);
110 /**************************************************************************
112 *************************************************************************/
114 static int g_sent_portals_catastrophe = 0;
115 static obd_unhealthy_entry* g_obd_unhealthy_list = NULL;
116 static int g_poll_interval_seconds;
117 static unsigned int g_registration_handle;
118 static char *g_health_check_test_file = 0;
120 /*****************************************************************************
121 * Function: initialize_trap_handler
123 * Description: Initlized the trap poll haalder.
127 * Output: Global g_poll_interval_seconds is set.
129 ****************************************************************************/
131 void initialize_trap_handler(void)
133 g_poll_interval_seconds = get_poll_interval_seconds();
135 g_registration_handle = snmp_alarm_register(g_poll_interval_seconds, 0, health_poll_worker, NULL);
136 if (g_registration_handle == 0)
137 report("%s %s: line %d %s", __FILE__, __FUNCTION__, __LINE__,
138 "snmp_alarm_register failed");
140 DEBUGMSGTL(("lsnmpd","lsnmp alarm registered poll interval = %d seconds\n",g_poll_interval_seconds));
142 g_health_check_test_file = getenv(SNMP_HEALTH_CHECK_TEST_FILE);
143 if(g_health_check_test_file != 0)
144 DEBUGMSGTL(("lsnmpd","lsnmp health check test file set to \'%s\'\n",g_health_check_test_file));
147 /*****************************************************************************
148 * Function: terminate_trap_handler
150 * Description: Terminate the trap poll haalder.
154 * Output: Global g_poll_interval_seconds is set.
156 ****************************************************************************/
158 void terminate_trap_handler(void)
160 snmp_alarm_unregister(g_registration_handle);
163 /*****************************************************************************
164 * Function: get_poll_interval_seconds
166 * Description: This function used to get the poll period for timer, which
167 * is used to read throughput values periodically.
169 * Output: Alarm period, default value(if env var not set) otherwise.
170 ****************************************************************************/
172 int get_poll_interval_seconds()
175 int ret_val = DEFAULT_POLL_INTERVAL_SECONDS;
177 /* Get Alarm period for reading the Lustre client table. */
179 alarm_period = getenv(POLL_INTERVAL_ENV_VAR);
180 if (alarm_period != NULL) {
181 char *ptr = alarm_period;
182 while(isdigit(*ptr)) ptr++;
184 /* if we have only digits then conver it*/
186 int time = atoi(alarm_period);
188 ret_val = time; /* Alarm period in seconds */
194 /*****************************************************************************
195 * Function: health_poll_worker
197 * Description: This is the routine registered to system timer for updating
198 * the throughput values for all the clients and its respective osc(s).
200 * Input: 'registration_number` value obtained during the alarm registration
201 * 'clientarg' pointing to user defined data type.
203 *****************************************************************************/
205 void health_poll_worker(unsigned int registration_number, void *clientarg)
207 health_entry_parser();
209 /* Register the function again to call after lustre_alarm_period */
210 if (!snmp_alarm_register(g_poll_interval_seconds, 0, health_poll_worker, NULL)) {
211 report("%s %s:line %d %s", __FILE__, __FUNCTION__, __LINE__,
212 "snmp_alarm_register failed");
216 /*****************************************************************************
217 * Function: health_entry_parser
219 * Description: This routine is called to parse the health_check entry
223 *****************************************************************************/
225 void health_entry_parser(void)
228 char string[MAX_LINE_SIZE];
229 int b_seen_portals_catastrophe = 0;
233 if (cfs_get_param_paths(&path, "health_check") != 0)
236 filename = g_health_check_test_file == 0 ? path.gl_pathv[0] : g_health_check_test_file;
238 /*DEBUGMSGTL(("lsnmpd","health_entry_parser(%s)\n",filename));*/
240 /* Open the file. Use the test file env variable if
242 fptr = fopen(filename,"r");
244 /* Free parameter's path string */
245 cfs_free_param_data(&path);
247 /* If the path is not found do nothing */
251 while( NULL != fgets(string, sizeof(string), fptr)){
253 /*DEBUGMSGTL(("lsnmpd","health_entry_parser() looking at = \'%s\'\n",string));*/
256 * First handle the portals catastrophe
257 * Look for the string "LBUG"
259 if(0 == strncmp(string,"LBUG",4)){
261 * If we haven't sent the catastrophe message yet
262 * send it now. And keep track that we've sent it
264 if(!g_sent_portals_catastrophe){
265 send_portals_catastrophe_trap("LBUG");
266 g_sent_portals_catastrophe = 1;
268 b_seen_portals_catastrophe = 1;
272 * Now handle any of the OBD object failures
273 * look for "device <OBDNAME> reported unhealthy"
275 else if(0 == strncmp(string,"device ",7)){
276 char *obd_name = string+7;
277 char *space_after_obd_name;
280 * Now find the space after the obd name
281 * Again if there is no space we're in trouble
283 space_after_obd_name = strchr(obd_name,' ');
284 if(space_after_obd_name == 0)
288 * Null terminate the obd_name
290 *space_after_obd_name = 0;
292 DEBUGMSGTL(("lsnmpd","Looking at obd=%s\n",obd_name));
295 * If we haven't sent a trap for this one
298 if(is_obd_newly_unhealthy(obd_name))
299 send_obd_unhealthy_trap(obd_name,"unhealthy");
303 /* If we don't find it reset the catastrope flag*/
304 if(!b_seen_portals_catastrophe && g_sent_portals_catastrophe)
306 DEBUGMSGTL(("lsnmpd","LBUG has been cleared\n"));
307 g_sent_portals_catastrophe = 0;
311 * Any <OBDNAMES> that weren't queried above are now unhealthy.
312 * Scan through and cleanup the newly healthy obds
314 obd_unhealthy_scan();
319 /*****************************************************************************
320 * Function: send_portals_catastrophe_trap
322 * Description: Send the SNMP V2 trap
324 * Input: 'reason_string' the reason for the catastrope.
327 *****************************************************************************/
329 void send_portals_catastrophe_trap(char *reason_string)
332 * Setup the trap variables.
333 * It's a linked list of netsnmp_variable_list items.
335 netsnmp_variable_list var_trap[2];
337 DEBUGMSGTL(("lsnmpd","Sending portals catastrophe trap reason=%s\n",reason_string));
340 * Setup the first variable in the trap data.
341 * Have it chain to another variable.
343 var_trap[0].next_variable = &var_trap[1];
345 /*The "name" must be the standard snmp "trap" OID.*/
346 var_trap[0].name = objid_snmptrap;
347 var_trap[0].name_length = sizeof(objid_snmptrap) / sizeof(oid);
349 /*But the data contained in this variable, is an OID that is the trap OID.*/
350 var_trap[0].type = ASN_OBJECT_ID;
351 var_trap[0].val.objid = lustre_portals_trap;
352 var_trap[0].val_len = sizeof(lustre_portals_trap);
355 * Setup the second variable in the trap data.
356 * It is the last in the chain so set next to NULL
358 var_trap[1].next_variable = NULL;
360 /* The "name" is the OID of the portals trap reason string */
361 var_trap[1].name = lustre_portals_trap_string;
362 var_trap[1].name_length = sizeof(lustre_portals_trap_string) / sizeof(oid);
364 /* And the data is an octet string, that contains the actually reason
366 var_trap[1].type = ASN_OCTET_STR;
367 var_trap[1].val.string = (unsigned char *)reason_string;
368 var_trap[1].val_len = strlen(reason_string);
370 /*And now send off the trap*/
371 send_v2trap(var_trap);
375 /*****************************************************************************
376 * Function: send_obd_unhealthy_trap
378 * Description: Send the SNMP V2 trap
380 * Input: 'obd_name' the name of the obd
381 * 'reason_string' the reason for the catastrope.
383 *****************************************************************************/
385 void send_obd_unhealthy_trap(char *obd_name,char *reason_string)
388 * Setup the trap variables.
389 * It's a linked list of netsnmp_variable_list items.
391 netsnmp_variable_list var_trap[3];
393 DEBUGMSGTL(("lsnmpd","Sending OBD unhealthy trap obd=%s reason=%s\n",obd_name,reason_string));
396 * Setup the first variable in the trap data.
397 * Have it chain to another variable.
399 var_trap[0].next_variable = &var_trap[1];
401 /*The "name" must be the standard snmp "trap" OID.*/
402 var_trap[0].name = objid_snmptrap;
403 var_trap[0].name_length = sizeof(objid_snmptrap) / sizeof(oid);
405 /*But the data contained in this variable, is an OID that is the trap OID.*/
406 var_trap[0].type = ASN_OBJECT_ID;
407 var_trap[0].val.objid = lustre_unhealthy_trap;
408 var_trap[0].val_len = sizeof(lustre_unhealthy_trap);
411 * Setup the second variable in the trap data.
412 * Have it chain to another variable.
414 var_trap[1].next_variable = &var_trap[2];;
416 /* The "name" is the OID of the portals trap reason string */
417 var_trap[1].name = lustre_unhealthy_trap_device_name_string;
418 var_trap[1].name_length = sizeof(lustre_unhealthy_trap_device_name_string) / sizeof(oid);
420 /* And the data is an octet string, that contains the actual reason
422 var_trap[1].type = ASN_OCTET_STR;
423 var_trap[1].val.string = (unsigned char *)obd_name;
424 var_trap[1].val_len = strlen(obd_name);
427 * Setup the third variable in the trap data.
428 * It is the last in the chain so set next to NULL
430 var_trap[2].next_variable = NULL;
432 /* The "name" is the OID of the portals trap reason string */
433 var_trap[2].name = lustre_unhealthy_trap_reason_string;
434 var_trap[2].name_length = sizeof(lustre_unhealthy_trap_reason_string) / sizeof(oid);
436 /* And the data is an octet string, that contains the actual reason
438 var_trap[2].type = ASN_OCTET_STR;
439 var_trap[2].val.string = (unsigned char *)reason_string;
440 var_trap[2].val_len = strlen(reason_string);
442 /*And now send off the trap*/
443 send_v2trap(var_trap);
447 /*****************************************************************************
448 * Function: is_obd_newly_unhealthy
450 * Description: Deterime if the obd is going from health->unhealth
451 * Also mark all unhealhy (new and old) as seen.
453 * Input: 'obd_name' the name of the obd
455 * Output: 1 if newly unhealthy 0 if previolsy unhealthy
456 *****************************************************************************/
458 int is_obd_newly_unhealthy(const char* obd_name)
460 /*for all elements in g_obd_unhealthy_list*/
461 obd_unhealthy_entry* walker;
462 obd_unhealthy_entry* entry;
465 for(walker = g_obd_unhealthy_list; walker != 0; walker = walker->next)
467 /*If the names match*/
468 if(0 == strcmp (walker->name,obd_name))
470 /* Commented out because it was just to noisy!
471 * DEBUGMSGTL(("lsnmpd","obd %s was already unhealthy\n",obd_name));
474 /*Mark the entry as seen, and return that it was previously unhealthy*/
480 DEBUGMSGTL(("lsnmpd","obd %s is now unhealthy\n",obd_name));
482 /*We didn't find an entry so we need to create a new one. */
483 /*Calculate the obd_name length*/
484 name_len = strlen(obd_name)+1;
486 /*Allocate a new entry*/
487 entry = malloc(sizeof(*entry) + name_len);
489 /*Put this element at the front of the list*/
490 entry->next = g_obd_unhealthy_list;
491 g_obd_unhealthy_list = entry;
493 /*Mark it initially as seen*/
496 /*And copy the entry name*/
497 memcpy(entry->name,obd_name,name_len);
499 /*return this obd as newly unhealthy.*/
504 /*****************************************************************************
505 * Function: obd_unhealthy_scan
507 * Description: Deterime if any obd is going from unhealthy->healthy
508 * Any of the obds that weren't "seen" by the
509 * is_obd_newly_unhealthy() pass are now health so
510 * remove them from the lists
511 * Also clear all "seen" flags.
515 *****************************************************************************/
517 void obd_unhealthy_scan(void)
519 /*fore all elements in g_obd_unhealthy_list*/
520 obd_unhealthy_entry* walker = g_obd_unhealthy_list;
521 obd_unhealthy_entry* prev = 0;
524 /*remove any that was not seen as unhealthy the last time*/
525 if(walker->seen == 0)
527 /*Remove element from the list, but first fix up the walker pointer*/
528 obd_unhealthy_entry* temp = walker;
530 DEBUGMSGTL(("lsnmpd","obd %s is now healthy\n",walker->name));
532 walker = walker->next;
534 /*Now adjust the pointers to effectively remove this entry*/
536 g_obd_unhealthy_list = walker;
540 /*And free the pointer. */
542 /*walker and prev are correctly setup so we can go around the loop again.*/
545 /*Mark all other entries as NOT seen for next pass through*/
549 /*Go onto the next entry*/
551 walker = walker->next;