1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * snmp/lustre-snmp-trap.c
38 * Author: PJ Kirner <pjkirner@clusterfs.com>
 * include important headers
 */
#include <net-snmp/net-snmp-config.h>
#include <net-snmp/net-snmp-includes.h>
#include <net-snmp/agent/net-snmp-agent-includes.h>

#include <sys/types.h>
#if defined (__linux__)
/* NOTE(review): the Linux-only include(s) this guard wrapped are missing from
 * this copy of the file; restore them from the upstream source. */
#endif
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "lustre-snmp-util.h"
/**************************************************************************
 * Constants
 *************************************************************************/

/* Poll period used when the environment does not supply a usable one. */
#define DEFAULT_POLL_INTERVAL_SECONDS 60
/* Name of the environment variable that overrides the poll period. */
#define POLL_INTERVAL_ENV_VAR "LSNMP_POLL_INTERVAL"
/* Name of the environment variable that names a substitute health-check
 * file to read (used for testing). */
#define SNMP_HEALTH_CHECK_TEST_FILE "LSNMP_HEALTH_CHECK_TEST_FILE"
73 /**************************************************************************
75 *************************************************************************/
77 static oid objid_snmptrap[] =
78 { 1,3,6,1,6,3,1,1,4,1,0};
79 static oid lustre_portals_trap[] =
80 { 1,3,6,1,4,1,13140,2,1,0,1};
81 static oid lustre_portals_trap_string[]=
82 { 1,3,6,1,4,1,13140,2,1,0,2};
83 static oid lustre_unhealthy_trap[] =
84 { 1,3,6,1,4,1,13140,2,1,0,3};
85 static oid lustre_unhealthy_trap_device_name_string[]=
86 { 1,3,6,1,4,1,13140,2,1,0,4};
87 static oid lustre_unhealthy_trap_reason_string[]=
88 { 1,3,6,1,4,1,13140,2,1,0,5};
/**************************************************************************
 * Data structures
 *************************************************************************/

/* One node of the singly linked list of OBDs currently known to be
 * unhealthy.  Nodes are allocated with extra room after the struct so the
 * device name is stored inline in 'name'. */
typedef struct obd_unhealthy_entry_struct {

    /* 1 if seen as part of the is_unhealthy scan, otherwise 0 */
    int seen;

    /* singly linked list pointer */
    struct obd_unhealthy_entry_struct *next;

    /* obdname - variable size, NUL terminated */
    char name[];

} obd_unhealthy_entry;
/**************************************************************************
 * Forward declarations
 *************************************************************************/

/* '(void)' makes this a real prototype; an empty parameter list would leave
 * the arguments unchecked. */
int get_poll_interval_seconds(void);
void health_poll_worker(unsigned int registration_number, void *clientarg);
void send_portals_catastrophe_trap(char *reason_string);
void send_obd_unhealthy_trap(char *obd_name,char *reason_string);
int is_obd_newly_unhealthy(const char* obd_name);
void obd_unhealthy_scan(void);
void health_entry_parser(void);
119 /**************************************************************************
121 *************************************************************************/
123 static int g_sent_portals_catastrophe = 0;
124 static obd_unhealthy_entry* g_obd_unhealthy_list = NULL;
125 static int g_poll_interval_seconds;
126 static unsigned int g_registration_handle;
127 static char *g_health_check_test_file = 0;
129 /*****************************************************************************
130 * Function: initilize_trap_handler
132 * Description: Initlized the trap poll haalder.
136 * Output: Global g_poll_interval_seconds is set.
138 ****************************************************************************/
140 void initilize_trap_handler(void)
142 g_poll_interval_seconds = get_poll_interval_seconds();
144 g_registration_handle = snmp_alarm_register(g_poll_interval_seconds, 0, health_poll_worker, NULL);
145 if (g_registration_handle == 0)
146 report("%s %s: line %d %s", __FILE__, __FUNCTION__, __LINE__,
147 "snmp_alarm_register failed");
149 DEBUGMSGTL(("lsnmpd","lsnmp alarm registered poll interval = %d seconds\n",g_poll_interval_seconds));
151 g_health_check_test_file = getenv(SNMP_HEALTH_CHECK_TEST_FILE);
152 if(g_health_check_test_file != 0)
153 DEBUGMSGTL(("lsnmpd","lsnmp health check test file set to \'%s\'\n",g_health_check_test_file));
156 /*****************************************************************************
157 * Function: terminate_trap_handler
159 * Description: Terminate the trap poll haalder.
163 * Output: Global g_poll_interval_seconds is set.
165 ****************************************************************************/
167 void terminate_trap_handler(void)
169 snmp_alarm_unregister(g_registration_handle);
172 /*****************************************************************************
173 * Function: get_poll_interval_seconds
175 * Description: This function used to get the poll period for timer, which
176 * is used to read throughput values periodically.
178 * Output: Alarm period, default value(if env var not set) otherwise.
179 ****************************************************************************/
181 int get_poll_interval_seconds()
184 int ret_val = DEFAULT_POLL_INTERVAL_SECONDS;
186 /* Get Alarm period for reading the Lustre client table. */
188 alarm_period = getenv(POLL_INTERVAL_ENV_VAR);
189 if (alarm_period != NULL) {
190 char *ptr = alarm_period;
191 while(isdigit(*ptr)) ptr++;
193 /* if we have only digits then conver it*/
195 int time = atoi(alarm_period);
197 ret_val = time; /* Alarm period in seconds */
203 /*****************************************************************************
204 * Function: health_poll_worker
206 * Description: This is the routine registered to system timer for updating
207 * the throughput values for all the clients and its respective osc(s).
209 * Input: 'registration_number` value obtained during the alarm registration
210 * 'clientarg' pointing to user defined data type.
212 *****************************************************************************/
214 void health_poll_worker(unsigned int registration_number, void *clientarg)
216 health_entry_parser();
218 /* Register the function again to call after lustre_alarm_period */
219 if (!snmp_alarm_register(g_poll_interval_seconds, 0, health_poll_worker, NULL)) {
220 report("%s %s:line %d %s", __FILE__, __FUNCTION__, __LINE__,
221 "snmp_alarm_register failed");
225 /*****************************************************************************
226 * Function: health_entry_parser
228 * Description: This routine is called to parse the health_check entry
232 *****************************************************************************/
234 void health_entry_parser(void)
237 char string[MAX_LINE_SIZE];
238 int b_seen_portals_catastrophe = 0;
239 const char *filename = g_health_check_test_file == 0 ?
240 LUSTRE_PATH FILENAME_SYSHEALTHCHECK :
241 g_health_check_test_file;
243 /*DEBUGMSGTL(("lsnmpd","health_entry_parser(%s)\n",filename));*/
245 /* Open the file. Use the test file env variable if
247 fptr = fopen(filename,"r");
249 /* If the path is not found do nothing */
253 while( NULL != fgets(string, sizeof(string), fptr)){
255 /*DEBUGMSGTL(("lsnmpd","health_entry_parser() looking at = \'%s\'\n",string));*/
258 * First handle the portals catastrophe
259 * Look for the string "LBUG"
261 if(0 == strncmp(string,"LBUG",4)){
263 * If we haven't sent the catastrophe message yet
264 * send it now. And keep track that we've sent it
266 if(!g_sent_portals_catastrophe){
267 send_portals_catastrophe_trap("LBUG");
268 g_sent_portals_catastrophe = 1;
270 b_seen_portals_catastrophe = 1;
274 * Now handle any of the OBD object failures
275 * look for "device <OBDNAME> reported unhealthy"
277 else if(0 == strncmp(string,"device ",7)){
278 char *obd_name = string+7;
279 char *space_after_obd_name;
282 * Now find the space after the obd name
283 * Again if there is no space we're in trouble
285 space_after_obd_name = strchr(obd_name,' ');
286 if(space_after_obd_name == 0)
290 * Null terminate the obd_name
292 *space_after_obd_name = 0;
294 DEBUGMSGTL(("lsnmpd","Looking at obd=%s\n",obd_name));
297 * If we haven't sent a trap for this one
300 if(is_obd_newly_unhealthy(obd_name))
301 send_obd_unhealthy_trap(obd_name,"unhealthy");
305 /* If we don't find it reset the catastrope flag*/
306 if(!b_seen_portals_catastrophe && g_sent_portals_catastrophe)
308 DEBUGMSGTL(("lsnmpd","LBUG has been cleared\n"));
309 g_sent_portals_catastrophe = 0;
313 * Any <OBDNAMES> that weren't queried above are now unhealthy.
314 * Scan through and cleanup the newly healthy obds
316 obd_unhealthy_scan();
321 /*****************************************************************************
322 * Function: send_portals_catastrophe_trap
324 * Description: Send the SNMP V2 trap
326 * Input: 'reason_string' the reason for the catastrope.
329 *****************************************************************************/
331 void send_portals_catastrophe_trap(char *reason_string)
334 * Setup the trap variables.
335 * It's a linked list of netsnmp_variable_list items.
337 netsnmp_variable_list var_trap[2];
339 DEBUGMSGTL(("lsnmpd","Sending portals catastrophe trap reason=%s\n",reason_string));
342 * Setup the first variable in the trap data.
343 * Have it chain to another variable.
345 var_trap[0].next_variable = &var_trap[1];
347 /*The "name" must be the standard snmp "trap" OID.*/
348 var_trap[0].name = objid_snmptrap;
349 var_trap[0].name_length = sizeof(objid_snmptrap) / sizeof(oid);
351 /*But the data contained in this variable, is an OID that is the trap OID.*/
352 var_trap[0].type = ASN_OBJECT_ID;
353 var_trap[0].val.objid = lustre_portals_trap;
354 var_trap[0].val_len = sizeof(lustre_portals_trap);
357 * Setup the second variable in the trap data.
358 * It is the last in the chain so set next to NULL
360 var_trap[1].next_variable = NULL;
362 /*The "name" is the OID of the portals trap reason strong*/
363 var_trap[1].name = lustre_portals_trap_string;
364 var_trap[1].name_length = sizeof(lustre_portals_trap_string) / sizeof(oid);
366 /*And the data is a octet string, that contains the actually reason string*/
367 var_trap[1].type = ASN_OCTET_STR;
368 var_trap[1].val.string = (unsigned char *)reason_string;
369 var_trap[1].val_len = strlen(reason_string);
371 /*And now send off the trap*/
372 send_v2trap(var_trap);
376 /*****************************************************************************
377 * Function: send_obd_unhealthy_trap
379 * Description: Send the SNMP V2 trap
381 * Input: 'obd_name' the name of the obd
382 * 'reason_string' the reason for the catastrope.
384 *****************************************************************************/
386 void send_obd_unhealthy_trap(char *obd_name,char *reason_string)
389 * Setup the trap variables.
390 * It's a linked list of netsnmp_variable_list items.
392 netsnmp_variable_list var_trap[3];
394 DEBUGMSGTL(("lsnmpd","Sending OBD unhealthy trap obd=%s reason=%s\n",obd_name,reason_string));
397 * Setup the first variable in the trap data.
398 * Have it chain to another variable.
400 var_trap[0].next_variable = &var_trap[1];
402 /*The "name" must be the standard snmp "trap" OID.*/
403 var_trap[0].name = objid_snmptrap;
404 var_trap[0].name_length = sizeof(objid_snmptrap) / sizeof(oid);
406 /*But the data contained in this variable, is an OID that is the trap OID.*/
407 var_trap[0].type = ASN_OBJECT_ID;
408 var_trap[0].val.objid = lustre_unhealthy_trap;
409 var_trap[0].val_len = sizeof(lustre_unhealthy_trap);
412 * Setup the second variable in the trap data.
413 * Have it chain to another variable.
415 var_trap[1].next_variable = &var_trap[2];;
417 /*The "name" is the OID of the portals trap reason strong*/
418 var_trap[1].name = lustre_unhealthy_trap_device_name_string;
419 var_trap[1].name_length = sizeof(lustre_unhealthy_trap_device_name_string) / sizeof(oid);
421 /*And the data is a octet string, that contains the actually reason strong*/
422 var_trap[1].type = ASN_OCTET_STR;
423 var_trap[1].val.string = (unsigned char *)obd_name;
424 var_trap[1].val_len = strlen(obd_name);
427 * Setup the third variable in the trap data.
428 * It is the last in the chain so set next to NULL
430 var_trap[2].next_variable = NULL;
432 /*The "name" is the OID of the portals trap reason strong*/
433 var_trap[2].name = lustre_unhealthy_trap_reason_string;
434 var_trap[2].name_length = sizeof(lustre_unhealthy_trap_reason_string) / sizeof(oid);
436 /*And the data is a octet string, that contains the actually reason strong*/
437 var_trap[2].type = ASN_OCTET_STR;
438 var_trap[2].val.string = (unsigned char *)reason_string;
439 var_trap[2].val_len = strlen(reason_string);
441 /*And now send off the trap*/
442 send_v2trap(var_trap);
446 /*****************************************************************************
447 * Function: is_obd_newly_unhealthy
449 * Description: Deterime if the obd is going from health->unhealth
450 * Also mark all unhealhy (new and old) as seen.
452 * Input: 'obd_name' the name of the obd
454 * Output: 1 if newly unhealthy 0 if previolsy unhealthy
455 *****************************************************************************/
457 int is_obd_newly_unhealthy(const char* obd_name)
459 /*for all elements in g_obd_unhealthy_list*/
460 obd_unhealthy_entry* walker;
461 obd_unhealthy_entry* entry;
464 for(walker = g_obd_unhealthy_list; walker != 0; walker = walker->next)
466 /*If the names match*/
467 if(0 == strcmp (walker->name,obd_name))
469 /* Commented out because it was just to noisy!
470 * DEBUGMSGTL(("lsnmpd","obd %s was already unhealthy\n",obd_name));
473 /*Mark the entry as seen, and return that it was previously unhealthy*/
479 DEBUGMSGTL(("lsnmpd","obd %s is now unhealthy\n",obd_name));
481 /*We didn't find an entry so we need to create a new one. */
482 /*Calculate the obd_name length*/
483 name_len = strlen(obd_name)+1;
485 /*Allocate a new entry*/
486 entry = malloc(sizeof(*entry) + name_len);
488 /*Put this element at the front of the list*/
489 entry->next = g_obd_unhealthy_list;
490 g_obd_unhealthy_list = entry;
492 /*Mark it initially as seen*/
495 /*And copy the entry name*/
496 memcpy(entry->name,obd_name,name_len);
498 /*return this obd as newly unhealthy.*/
503 /*****************************************************************************
504 * Function: obd_unhealthy_scan
506 * Description: Deterime if any obd is going from unhealthy->healthy
507 * Any of the obds that weren't "seen" by the
508 * is_obd_newly_unhealthy() pass are now health so
509 * remove them from the lists
510 * Also clear all "seen" flags.
514 *****************************************************************************/
516 void obd_unhealthy_scan(void)
518 /*fore all elements in g_obd_unhealthy_list*/
519 obd_unhealthy_entry* walker = g_obd_unhealthy_list;
520 obd_unhealthy_entry* prev = 0;
523 /*remove any that was not seen as unhealthy the last time*/
524 if(walker->seen == 0)
526 /*Remove element from the list, but first fix up the walker pointer*/
527 obd_unhealthy_entry* temp = walker;
529 DEBUGMSGTL(("lsnmpd","obd %s is now healthy\n",walker->name));
531 walker = walker->next;
533 /*Now adjust the pointers to effectively remove this entry*/
535 g_obd_unhealthy_list = walker;
539 /*And free the pointer. */
541 /*walker and prev are correctly setup so we can go around the loop again.*/
544 /*Mark all other entries as NOT seen for next pass through*/
548 /*Go onto the next entry*/
550 walker = walker->next;