[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
timestamp monitoring + code simplification patch
From: |
Martin Pala |
Subject: |
timestamp monitoring + code simplification patch |
Date: |
Wed, 27 Nov 2002 23:14:01 +0100 |
User-agent: |
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.1) Gecko/20020913 Debian/1.1-1 |
Hi,
I recently wrote a feature for monit that allows monitoring the timestamp
of a file or directory.
I need this feature to watch the health of the iPlanet Messaging Server
stored process (it is critical) - this process periodically updates the
timestamps of 3 independent state files. As soon as the timestamp of any
of these files is older than expected, it signals that one of the tasks
the stored daemon performs has failed and real hell is starting. The new
statement has the following syntax:
TIMESTAMP object [operator] value [unit] [action]
A complete description is in the documentation included in the patch
(monit.pod). It is very similar to the resource monitoring options and
uses common functions. It also allows monitoring directories, so you can
watch whether a directory is in use (files are added or removed). You can
alert, restart or stop the process. iPlanet doesn't have any equivalent
tool, so maybe this can raise monit's popularity a little bit again :)
I did a lot of code simplification/cleanup for it and changed some
functions to be more general so that other parts of monit can use them
too (this affects the monitrc file parser as well).
There are also a few minor bugfixes, code beautification, etc.
I'm still busy (I hope it will be over sometime), so I'm sending it as
one big patch, even though it would be better to split it into more
independent parts for clarity.
What do you think about it - can I commit it (let's vote)?
Thanks,
Martin
diff -Naur monit/CHANGES.txt monit.cvs-20021127/CHANGES.txt
--- monit/CHANGES.txt 2002-11-18 22:54:05.000000000 +0100
+++ monit.cvs-20021127/CHANGES.txt 2002-11-27 22:27:20.000000000 +0100
@@ -21,6 +21,8 @@
* The location of the pidfile can be modified in the control file
with the 'set pidfile' statement.
+* This release supports timestamp monitoring for file or directory.
+
Version 3.0
diff -Naur monit/alert.c monit.cvs-20021127/alert.c
--- monit/alert.c 2002-11-03 22:50:30.000000000 +0100
+++ monit.cvs-20021127/alert.c 2002-11-27 22:27:20.000000000 +0100
@@ -31,12 +31,13 @@
#include "net.h"
#include "alert.h"
-#define DO_TIMEOUT 0
-#define DO_RESTART 1
-#define DO_CHECKSUM 2
-#define DO_RESOURCE 3
-#define DO_STOP 4
-#define DO_FAILED 5
+#define DO_TIMEOUT 0
+#define DO_RESTART 1
+#define DO_CHECKSUM 2
+#define DO_RESOURCE 3
+#define DO_STOP 4
+#define DO_FAILED 5
+#define DO_TIMESTAMP 6
/* Private Prototypes */
@@ -49,11 +50,11 @@
/* Private Variables */
static char desc[][STRLEN]= {"timed out", "restarted", "checksum error",
"matches resource limitation", "stopped",
- "failed"};
+ "failed", "timestamp error"};
static char desclog[][STRLEN]= {"Timeout", "Restart", "Checksum error",
"Resource limit matched", "Stop",
- "Failed"};
+ "Failed", "Timestamp error"};
/**
@@ -173,6 +174,22 @@
}
+/**
+ * Send an alert timestamp message to the email address for this process
+ * @param p A process_t object
+ * @param m An optional message string. May be NULL.
+ */
+void smtp_alert_timestamp(Process_T p, char *m, ...) {
+
+ va_list ap;
+
+ va_start(ap, m);
+ smtp_alert(p, DO_TIMESTAMP, m, ap);
+ va_end(ap);
+
+}
+
+
/* ----------------------------------------------------------------- Private */
diff -Naur monit/alert.h monit.cvs-20021127/alert.h
--- monit/alert.h 2002-11-03 22:51:57.000000000 +0100
+++ monit.cvs-20021127/alert.h 2002-11-27 22:27:20.000000000 +0100
@@ -40,5 +40,6 @@
void smtp_alert_resource(Process_T, char *, ...);
void smtp_alert_stop(Process_T, char *, ...);
void smtp_alert_failed(Process_T, char *, ...);
+void smtp_alert_timestamp(Process_T, char *, ...);
#endif
diff -Naur monit/files.c monit.cvs-20021127/files.c
--- monit/files.c 2002-11-18 22:54:05.000000000 +0100
+++ monit.cvs-20021127/files.c 2002-11-27 22:31:00.000000000 +0100
@@ -46,15 +46,12 @@
#include "monitor.h"
-/* Private variables */
-static time_t rc_last_modified;
-
-
/**
* Utilities used for managing files used by monit.
*
* @author Jan-Henrik Haukeland, <address@hidden>
* @author Christian Hopp, <address@hidden>
+ * @author Martin Pala, <address@hidden>
*
* @version \$Id: files.c,v 1.17 2002/11/18 12:51:56 chopp Exp $
*
@@ -91,7 +88,7 @@
}
- set_file_timestamp();
+ Run.timestamp= get_timestamp(Run.controlfile, S_IFREG);
}
@@ -111,26 +108,49 @@
*/
void re_init_files() {
- set_file_timestamp();
+ Run.timestamp= get_timestamp(Run.controlfile, S_IFREG);
}
/**
- * Set the Runtime control file's last modified timestamp
-*/
-void set_file_timestamp() {
+ * Get an object's last modified timestamp.
+ * @param object An object to stat
+ * @param type Requested object's type
+ * @return Max of either st_mtime or st_ctime or
+ * FALSE if not found or different type of object
+ */
+time_t get_timestamp(char *object, mode_t type) {
struct stat buf;
- if ( !stat(Run.controlfile, &buf) ) {
-
- rc_last_modified= buf.st_mtime;
-
+ ASSERT(object);
+
+ if( !stat(object, &buf) ) {
+
+ if( ((type == S_IFREG) && S_ISREG(buf.st_mode)) ||
+ ((type == S_IFDIR) && S_ISDIR(buf.st_mode)) ||
+ ((type == (S_IFREG|S_IFDIR)) && (S_ISREG(buf.st_mode) || S_ISDIR(buf.st_mode)))
+ ) {
+
+ return MAXIMUM(buf.st_mtime, buf.st_ctime);
+
+ } else {
+
+ error("%s: Invalid object type - %s\n", prog, object);
+
+ }
+
+ } else {
+
+ error("%s: Cannot stat object - %s\n", prog, object);
+
}
+ return FALSE;
+
}
-
+
/**
* Search the system for the monit control file. Try first
@@ -211,38 +231,8 @@
*/
int is_rcfile_changed() {
- struct stat buf;
-
- if ( !stat(Run.controlfile, &buf) ) {
-
- return (buf.st_mtime != rc_last_modified);
-
- }
-
- return FALSE;
-
-}
-
-
-/**
- * Get a files last modified timestamp. This function returns the max
- * of either st_mtime or st_ctime. If the file does not exist or is
- * not a regular file FALSE is returned
- * @param file A file to stat
- * @return last modification time or FALSE if not found or not a regular file
-*/
-time_t file_changedtime(char *file) {
-
- struct stat buf;
-
- ASSERT(file);
-
- if(!stat(file, &buf))
- if(S_ISREG(buf.st_mode))
- return MAXIMUM(buf.st_mtime, buf.st_ctime);
+ return( get_timestamp(Run.controlfile, S_IFREG) != Run.timestamp );
- return FALSE;
-
}
@@ -352,7 +342,7 @@
if(( buf.st_mode & 0777 ) & ~permmask) {
/*
- Expaination:
+ Explanation:
buf.st_mode & 0777 -> We just want to check the
permissions not the file type...
diff -Naur monit/gc.c monit.cvs-20021127/gc.c
--- monit/gc.c 2002-09-29 21:25:39.000000000 +0200
+++ monit.cvs-20021127/gc.c 2002-11-27 22:27:20.000000000 +0100
@@ -33,12 +33,14 @@
static void _gcpcl(Checksum_T*);
static void _gcpql(Resource_T*);
static void _gcppil(ProcInfo_T*);
+static void _gcptl(Timestamp_T*);
/**
* Release allocated memory.
*
* @author Jan-Henrik Haukeland, <address@hidden>
+ * @author Martin Pala, <address@hidden>
*
* @version \$Id: gc.c,v 1.9 2002/09/27 09:52:29 chopp Exp $
*
@@ -92,6 +94,12 @@
}
+ if((*p)->timestamplist) {
+
+ _gcptl(&(*p)->timestamplist);
+
+ }
+
free((*p)->name);
free((*p)->pidfile);
free((*p)->group);
@@ -197,3 +205,19 @@
*pi= NULL;
}
+
+
+static void _gcptl(Timestamp_T *p) {
+
+ if((*p)->next) {
+
+ _gcptl(&(*p)->next);
+
+ }
+
+ free((*p)->pathname);
+ free(*p);
+ *p= NULL;
+
+}
+
diff -Naur monit/http/cervlet.c monit.cvs-20021127/http/cervlet.c
--- monit/http/cervlet.c 2002-11-26 21:16:37.000000000 +0100
+++ monit.cvs-20021127/http/cervlet.c 2002-11-27 22:27:20.000000000 +0100
@@ -613,6 +613,16 @@
out_print(res, "<tr><td>Associated checksum</a></td><td>%s %s</td>"
"</tr>", c->md5, c->file);
}
+ {
+ struct mytimestamp *t;
+ for(t= p->timestamplist; t; t= t->next)
+ out_print(res,
+ "<tr>
+ <td>Associated timestamp</a></td>
+ <td>If %s %s %d second(s) then %s</td>"
+ "</tr>",
+ t->pathname, operatornames[t->operator], t->time, actionnames[t->action]);
+ }
out_print(res,
"<tr><td>Timeout</a></td><td>Timeout if %d restart within %d cycles"
"</td></tr>", p->to_start, p->to_cycle);
@@ -646,6 +656,9 @@
out_print(res,
"<tr><td>Alert on stop</a></td><td>%s</td></tr>",
r->alert_on_stop?"yes":"no");
+ out_print(res,
+ "<tr><td>Alert on timestamp</a></td><td>%s</td></tr>",
+ r->alert_on_timestamp?"yes":"no");
}
}
out_print(res,
diff -Naur monit/l.l monit.cvs-20021127/l.l
--- monit/l.l 2002-11-25 20:54:47.000000000 +0100
+++ monit.cvs-20021127/l.l 2002-11-27 22:27:20.000000000 +0100
@@ -152,6 +152,11 @@
cpuusage { return CPUUSAGE; }
memusage { return MEMUSAGE; }
memkbyte { return MEMKBYTE; }
+timestamp { return TIMESTAMP; }
+second(s)? { return SECOND; }
+minute(s)? { return MINUTE; }
+hour(s)? { return HOUR; }
+day(s)? { return DAY; }
{loadavg1} { return LOADAVG1; }
{loadavg5} { return LOADAVG5; }
diff -Naur monit/monit.pod monit.cvs-20021127/monit.pod
--- monit/monit.pod 2002-11-25 20:54:47.000000000 +0100
+++ monit.cvs-20021127/monit.pod 2002-11-27 22:33:52.000000000 +0100
@@ -284,14 +284,21 @@
=head1 ALERT MESSAGES
-monit will send an email alert if a program timed out, if monit
-restarted or stopped a program, a resource statement matches
-(see also the section RESOURCE TESTING below) or if a checksum
-error occurred (see also the section MD5 CHECKSUM below). More
-than one alert statement can be used in a process entry. This
-means that you can send different emails to different addresses.
-The full syntax for the alert statement is as follows:
-(keywords are in capital)
+monit will send an email alert if:
+
+ o The program timed out
+ o Monit restarted the program
+ o Monit stopped the program
+ o Timestamp test didn't pass
+ o Resource statement matches (see also the section RESOURCE
+ TESTING below)
+ o Checksum error occurred (see also the section MD5 CHECKSUM
+ below)
+
+More than one alert statement can be used in a process entry.
+This means that you can send different emails to different
+addresses. The full syntax for the alert statement is as
+follows (keywords are in capital):
ALERT mail-address [{events}] [MAIL-FORMAT {mail-format}]
@@ -300,10 +307,11 @@
alert address@hidden
will send a default email alert to the address address@hidden whenever a
-timeout, restart, checksum, stop or resource error occurs.
+timeout, restart, checksum, resource, stop or timestamp error
+occurs.
If you only want an alert message sent when a certain event
-occurs for example a timeout or when a program is restarted;
+occurs for example a timeout or when a program is restarted,
postfix the alert-statement respectively
alert address@hidden only on { timeout } or
@@ -333,15 +341,21 @@
The following alert-statement:
- alert address@hidden { timeout, restart, checksum, resource, stop }
+ alert address@hidden { timeout
+ restart
+ checksum
+ resource
+ stop
+ timestamp }
is equivalent to:
alert address@hidden
which as stated above, will send a message when a timeout, a
-restart or a checksum error occurs. (If the post fix variant is
-used, then note that the parenthesis are I<mandatory>).
+restart, checksum, resource, stop or timestamp error occurs.
+(If the post fix variant is used, then note that the parenthesis
+are I<mandatory>).
A restart alert is also sent I<if monit fails to execute> the
start or the stop program for an entry. B<It is therefor strongly
@@ -474,6 +488,7 @@
timeout lock in the daemon and make the daemon start and check
the program again.
+
=head1 RESOURCE TESTING
Monit can examine how much system resources a service or the
@@ -510,13 +525,15 @@
I<operator> is a choice of "<",">","!=","==" in c notation, "gt",
"lt", "eq", "ne" in shell sh notation and "greater", "less",
-"equal", "notequal" in human readable form.
+"equal", "notequal" in human readable form (if not specified,
+default is EQUAL).
I<cycles> is the maximum number of cycles the expression above
-has to be true in order to start an action. If I<cylces> is
+has to be true in order to start an action. If I<cycles> is
omitted then it is set to one.
-I<action> is a choice of "ALERT", "RESTART", "STOP".
+I<action> is a choice of "ALERT", "RESTART", "STOP" (if not
+specified, default is ALERT):
ALERT sends the user a resource alert in case the maximum number
of cycles has been reached.
@@ -549,6 +566,70 @@
See also the example section below.
+=head1 TIMESTAMP TESTING
+
+Monit can watch the timestamp of any file or directory associated
+with a monitored program.
+
+Full syntax for the timestamp statements is as follows (keywords
+are in capital and optional statements in [brackets]):
+
+ TIMESTAMP object [operator] value [unit] [action]
+
+I<object> is path to the associated file or directory to watch.
+
+I<operator> is a choice of "<",">","!=","==" in c notation, "GT",
+"LT", "EQ", "NE" in shell sh notation and "GREATER", "LESS",
+"EQUAL", "NOTEQUAL" in human readable form (if not specified,
+default is EQUAL).
+
+I<value> is time watermark.
+
+I<unit> is either "SECOND", "MINUTE", "HOUR" or "DAY" (it is also
+possible to use "SECONDS", "MINUTES", "HOURS", or "DAYS").
+
+I<action> is a choice of "ALERT", "RESTART", "STOP" (if not
+specified, default is ALERT):
+
+ o ALERT sends the user a timestamp alert.
+
+ o RESTART restarts the service.
+
+ o STOP stops the service. If monit stops a service it will not
+ be checked by monit anymore nor restarted again later. You
+ must explicit start it again from the web interface or from
+ the console, like: 'monit start apache' if you want the monit
+ daemon to monitor the service again.
+
+This is useful, for example, for monitoring processes that report
+their task state by changing the timestamp of state files, such as
+the iPlanet Messaging server stored process. It updates the
+timestamp of:
+
+ o stored.ckp
+ o stored.lcu
+ o stored.per
+
+whenever it runs appropriate task. If the task failed, it keeps
+timestamp.
+
+To report stored problems you can use following statements:
+
+ timestamp "/ip/msg-foo/config/stored.ckp" > 1 minute then alert
+ timestamp "/ip/msg-foo/config/stored.lcu" > 5 minutes then alert
+ timestamp "/ip/msg-foo/config/stored.per" > 1 hour then alert
+
+or equivalent less verbose form:
+
+ timestamp "/ip/msg-foo/config/stored.ckp" > 60
+ timestamp "/ip/msg-foo/config/stored.lcu" > 300
+ timestamp "/ip/msg-foo/config/stored.per" > 3600
+
+As mentioned, you can use it also for monitoring some directory
+for changes. If files are added or removed to/from directory, its
+timestamp is changed.
+
+
=head1 CONNECTION TESTING
Monit is able to perfom connection testing via networked ports
@@ -1036,21 +1117,25 @@
is the cycle interval to test restarts.
This statement is optional
alert Specifies an email address for notification
- if checksum, timeout, stop or restart occurs.
- Alert can also be postfixed, to only send a
- message for certain events. See the examples
- above. More than one alert statement is allowed
- in an entry. This statement is also optional.
+ if checksum, timeout, restart, stop or timestamp
+ occurs. Alert can also be postfixed, to only
+ send a message for certain events. See the
+ examples above. More than one alert statement
+ is allowed in an entry. This statement is also
+ optional.
mail-format Specifies a mail format for an alert message
This statement is an optional part of the
alert statement.
checksum Specify that monit should verify a checksum
for associated files.
- More than one checksum statement are allowed
+ More than one checksum statement are allowed.
expect Specifies a checksum string (md5) monit
should use when testing the checksum. This
statement is an optional part of the
checksum statement.
+ timestamp Specifies expected timestamp for given object
+ and optional action. More than one timestamp
+ statement is allowed.
every Validate this entry only at every n poll
cycle. Usefull in daemon mode when the
poll-cycle is short and the program takes
@@ -1100,8 +1185,8 @@
I<timeout>, I<checksum>, I<resource>, I<expect>, I<mailserver>,
I<every>, I<mode>, I<active>, I<passive>, I<manual>, I<host>,
I<default>, I<http>, I<ftp>, I<smtp>, I<pop>, I<nntp>, I<imap>,
-I<ssh>, I<request>, I<cpuusage>, I<memusage>, I<memkbyte> and
-I<loadavg>.
+I<ssh>, I<request>, I<cpuusage>, I<memusage>, I<memkbyte>, I<loadavg>,
+I<timestamp>, I<second(s)>, I<minute(s)>, I<hour(s)> and I<day(s)>.
And here is a complete list of B<noise keywords> ignored by
monit:
diff -Naur monit/monit_process.c monit.cvs-20021127/monit_process.c
--- monit/monit_process.c 2002-11-03 22:50:30.000000000 +0100
+++ monit.cvs-20021127/monit_process.c 2002-11-27 22:27:20.000000000 +0100
@@ -73,15 +73,14 @@
/* ------------------------------------------------------------------ Public */
+char actionnames[][STRLEN]= {"ignore", "alert", "restart", "stop"};
+char modenames[][STRLEN]= {"active", "passive", "manual"};
+char operatorshortnames[][3]= {">", "<", "=", "!="};
+char operatornames[][STRLEN]= {"greater than", "less than", "equal to",
+ "not equal to"};
-int include_children= TRUE;
-char actionnames[][STRLEN]={"ignore", "alert", "restart", "stop"};
-char operatornames[][STRLEN]={"greater than", "less than",
- "equal to", "not equal to"};
-char operatorshortnames[][3]={">", "<", "=", "!="};
-char modenames[][STRLEN]={"active", "passive", "manual"};
-
-int num_cpus=1;
+int include_children= TRUE;
+int num_cpus=1;
/**
@@ -119,73 +118,6 @@
/**
- * Compare a value within a resource list member
- * @param value the value to compare
- * @param q resouce list member
- * @return result of comparison, either TRUE or FALSE
- */
-int compare_resource(int value, Resource_T q) {
-
- switch ( q->operator ) {
-
- case RESOURCE_OPERATOR_GREATER:
- if ( value > q->limit ) {
-
- return TRUE;
-
- } else {
-
- return FALSE;
-
- }
-
- case RESOURCE_OPERATOR_LESS:
- if ( value < q->limit ) {
-
- return TRUE;
-
- } else {
-
- return FALSE;
-
- }
-
- case RESOURCE_OPERATOR_EQUAL:
- if ( value == q->limit ) {
-
- return TRUE;
-
- } else {
-
- return FALSE;
-
- }
-
- case RESOURCE_OPERATOR_NOTEQUAL:
- if ( value != q->limit ) {
-
-
- return TRUE;
-
- } else {
-
- return FALSE;
-
- }
-
- default:
-
- log("Unknow resource comparison operator.\n");
-
- /* Internal Alert ??? */
-
- return FALSE;
-
- }
-}
-
-
-/**
* Updates the loadavg list
* @return TRUE if successful, otherwise FALSE
*/
diff -Naur monit/monit_process.h monit.cvs-20021127/monit_process.h
--- monit/monit_process.h 2002-11-03 22:51:57.000000000 +0100
+++ monit.cvs-20021127/monit_process.h 2002-11-27 22:27:20.000000000 +0100
@@ -26,15 +26,13 @@
#include <unistd.h>
#endif
-#include "monitor.h"
-
#define PROCESS_ZOMBIE 1
extern int include_children;
extern char actionnames[][STRLEN];
+extern char modenames[][STRLEN];
extern char operatornames[][STRLEN];
extern char operatorshortnames[][3];
-extern char modenames[][STRLEN];
extern int num_cpus;
#define RESOURCE_ID_CPU_PERCENT 1
@@ -44,18 +42,8 @@
#define RESOURCE_ID_LOAD5 5
#define RESOURCE_ID_LOAD15 6
-#define RESOURCE_ACTION_ALERT 1
-#define RESOURCE_ACTION_RESTART 2
-#define RESOURCE_ACTION_STOP 3
-
-#define RESOURCE_OPERATOR_GREATER 0
-#define RESOURCE_OPERATOR_LESS 1
-#define RESOURCE_OPERATOR_EQUAL 2
-#define RESOURCE_OPERATOR_NOTEQUAL 3
-
int update_process_data(Process_T p, pid_t pid);
int init_process_info(void);
-int compare_resource(int, Resource_T);
int update_loadavg(void);
#endif
diff -Naur monit/monitor.c monit.cvs-20021127/monitor.c
--- monit/monitor.c 2002-11-26 21:16:37.000000000 +0100
+++ monit.cvs-20021127/monitor.c 2002-11-27 22:27:20.000000000 +0100
@@ -243,9 +243,6 @@
}
- /* Re-Initialize Runtime file variables */
- re_init_files();
-
}
diff -Naur monit/monitor.h monit.cvs-20021127/monitor.h
--- monit/monitor.h 2002-11-26 21:16:37.000000000 +0100
+++ monit.cvs-20021127/monitor.h 2002-11-27 22:27:20.000000000 +0100
@@ -71,6 +71,20 @@
#define MODE_PASSIVE 1
#define MODE_MANUAL 2
+#define OPERATOR_GREATER 0
+#define OPERATOR_LESS 1
+#define OPERATOR_EQUAL 2
+#define OPERATOR_NOTEQUAL 3
+
+#define TIME_SECOND 1
+#define TIME_MINUTE 60
+#define TIME_HOUR 3600
+#define TIME_DAY 86400
+
+#define ACTION_ALERT 1
+#define ACTION_RESTART 2
+#define ACTION_STOP 3
+
#define MAXIMUM(x,y) ((x) > (y) ? (x) : (y))
#define is(a,b) ((a&&b)?!strcasecmp(a, b):0)
@@ -125,6 +139,7 @@
int doprocess; /**< TRUE if process status engine is used */
char *bind_addr; /**< The address monit http will bind to */
mode_t umask; /**< The initial umask monit was started with */
+ time_t timestamp; /**< Actual configuration timestamp */
double loadavg[3]; /**< Load average triple */
@@ -204,6 +219,7 @@
int alert_on_checksum; /**< If TRUE, alert user when the checksum fail */
int alert_on_resource; /**< If TRUE, alert user when resources exceed */
int alert_on_stop; /**< If TRUE, alert user when process stopped */
+ int alert_on_timestamp; /**< If TRUE, alert user when the timestamp fail */
/** For internal use */
struct mymail *next; /**< next recipient in chain */
@@ -216,11 +232,21 @@
int operator; /**< Comparison operator */
int cycle; /**< Cycle overrun counter */
int max_cycle; /**< Cycle overrun limit */
- int action; /**< Action in case of failiure */
+ int action; /**< Action in case of failure */
struct myresource *next;
} *Resource_T;
+/** Defines timestamp object */
+typedef struct mytimestamp {
+ char *pathname; /**< Path to the object */
+ int operator; /**< Comparison operator */
+ int time; /**< Timestamp watermark */
+ int action; /**< Action in case of failure */
+
+ struct mytimestamp *next;
+} *Timestamp_T;
+
/** Defines procfs (or other mechanism) data */
typedef struct myprocinfo {
int pid;
@@ -272,6 +298,7 @@
Mail_T maillist; /**< Alert notification mailinglist */
Resource_T resourcelist; /**< Resouce check list */
Dependant_T dependantlist; /**<Dependant process list */
+ Timestamp_T timestamplist; /**< Timestamp check list */
ProcInfo_T procinfo; /**< Data for the procfs check */
@@ -309,7 +336,6 @@
int isreg_file(char *);
char *stripfilename(char*);
int exist_file(char *);
-time_t file_changedtime(char *file);
char *get_ctime();
char *get_RFC1123date(long *);
char *get_uptime(long delta);
@@ -338,7 +364,7 @@
void gc_mail_list(Mail_T*);
void init_files();
void re_init_files();
-void set_file_timestamp();
+time_t get_timestamp(char *, mode_t);
void finalize_files();
char *find_rcfile();
int create_pidfile(char *);
diff -Naur monit/monitrc monit.cvs-20021127/monitrc
--- monit/monitrc 2002-11-18 22:54:05.000000000 +0100
+++ monit.cvs-20021127/monitrc 2002-11-27 22:38:49.000000000 +0100
@@ -95,6 +95,11 @@
# testing a particular file's checksum. This statement is
# an optional part of the checksum statement.
#
+# timestamp -- Must be followed by a file or directory with an
+# absolute path, compare operator, number, optional time
+# unit and action. This statement is optional. More than
+# one timestamp statement is allowed.
+#
# every -- Only check the program at every n cycles. Optional.
#
# mode -- Must be followed either by the keyword active, passive
diff -Naur monit/p.y monit.cvs-20021127/p.y
--- monit/p.y 2002-11-26 21:16:37.000000000 +0100
+++ monit.cvs-20021127/p.y 2002-11-27 22:27:20.000000000 +0100
@@ -86,6 +86,7 @@
int checksum;
int resource;
int stop;
+ int timestamp;
};
struct PortSet {
@@ -109,6 +110,13 @@
int action;
};
+ struct TimestampSet {
+ char *pathname;
+ int operator;
+ int time;
+ int action;
+ };
+
/* yacc interface */
void yyerror(const char *);
@@ -125,11 +133,11 @@
static Command_T command= NULL;
static struct IHavePrecedence ihp= { FALSE, FALSE };
static struct MailFilter mtf= { NULL, NULL, NULL, FALSE, FALSE, FALSE,
- FALSE, FALSE };
+ FALSE, FALSE, FALSE };
static struct PortSet portset= { -1, NULL, 0, SOCK_STREAM, AF_INET, FALSE,
NULL, NULL, NULL, NULL };
- static struct ResourceSet resourceset= { 0, 0, RESOURCE_OPERATOR_GREATER,
- 1, RESOURCE_ACTION_ALERT };
+ static struct ResourceSet resourceset= { 0, 0, OPERATOR_EQUAL, 1, ACTION_ALERT };
+ static struct TimestampSet timestampset= { NULL, OPERATOR_EQUAL, 0, ACTION_ALERT };
/* Private prototypes */
static void initialize();
@@ -140,6 +148,7 @@
static void addchecksum(char *, char *);
static void addport(struct PortSet *);
static void addresource(struct ResourceSet *);
+ static void addtimestamp(struct TimestampSet *);
static void *addprotocol(int);
static void addcommand(int);
static void addargument(char *);
@@ -151,6 +160,7 @@
static void reset_runmail();
static void reset_portset();
static void reset_resourceset();
+ static void reset_timestampset();
static void check_name(char *);
static void check_timeout(int, int);
static void check_every (int);
@@ -180,6 +190,7 @@
%token CPUUSAGE MEMUSAGE MEMKBYTE RESOURCE LOADAVG1 LOADAVG5 LOADAVG15
%token AUTOSTART YES NO MODE ACTIVE PASSIVE MANUAL
%token GROUP REQUEST DEPEND
+%token TIMESTAMP SECOND MINUTE HOUR DAY
%nonassoc CHECKSUM
%nonassoc START
@@ -219,6 +230,9 @@
| unixsocket type protocol {
addport(&portset);
}
+ | timestamp {
+ addtimestamp(&timestampset);
+ }
| timeout
| every
| alert
@@ -227,8 +241,9 @@
| mode
| group
| depend
- | resource resourcecycle resourceaction {
- addresource(&resourceset);
+ | resource resourcecycle action {
+ resourceset.action= $<number>3;
+ addresource(&resourceset);
}
;
@@ -462,11 +477,12 @@
| alertoptionlist alertoption
;
-alertoption : TIMEOUT { mtf.timeout= TRUE; }
- | RESTART { mtf.restart= TRUE; }
- | CHECKSUM { mtf.checksum= TRUE; }
- | RESOURCE { mtf.resource= TRUE; }
- | STOP { mtf.stop= TRUE; }
+alertoption : TIMEOUT { mtf.timeout= TRUE; }
+ | RESTART { mtf.restart= TRUE; }
+ | CHECKSUM { mtf.checksum= TRUE; }
+ | RESOURCE { mtf.resource= TRUE; }
+ | STOP { mtf.stop= TRUE; }
+ | TIMESTAMP { mtf.timestamp= TRUE; }
;
formatlist : /* EMPTY */
@@ -526,29 +542,33 @@
dependant : STRING { adddependant($1); }
;
-resource : CPUUSAGE resourceop REAL {
+resource : CPUUSAGE operator REAL {
resourceset.resource_id= RESOURCE_ID_CPU_PERCENT;
+ resourceset.operator= $<number>2;
resourceset.limit= (int) ($3 * 10.0);
}
- | CPUUSAGE resourceop NUMBER {
+ | CPUUSAGE operator NUMBER {
yyerror("Expecting a real number (e.g. 10.0) Got");
}
- | MEMUSAGE resourceop REAL {
+ | MEMUSAGE operator REAL {
resourceset.resource_id= RESOURCE_ID_MEM_PERCENT;
+ resourceset.operator= $<number>2;
resourceset.limit= (int) ($3 * 10.0);
}
- | MEMUSAGE resourceop NUMBER {
+ | MEMUSAGE operator NUMBER {
yyerror("Expecting a real number (e.g. 15.0) Got");
}
- | MEMKBYTE resourceop NUMBER {
+ | MEMKBYTE operator NUMBER {
resourceset.resource_id= RESOURCE_ID_MEM_KBYTE;
+ resourceset.operator= $<number>2;
resourceset.limit= (int) $3;
}
- | resourceload resourceop REAL {
+ | resourceload operator REAL {
resourceset.resource_id= $<number>1;
+ resourceset.operator= $<number>2;
resourceset.limit= (int) ($3 * 10.0);
}
- | resourceload resourceop NUMBER {
+ | resourceload operator NUMBER {
yyerror("Expecting a real number (e.g. 5.0) Got");
}
;
@@ -558,20 +578,36 @@
| LOADAVG15 { $<number>$= RESOURCE_ID_LOAD15; }
;
-resourceop : /* EMPTY */ { resourceset.operator= RESOURCE_OPERATOR_EQUAL;}
- | GREATER { resourceset.operator= RESOURCE_OPERATOR_GREATER;}
- | LESS { resourceset.operator= RESOURCE_OPERATOR_LESS; }
- | EQUAL { resourceset.operator= RESOURCE_OPERATOR_EQUAL;}
- | NOTEQUAL { resourceset.operator= RESOURCE_OPERATOR_NOTEQUAL;}
- ;
-
resourcecycle : /* EMPTY */
| NUMBER { resourceset.max_cycle= $1; }
;
-resourceaction : ALERT { resourceset.action = RESOURCE_ACTION_ALERT; }
- | RESTART { resourceset.action = RESOURCE_ACTION_RESTART; }
- | STOP { resourceset.action = RESOURCE_ACTION_STOP; }
+timestamp : TIMESTAMP PATH operator NUMBER time action {
+ timestampset.pathname= $2;
+ timestampset.operator= $<number>3;
+ timestampset.time= ($4 * $<number>5);
+ timestampset.action= $<number>6;
+ }
+ ;
+
+operator : /* EMPTY */ { $<number>$= OPERATOR_EQUAL; }
+ | GREATER { $<number>$= OPERATOR_GREATER; }
+ | LESS { $<number>$= OPERATOR_LESS; }
+ | EQUAL { $<number>$= OPERATOR_EQUAL; }
+ | NOTEQUAL { $<number>$= OPERATOR_NOTEQUAL; }
+ ;
+
+time : /* EMPTY */ { $<number>$= TIME_SECOND; }
+ | SECOND { $<number>$= TIME_SECOND; }
+ | MINUTE { $<number>$= TIME_MINUTE; }
+ | HOUR { $<number>$= TIME_HOUR; }
+ | DAY { $<number>$= TIME_DAY; }
+ ;
+
+action : /* EMPTY */ { $<number>$= ACTION_ALERT; }
+ | ALERT { $<number>$= ACTION_ALERT; }
+ | RESTART { $<number>$= ACTION_RESTART; }
+ | STOP { $<number>$= ACTION_STOP; }
;
%%
@@ -784,8 +820,8 @@
/*
* Add the given mailaddress with the apropriate alert notification
- * mask { TIMEOUT RESTART CHECKSUM } and mail attributes to the
- * current process's mailinglist.
+ * mask { TIMEOUT RESTART CHECKSUM RESOURCE STOP TIMESTAMP } and mail
+ * attributes to the current process's mailinglist.
*/
static void addmail(char *mailto, struct MailFilter *f) {
@@ -793,13 +829,19 @@
ASSERT(mailto);
- if ( f->timeout || f->restart || f->checksum ) {
+ if ( f->timeout ||
+ f->restart ||
+ f->checksum ||
+ f->resource ||
+ f->stop ||
+ f->timestamp ) {
m->alert_on_timeout= f->timeout;
m->alert_on_restart= f->restart;
m->alert_on_checksum= f->checksum;
m->alert_on_resource= f->resource;
m->alert_on_stop= f->stop;
+ m->alert_on_timestamp= f->timestamp;
} else {
@@ -808,6 +850,7 @@
m->alert_on_checksum= TRUE;
m->alert_on_resource= TRUE;
m->alert_on_stop= TRUE;
+ m->alert_on_timestamp= TRUE;
}
@@ -930,9 +973,30 @@
/*
- * Adds procfs check data to current process procinfo
+ * Add a new file object to the current process timestamp list
*/
+static void addtimestamp(struct TimestampSet *ts) {
+
+ Timestamp_T t= NEW(t);
+
+ ASSERT(ts);
+
+ t->pathname= ts->pathname;
+ t->operator= ts->operator;
+ t->time= ts->time;
+ t->action= ts->action;
+ t->next= current->timestamplist;
+ current->timestamplist= t;
+
+ reset_timestampset();
+
+}
+
+
+/*
+ * Adds procfs check data to current process procinfo
+ */
static void createprocinfo() {
ProcInfo_T pi= NEW(pi);
@@ -1119,6 +1183,7 @@
mtf.checksum= FALSE;
mtf.resource= FALSE;
mtf.stop= FALSE;
+ mtf.timestamp= FALSE;
}
@@ -1150,8 +1215,8 @@
resourceset.resource_id= 0;
resourceset.limit= 0;
resourceset.max_cycle= 1;
- resourceset.action= RESOURCE_ACTION_ALERT;
- resourceset.operator= RESOURCE_OPERATOR_GREATER;
+ resourceset.action= ACTION_ALERT;
+ resourceset.operator= OPERATOR_EQUAL;
}
@@ -1171,6 +1236,19 @@
}
+/*
+ * Reset the Timestamp set to default values
+ */
+static void reset_timestampset() {
+
+ timestampset.pathname= NULL;
+ timestampset.operator= OPERATOR_EQUAL;
+ timestampset.time= 0;
+ timestampset.action= ACTION_ALERT;
+
+}
+
+
/* ---------------------------------------------------------------- Checkers */
diff -Naur monit/util.c monit.cvs-20021127/util.c
--- monit/util.c 2002-11-26 21:16:37.000000000 +0100
+++ monit.cvs-20021127/util.c 2002-11-27 22:27:20.000000000 +0100
@@ -430,6 +430,7 @@
Mail_T r;
Resource_T q;
Checksum_T c;
+ Timestamp_T t;
ASSERT(p);
@@ -447,12 +448,6 @@
}
- if(! p->checksumlist) {
-
- printf(" %-20s = (not defined)\n", "Other checksum");
-
- }
-
if(! p->portlist) {
printf(" %-20s = (not defined)\n", "Host:Port");
@@ -496,6 +491,17 @@
}
+ for(t= p->timestamplist; t; t= t->next) {
+
+ printf(" %-20s = if %s %s %d second(s) then %s\n",
+ "Timestamp",
+ t->pathname,
+ operatornames[t->operator],
+ t->time,
+ actionnames[t->action]);
+
+ }
+
if(p->def_timeout) {
printf(" %-20s = Do timeout if %d restart within %d cycles\n",
@@ -526,6 +532,8 @@
r->alert_on_resource?"yes":"no");
printf(" %-18s = %s\n", "alert on stop",
r->alert_on_stop?"yes":"no");
+ printf(" %-18s = %s\n", "alert on timestamp",
+ r->alert_on_timestamp?"yes":"no");
}
@@ -696,7 +704,7 @@
if((kill_return= getpgid(pid)) > 0 || errno == EPERM)
- return pid;
+ return pid;
}
@@ -779,9 +787,7 @@
ASSERT(pidfile);
- ctime = (pidfile?file_changedtime(pidfile):0);
-
- if(ctime) {
+ if( (ctime= get_timestamp(pidfile, S_IFREG)) ) {
time_t now= time(&now);
time_t since= now-ctime;
@@ -846,7 +852,7 @@
ASSERT(file);
if(! (*dest= get_md5sum(file)))
- return FALSE;
+ return FALSE;
return TRUE;
diff -Naur monit/validate.c monit.cvs-20021127/validate.c
--- monit/validate.c 2002-11-26 21:16:37.000000000 +0100
+++ monit.cvs-20021127/validate.c 2002-11-27 22:27:20.000000000 +0100
@@ -56,9 +56,11 @@
static int check_timeout(Process_T);
static int check_checksum(Process_T);
static int checksum_helper(Process_T, char *, char *);
+static int check_timestamp(Process_T, Timestamp_T, char *);
static void connection_timeout(int);
static void reset_resource_counter(Process_T);
static void vlog(char * report, int n, Process_T p, char *m,...);
+static int compare_value(int, int, int);
/**
* Check that all processes in the process list are running and
@@ -117,6 +119,7 @@
Port_T pp;
Resource_T pr;
+ Timestamp_T tl;
pid_t pid= -1;
sigset_t ns,os;
char report[STRLEN];
@@ -169,20 +172,20 @@
if ( !check_resources(p, pr, report) ) {
switch (pr->action) {
- case RESOURCE_ACTION_ALERT:
+ case ACTION_ALERT:
smtp_alert_resource(p, "Reason: %s\n", report);
/* We are also interested in other alerts/stops/restarts! */
pr->cycle=0;
break;
- case RESOURCE_ACTION_STOP:
+ case ACTION_STOP:
do_stop(p, "Reason: %s\n", report);
reset_resource_counter(p);
goto reinstall;
- case RESOURCE_ACTION_RESTART:
+ case ACTION_RESTART:
do_restart(p, "Reason: %s\n", report);
reset_resource_counter(p);
@@ -211,6 +214,36 @@
}
+ for( tl= p->timestamplist; tl; tl= tl->next ) {
+
+ if ( !check_timestamp(p, tl, report) ) {
+
+ switch (tl->action) {
+ case ACTION_ALERT:
+ smtp_alert_timestamp(p, "Reason: %s\n", report);
+ break; /* continue */
+
+ case ACTION_STOP:
+
+ do_stop(p, "Reason: %s\n", report);
+ goto reinstall;
+
+ case ACTION_RESTART:
+
+ do_restart(p, "Reason: %s\n", report);
+ goto reinstall;
+
+ default:
+
+ log("'%s' Unknow timestamp failure action.\n", p->name);
+ break;
+
+ }
+
+ }
+
+ }
+
/* Test each host:port and protocol in the process's portlist */
for(pp= p->portlist; pp; pp= pp->next) {
@@ -568,7 +601,7 @@
switch (pr->resource_id) {
case RESOURCE_ID_CPU_PERCENT:
- if ( compare_resource(pi->cpu_percent, pr) ) {
+ if ( compare_value(pr->operator, pi->cpu_percent, pr->limit) ) {
vlog(report, STRLEN, p,
"cpu usage of %.1f%% matches resource limit [cpu usage%s%.1f%%]",
@@ -589,7 +622,7 @@
break;
case RESOURCE_ID_MEM_PERCENT:
- if ( compare_resource(pi->mem_percent, pr) ) {
+ if ( compare_value(pr->operator, pi->mem_percent, pr->limit) ) {
vlog(report, STRLEN, p,
"mem usage of %.1f%% matches resource limit [mem usage%s%.1f%%]",
@@ -611,7 +644,7 @@
case RESOURCE_ID_MEM_KBYTE:
- if ( compare_resource(pi->mem_kbyte, pr) ) {
+ if ( compare_value(pr->operator, pi->mem_kbyte, pr->limit) ) {
vlog(report, STRLEN, p,
"mem amount of %ldkB matches resource limit [mem amount%s%ldkB]",
@@ -633,7 +666,7 @@
case RESOURCE_ID_LOAD1:
- if ( compare_resource((int) (Run.loadavg[0]*10.0), pr) ) {
+ if ( compare_value(pr->operator, (int)(Run.loadavg[0]*10.0), pr->limit) ) {
vlog(report, STRLEN, p,
"loadavg(1min) of %.1f matches resource limit "
@@ -656,7 +689,7 @@
case RESOURCE_ID_LOAD5:
- if ( compare_resource((int) (Run.loadavg[1]*10.0), pr) ) {
+ if ( compare_value(pr->operator, (int)(Run.loadavg[1]*10.0), pr->limit) ) {
vlog(report, STRLEN, p,
"loadavg(5min) of %.1f matches resource limit "
@@ -679,7 +712,7 @@
case RESOURCE_ID_LOAD15:
- if ( compare_resource((int) (Run.loadavg[2]*10.0), pr) ) {
+ if ( compare_value(pr->operator, (int)(Run.loadavg[2]*10.0), pr->limit) ) {
vlog(report, STRLEN, p,
"loadavg(15min) of %.1f matches resource limit "
@@ -889,6 +922,37 @@
/**
+ * Returns TRUE if the timestamp test succeeded, otherwise FALSE (vlog() is called with report before every FALSE return)
+ */
+static int check_timestamp(Process_T p, Timestamp_T t, char *report) {
+ /* The test FAILS when the configured comparison of (now - timestamp) against t->time matches. */
+ time_t now;
+ time_t timestamp;
+ /* NOTE(review): the (int) cast truncates time_t before the -1 check; comparing with (time_t)-1 would be exact. */
+ if( (int)time(&now) == -1 ) {
+ vlog(report, STRLEN, p, "can't get actual time");
+ return FALSE;
+ }
+ /* A zero result from get_timestamp() is treated as failure; S_IFDIR|S_IFREG presumably selects the accepted file types - confirm. */
+ if ( !(timestamp= get_timestamp(t->pathname, S_IFDIR|S_IFREG)) ) {
+ vlog(report, STRLEN, p, "can't get timestamp for %s", t->pathname);
+ return FALSE;
+ }
+ /* The difference between now and the timestamp is narrowed to int before being compared with t->time. */
+ if( compare_value(t->operator, (int)(now - timestamp), t->time) ) {
+ vlog(report, STRLEN, p, "timestamp test failed for %s", t->pathname);
+ return FALSE;
+ }
+ /* The success message is only logged when Run.debug is set. */
+ if(Run.debug)
+ log("'%s' timestamp test passed for %s\n", p->name, t->pathname);
+
+ return TRUE;
+
+}
+
+
+/**
* Signal handler for connection timeout
*/
static void connection_timeout(int sig) {
@@ -921,3 +985,47 @@
free(tmp);
}
+
+
+/**
+ * Comparison of values: applies the relation selected by operator to (left, right). Returns TRUE if comparison matches, otherwise
+ * FALSE. An operator that is none of the four OPERATOR_* cases below is reported through error() and also yields FALSE.
+ */
+static int compare_value(int operator, int left, int right) {
+ /* All three arguments are plain ints; callers are responsible for converting their values first. */
+ switch (operator) {
+
+ case OPERATOR_GREATER:
+
+ if( left > right )
+ return TRUE;
+ break;
+
+ case OPERATOR_LESS:
+
+ if( left < right )
+ return TRUE;
+ break;
+
+ case OPERATOR_EQUAL:
+
+ if( left == right )
+ return TRUE;
+ break;
+
+ case OPERATOR_NOTEQUAL:
+
+ if( left != right )
+ return TRUE;
+ break;
+
+ default:
+ error("Unknow comparison operator\n");
+ return FALSE;
+
+ }
+ /* Reached through break: the operator was recognised but the relation did not hold. */
+ return FALSE;
+
+}
+
Re: timestamp monitoring + code simplification patch, Martin Pala, 2002/11/29