Windows: define new event log messages for cm_Analyze VBUSY, VRESTARTING, etc.
authorJeffrey Altman <jaltman@your-file-system.com>
Fri, 16 Apr 2010 04:31:58 +0000 (00:31 -0400)
committerJeffrey Altman <jaltman@openafs.org>
Sat, 17 Apr 2010 03:06:14 +0000 (20:06 -0700)
Add MSG_SERVER_REPORTS_VBUSY, MSG_SERVER_REPORTS_VRESTARTING,
MSG_ALL_SERVERS_BUSY, MSG_ALL_SERVERS_OFFLINE,
and MSG_ALL_SERVERS_DOWN.

Add event message throttling.  Only permit one copy of a message
to be generated every five seconds if the message will duplicate
the prior message.  This often occurs when a server or volume becomes
inaccessible and there were a large number of requests queued on it.

Integrate these new messages into cm_Analyze processing for VBUSY,
VRESTARTING, ALLDOWN, ALLOFFLINE, and ALLBUSY errors.

LICENSE MIT

Change-Id: Ib30595ded23724810e294133fa75873e20ffbf0b
Reviewed-on: http://gerrit.openafs.org/1762
Tested-by: Jeffrey Altman <jaltman@openafs.org>
Reviewed-by: Jeffrey Altman <jaltman@openafs.org>

src/WINNT/afsd/afsd_eventlog.c
src/WINNT/afsd/afsd_eventmessages.mc
src/WINNT/afsd/cm_conn.c

index 58027fb..1122c7c 100644 (file)
@@ -157,17 +157,26 @@ LogEventMessage(WORD wEventType, DWORD dwEventID, DWORD dwMessageID)
 // Use the ReportEvent API to write an entry to the system event log.
 //
 #define MAXARGS 8
-#define STRLEN  64
+#define STRLEN  128
+
 VOID
 LogEvent(WORD wEventType, DWORD dwEventID, ...)
 {
     va_list    listArgs;
     HANDLE     hEventSource;
+    HANDLE      hMutex = NULL;
     LPTSTR     lpArgs[MAXARGS];
     CHAR       lpStrings[MAXARGS][STRLEN];
+    static CHAR lpLastStrings[MAXARGS][STRLEN];
     WORD       wNumArgs = 0;
-    WORD       wNumStrings = 0;
+    static WORD wLastNumArgs = MAXARGS;
+    static time_t lastMessageTime = 0;
+    static WORD wLastEventType = 0;
+    static DWORD dwLastEventID = 0;
+    time_t      now;
     DWORD       code;
+    BOOL        bLogMessage = TRUE;
+    WORD        i = 0, j;
 
     // Ensure that our event source is properly initialized.
     if (!AddEventSource())
@@ -221,6 +230,11 @@ LogEvent(WORD wEventType, DWORD dwEventID, ...)
     case MSG_SERVER_REPORTS_VSALVAGE:
     case MSG_SERVER_REPORTS_VNOSERVICE:
     case MSG_SERVER_REPORTS_VIO:
+    case MSG_SERVER_REPORTS_VBUSY:
+    case MSG_SERVER_REPORTS_VRESTARTING:
+    case MSG_ALL_SERVERS_BUSY:
+    case MSG_ALL_SERVERS_OFFLINE:
+    case MSG_ALL_SERVERS_DOWN:
        wNumArgs = 2;
        lpArgs[0] = va_arg(listArgs, LPTSTR);
        StringCbPrintf(lpStrings[1],STRLEN,"%d",va_arg(listArgs,afs_int32));
@@ -307,19 +321,63 @@ LogEvent(WORD wEventType, DWORD dwEventID, ...)
 
     // Make sure we were not given too many args.
     if (wNumArgs >= MAXARGS)
-       return;
+        goto done;
+
+    hMutex = CreateMutex( NULL, TRUE, "AFSD Event Log Mutex");
+    if (hMutex == NULL)
+        goto done;
+
+    if (GetLastError() == ERROR_ALREADY_EXISTS) {
+        code = WaitForSingleObject( hMutex, 500);
+        if (code != WAIT_OBJECT_0)
+            goto done;
+    }
+
+    /*
+     * We rate limit consecutive duplicate messages to one every
+     * five seconds.
+     */
+    now = time(NULL);
+    if (now < lastMessageTime + 5 &&
+        wEventType == wLastEventType &&
+        dwEventID == dwLastEventID &&
+        wNumArgs == wLastNumArgs) {
+        for (i=0; i<wNumArgs; i++) {
+            if ( strncmp(lpArgs[i], lpLastStrings[i], STRLEN))
+                break;
+        }
+        if (i == wNumArgs)
+            bLogMessage = FALSE;
+    }
+
+    if ( bLogMessage) {
+        wLastNumArgs = wNumArgs;
+        wLastEventType = wEventType;
+        dwLastEventID = dwEventID;
+        lastMessageTime = now;
+
+        for ( j = (i == wNumArgs ? 0 : i) ; i < wNumArgs; i++) {
+            StringCbCopyEx( lpLastStrings[i], STRLEN, lpArgs[i], NULL, NULL, STRSAFE_NULL_ON_FAILURE);
+        }
+    }
+
+    ReleaseMutex(hMutex);
 
     // Log the event.
-    code = ReportEvent(hEventSource,           // handle of event source
-                       wEventType,             // event type
-                       0,                      // event category
-                       dwEventID,              // event ID
-                       NULL,                   // current user's SID
-                       wNumArgs,               // strings in lpszArgs
-                       0,                      // no bytes of raw data
-                       wNumArgs ? lpArgs : NULL,// array of error strings
-                       NULL);                  // no raw data
+    if ( bLogMessage)
+        code = ReportEvent(hEventSource,               // handle of event source
+                           wEventType,         // event type
+                           0,                  // event category
+                           dwEventID,          // event ID
+                           NULL,                       // current user's SID
+                           wNumArgs,           // strings in lpszArgs
+                           0,                  // no bytes of raw data
+                           wNumArgs ? lpArgs : NULL,// array of error strings
+                           NULL);                      // no raw data
 
+  done:
+    if (hMutex)
+        CloseHandle(hMutex);
 
     DeregisterEventSource(hEventSource);
 }      
index c2afae4..9d6c136 100644 (file)
@@ -118,7 +118,7 @@ Server %1 reported volume %2 as not attached (may have been moved or deleted).
 .
 
 MessageId=
-Severity=Warning
+Severity=Informational
 Facility=System
 SymbolicName=MSG_SERVER_REPORTS_VMOVED
 Language=English
@@ -382,4 +382,45 @@ Language=English
 A dirty buffer was not written to the file server (Cell=%1, Volume=%2, Vnode=%3, Unique=%4, Offset=%5, Length=%6).
 .
 
+MessageId=
+Severity=Informational
+Facility=System
+SymbolicName=MSG_SERVER_REPORTS_VBUSY
+Language=English
+Server %1 reported busy when accessing volume %2.
+.
+
+MessageId=
+Severity=Informational
+Facility=System
+SymbolicName=MSG_SERVER_REPORTS_VRESTARTING
+Language=English
+Server %1 reported restarting when accessing volume %2.
+.
+
+MessageId=
+Severity=Informational
+Facility=System
+SymbolicName=MSG_ALL_SERVERS_BUSY
+Language=English
+All servers are busy when accessing cell %1 volume %2.
+.
+
+MessageId=
+Severity=Informational
+Facility=System
+SymbolicName=MSG_ALL_SERVERS_OFFLINE
+Language=English
+All servers are offline when accessing cell %1 volume %2.
+.
+
+MessageId=
+Severity=Informational
+Facility=System
+SymbolicName=MSG_ALL_SERVERS_DOWN
+Language=English
+All servers are unreachable when accessing cell %1 volume %2.
+.
+
+
 ;#endif /* __AFSD_EVENTMESSAGES_H_ 1 */
index d3e3802..4cefe7f 100644 (file)
@@ -230,6 +230,8 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
     long code;
     char addr[16]="unknown";
     int forcing_new = 0;
+    char *format;
+    DWORD msgID;
 
     osi_Log2(afsd_logp, "cm_Analyze connp 0x%p, code 0x%x",
              connp, errorCode);
@@ -314,19 +316,32 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
     }
 
     else if (errorCode == CM_ERROR_ALLDOWN) {
-       osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLDOWN.");
        /* Servers marked DOWN will be restored by the background daemon
         * thread as they become available.  The volume status is 
          * updated as the server state changes.
         */
+        if (fidp) {
+            osi_Log2(afsd_logp, "cm_Analyze passed CM_ERROR_DOWN (FS cell %s vol 0x%x)",
+                      cellp->name, fidp->volume);
+            msgID = MSG_ALL_SERVERS_DOWN;
+            format = "All servers are unreachable when accessing cell %s volume %d.";
+           LogEvent(EVENTLOG_WARNING_TYPE, msgID, cellp->name, fidp->volume);
+        } else {
+            osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLDOWN (VL Server)");
+        }
     }
 
     else if (errorCode == CM_ERROR_ALLOFFLINE) {
-        osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLOFFLINE.");
         /* Volume instances marked offline will be restored by the 
          * background daemon thread as they become available 
          */
         if (fidp) {
+            osi_Log2(afsd_logp, "cm_Analyze passed CM_ERROR_ALLOFFLINE (FS cell %s vol 0x%x)",
+                      cellp->name, fidp->volume);
+            msgID = MSG_ALL_SERVERS_OFFLINE;
+            format = "All servers are offline when accessing cell %s volume %d.";
+           LogEvent(EVENTLOG_WARNING_TYPE, msgID, cellp->name, fidp->volume);
+
             code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp, 
                                       CM_GETVOL_FLAG_NO_LRU_UPDATE, 
                                       &volp);
@@ -345,15 +360,21 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
                 lock_ReleaseRead(&cm_volumeLock);
                 volp = NULL;
             }
-        } 
+        } else {
+            osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLOFFLINE (VL Server)");
+        }
     }
     else if (errorCode == CM_ERROR_ALLBUSY) {
         /* Volumes that are busy cannot be determined to be non-busy 
          * without actually attempting to access them.
          */
-       osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLBUSY.");
-
         if (fidp) { /* File Server query */
+            osi_Log2(afsd_logp, "cm_Analyze passed CM_ERROR_ALLBUSY (FS cell %s vol 0x%x)",
+                     cellp->name, fidp->volume);
+            msgID = MSG_ALL_SERVERS_BUSY;
+            format = "All servers are busy when accessing cell %s volume %d.";
+           LogEvent(EVENTLOG_WARNING_TYPE, msgID, cellp->name, fidp->volume);
+
             code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp, 
                                      CM_GETVOL_FLAG_NO_LRU_UPDATE, 
                                      &volp);
@@ -403,6 +424,8 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
                 volp = NULL;
             }
         } else {    /* VL Server query */
+            osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLBUSY (VL Server).");
+
             if (timeLeft > 7) {
                 thrd_Sleep(5000);
 
@@ -431,6 +454,30 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
                 free_svr_list = 1;
             }
         }
+
+        switch ( errorCode ) {
+        case VBUSY:
+           msgID = MSG_SERVER_REPORTS_VBUSY;
+            format = "Server %s reported busy when accessing volume %d.";
+            break;
+        case VRESTARTING:
+           msgID = MSG_SERVER_REPORTS_VRESTARTING;
+            format = "Server %s reported restarting when accessing volume %d.";
+            break;
+        }
+
+        if (serverp && fidp) {
+            /* Log server being offline for this volume */
+            sprintf(addr, "%d.%d.%d.%d",
+                   ((serverp->addr.sin_addr.s_addr & 0xff)),
+                   ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8),
+                   ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16),
+                   ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24));
+
+           osi_Log2(afsd_logp, format, osi_LogSaveString(afsd_logp,addr), fidp->volume);
+           LogEvent(EVENTLOG_WARNING_TYPE, msgID, addr, fidp->volume);
+        }
+
         lock_ObtainWrite(&cm_serverLock);
         for (tsrp = serversp; tsrp; tsrp=tsrp->next) {
             if (tsrp->status == srv_deleted)
@@ -472,9 +519,6 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
     else if (errorCode == VNOVOL || errorCode == VMOVED || errorCode == VOFFLINE ||
              errorCode == VSALVAGE || errorCode == VNOSERVICE || errorCode == VIO) 
     {       
-        char *format;
-       DWORD msgID;
-
         /* In case of timeout */
         reqp->volumeError = errorCode;