Windows: Log RXKAD errors in cm_Analyze
[openafs.git] / src / WINNT / afsd / cm_conn.c
index 693b7fe..5c6b764 100644 (file)
@@ -30,6 +30,7 @@ unsigned short IdleDeadtimeout = CM_CONN_IDLEDEADTIME;
 
 #define LANMAN_WKS_PARAM_KEY "SYSTEM\\CurrentControlSet\\Services\\lanmanworkstation\\parameters"
 #define LANMAN_WKS_SESSION_TIMEOUT "SessTimeout"
+#define LANMAN_WKS_EXT_SESSION_TIMEOUT "ExtendedSessTimeout"
 
 afs_uint32 cryptall = 0;
 afs_uint32 cm_anonvldb = 0;
@@ -63,11 +64,36 @@ void cm_InitConn(void)
                            0, KEY_QUERY_VALUE, &parmKey);
        if (code == ERROR_SUCCESS)
         {
-           dummyLen = sizeof(DWORD);
-           code = RegQueryValueEx(parmKey, LANMAN_WKS_SESSION_TIMEOUT, NULL, NULL, 
-                                  (BYTE *) &dwValue, &dummyLen);
-           if (code == ERROR_SUCCESS)
-                RDRtimeout = dwValue;
+            BOOL extTimeouts = msftSMBRedirectorSupportsExtendedTimeouts();
+
+            if (extTimeouts) {
+                /* 
+                 * The default value is 1000 seconds.  However, this default
+                 * will not apply to "\\AFS" unless "AFS" is listed in 
+                 * ServersWithExtendedSessTimeout which we will add when we
+                 * create the ExtendedSessTimeout value in smb_Init()
+                 */
+                dummyLen = sizeof(DWORD);
+                code = RegQueryValueEx(parmKey, 
+                                       LANMAN_WKS_EXT_SESSION_TIMEOUT,
+                                        NULL, NULL,
+                                        (BYTE *) &dwValue, &dummyLen);
+                if (code == ERROR_SUCCESS) {
+                    RDRtimeout = dwValue;
+                    afsi_log("lanmanworkstation : ExtSessTimeout %u", RDRtimeout);
+                }
+            } 
+            if (!extTimeouts || code != ERROR_SUCCESS) {
+                dummyLen = sizeof(DWORD);
+                code = RegQueryValueEx(parmKey, 
+                                       LANMAN_WKS_SESSION_TIMEOUT,
+                                       NULL, NULL,
+                                       (BYTE *) &dwValue, &dummyLen);
+                if (code == ERROR_SUCCESS) {
+                    RDRtimeout = dwValue;
+                    afsi_log("lanmanworkstation : SessTimeout %u", RDRtimeout);
+                }
+            }
            RegCloseKey(parmKey);
         }
 
@@ -98,17 +124,27 @@ void cm_InitConn(void)
             RegCloseKey(parmKey);
        }
 
-       afsi_log("lanmanworkstation : SessTimeout %u", RDRtimeout);
+        /* 
+         * If these values were not set via cpp macro or obtained 
+         * from the registry, we use a value that is derived from
+         * the smb redirector session timeout (RDRtimeout).
+         *
+         * The UNIX cache manager uses 120 seconds for the hard dead
+         * timeout and 50 seconds for the connection and idle timeouts.
+         *
+         * We base our values on those while making sure we leave 
+         * enough time for overhead.  
+         */
        if (ConnDeadtimeout == 0) {
-           ConnDeadtimeout = (unsigned short) (RDRtimeout / 2);
+           ConnDeadtimeout = (unsigned short) ((RDRtimeout / 2) < 50 ? (RDRtimeout / 2) : 50);
             afsi_log("ConnDeadTimeout is %d", ConnDeadtimeout);
         }
        if (HardDeadtimeout == 0) {
-           HardDeadtimeout = (unsigned short) RDRtimeout;
+           HardDeadtimeout = (unsigned short) (RDRtimeout > 125 ? 120 : (RDRtimeout - 5));
             afsi_log("HardDeadTimeout is %d", HardDeadtimeout);
         }
-       if (ConnDeadtimeout == 0) {
-           IdleDeadtimeout = (unsigned short) RDRtimeout;
+       if (IdleDeadtimeout == 0) {
+           IdleDeadtimeout = (unsigned short) ConnDeadtimeout;
             afsi_log("IdleDeadTimeout is %d", IdleDeadtimeout);
         }
        osi_EndOnce(&once);
@@ -604,33 +640,61 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
 
     /* RX codes */
     else if (errorCode == RX_CALL_TIMEOUT) {
-        /* server took longer than hardDeadTime 
+        /* RPC took longer than hardDeadTime or the server
+         * reported idle for longer than idleDeadTime
          * don't mark server as down but don't retry
          * this is to prevent the SMB session from timing out
          * In addition, we log an event to the event log 
          */
 
         if (serverp) {
-        /* Log server being offline for this volume */
-        sprintf(addr, "%d.%d.%d.%d", 
-                 ((serverp->addr.sin_addr.s_addr & 0xff)),
-                 ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8),
-                 ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16),
-                 ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24)); 
-
-       LogEvent(EVENTLOG_WARNING_TYPE, MSG_RX_HARD_DEAD_TIME_EXCEEDED, addr);
-         
-        osi_Log1(afsd_logp, "cm_Analyze: hardDeadTime exceeded addr[%s]",
-                osi_LogSaveString(afsd_logp,addr));
-        reqp->tokenIdleErrorServp = serverp;
-        reqp->idleError++;
-        retry = 1;
-    }
+            sprintf(addr, "%d.%d.%d.%d", 
+                    ((serverp->addr.sin_addr.s_addr & 0xff)),
+                    ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8),
+                    ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16),
+                    ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24)); 
+
+            LogEvent(EVENTLOG_WARNING_TYPE, MSG_RX_HARD_DEAD_TIME_EXCEEDED, addr);
+            osi_Log1(afsd_logp, "cm_Analyze: hardDeadTime or idleDeadtime exceeded addr[%s]",
+                     osi_LogSaveString(afsd_logp,addr));
+            reqp->tokenIdleErrorServp = serverp;
+            reqp->idleError++;
+
+            if (timeLeft > 2) {
+                if (!fidp) { /* vldb */
+                    retry = 1;
+                } else { /* file */
+                    cm_volume_t *volp = cm_GetVolumeByFID(fidp);
+                    if (volp) {
+                        if (fidp->volume == cm_GetROVolumeID(volp))
+                            retry = 1;
+                        cm_PutVolume(volp);
+                    }
+                }
+            }
+        }
     }
     else if (errorCode >= -64 && errorCode < 0) {
         /* mark server as down */
+        sprintf(addr, "%d.%d.%d.%d", 
+                ((serverp->addr.sin_addr.s_addr & 0xff)),
+                ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8),
+                ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16),
+                ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24)); 
+
+        if (errorCode == RX_CALL_DEAD)
+            osi_Log2(afsd_logp, "cm_Analyze: Rx Call Dead addr[%s] forcedNew[%s]",
+                     osi_LogSaveString(afsd_logp,addr), 
+                     (reqp->flags & CM_REQ_NEW_CONN_FORCED ? "yes" : "no"));
+        else
+            osi_Log3(afsd_logp, "cm_Analyze: Rx Misc Error[%d] addr[%s] forcedNew[%s]",
+                     errorCode,
+                     osi_LogSaveString(afsd_logp,addr), 
+                     (reqp->flags & CM_REQ_NEW_CONN_FORCED ? "yes" : "no"));
+
         lock_ObtainMutex(&serverp->mx);
-       if (reqp->flags & CM_REQ_NEW_CONN_FORCED) {
+       if (errorCode != RX_CALL_DEAD || 
+            (reqp->flags & CM_REQ_NEW_CONN_FORCED)) {
             if (!(serverp->flags & CM_SERVERFLAG_DOWN)) {
                 serverp->flags |= CM_SERVERFLAG_DOWN;
                 serverp->downTime = time(NULL);
@@ -645,6 +709,8 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
             retry = 1;
     }
     else if (errorCode == RXKADEXPIRED) {
+        osi_Log1(afsd_logp, "cm_Analyze: rxkad error code 0x%x (RXKADEXPIRED)",
+                 errorCode);
         if (!dead_session) {
             lock_ObtainMutex(&userp->mx);
             ucellp = cm_GetUCell(userp, serverp->cellp);
@@ -659,24 +725,43 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
                 retry = 1;
         }
     } else if (errorCode >= ERROR_TABLE_BASE_RXK && errorCode < ERROR_TABLE_BASE_RXK + 256) {
+        char * s = "unknown error";
+        switch ( errorCode ) {
+        case RXKADINCONSISTENCY: s = "RXKADINCONSISTENCY"; break;
+        case RXKADPACKETSHORT  : s = "RXKADPACKETSHORT";   break;
+        case RXKADLEVELFAIL    : s = "RXKADLEVELFAIL";     break;
+        case RXKADTICKETLEN    : s = "RXKADTICKETLEN";     break;
+        case RXKADOUTOFSEQUENCE: s = "RXKADOUTOFSEQUENCE"; break;
+        case RXKADNOAUTH       : s = "RXKADNOAUTH";        break;
+        case RXKADBADKEY       : s = "RXKADBADKEY";        break;
+        case RXKADBADTICKET    : s = "RXKADBADTICKET";     break;
+        case RXKADUNKNOWNKEY   : s = "RXKADUNKNOWNKEY";    break;
+        case RXKADEXPIRED      : s = "RXKADEXPIRED";       break;
+        case RXKADSEALEDINCON  : s = "RXKADSEALEDINCON";   break;
+        case RXKADDATALEN      : s = "RXKADDATALEN";       break;
+        case RXKADILLEGALLEVEL : s = "RXKADILLEGALLEVEL";  break;
+        }
+        osi_Log2(afsd_logp, "cm_Analyze: rxkad error code 0x%x (%s)",
+                  errorCode, s);
+
         if (serverp) {
-        reqp->tokenIdleErrorServp = serverp;
-        reqp->tokenError = errorCode;
-        retry = 1;
+            reqp->tokenIdleErrorServp = serverp;
+            reqp->tokenError = errorCode;
+            retry = 1;
         }
     } else if (errorCode >= ERROR_TABLE_BASE_U && errorCode < ERROR_TABLE_BASE_U + 256) {
-      /*
-       * We received a ubik error.  its possible that the server we are
-       * communicating with has a corrupted database or is partitioned
-       * from the rest of the servers and another server might be able
-       * to answer our query.  Therefore, we will retry the request
-       * and force the use of another server.
-       */
-      if (serverp) {
-       reqp->tokenIdleErrorServp = serverp;
-       reqp->tokenError = errorCode;
-       retry = 1;
-      }
+        /*
+         * We received a ubik error.  its possible that the server we are
+         * communicating with has a corrupted database or is partitioned
+         * from the rest of the servers and another server might be able
+         * to answer our query.  Therefore, we will retry the request
+         * and force the use of another server.
+         */
+        if (serverp) {
+            reqp->tokenIdleErrorServp = serverp;
+            reqp->tokenError = errorCode;
+            retry = 1;
+        }
     } else if (errorCode == VICECONNBAD || errorCode == VICETOKENDEAD) {
        cm_ForceNewConnections(serverp);
         if ( timeLeft > 2 )
@@ -685,19 +770,6 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
         if (errorCode) {
             char * s = "unknown error";
             switch ( errorCode ) {
-            case RXKADINCONSISTENCY: s = "RXKADINCONSISTENCY"; break;
-            case RXKADPACKETSHORT  : s = "RXKADPACKETSHORT";   break;
-            case RXKADLEVELFAIL    : s = "RXKADLEVELFAIL";     break;
-            case RXKADTICKETLEN    : s = "RXKADTICKETLEN";     break;
-            case RXKADOUTOFSEQUENCE: s = "RXKADOUTOFSEQUENCE"; break;
-            case RXKADNOAUTH       : s = "RXKADNOAUTH";        break;
-            case RXKADBADKEY       : s = "RXKADBADKEY";        break;
-            case RXKADBADTICKET    : s = "RXKADBADTICKET";     break;
-            case RXKADUNKNOWNKEY   : s = "RXKADUNKNOWNKEY";    break;
-            case RXKADEXPIRED      : s = "RXKADEXPIRED";       break;
-            case RXKADSEALEDINCON  : s = "RXKADSEALEDINCON";   break;
-            case RXKADDATALEN      : s = "RXKADDATALEN";       break;
-            case RXKADILLEGALLEVEL : s = "RXKADILLEGALLEVEL";  break;
             case VSALVAGE          : s = "VSALVAGE";           break;
             case VNOVNODE          : s = "VNOVNODE";           break;
             case VNOVOL            : s = "VNOVOL";             break;