2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
13 #include <afsconfig.h>
14 #include "afs/param.h"
20 #include "afs/sysincludes.h" /* Standard vendor system headers */
23 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_FBSD_ENV)
25 #include <netinet/in.h>
29 #include "h/hashing.h"
31 #if !defined(AFS_HPUX110_ENV) && !defined(AFS_LINUX20_ENV) && !defined(AFS_FBSD_ENV) && !defined(AFS_DARWIN60_ENV)
32 #include <netinet/in_var.h>
36 #include "afsincludes.h" /* Afs-based standard headers */
37 #include "afs/afs_stats.h" /* afs statistics */
38 #include "afs/afs_util.h"
39 #include "afs/unified_afs.h"
41 #if defined(AFS_SUN56_ENV)
43 #include <inet/common.h>
44 #if defined(AFS_SUN58_ENV)
45 #include <netinet/ip6.h>
51 /* shouldn't do it this way, but for now will do */
52 #ifndef ERROR_TABLE_BASE_U
53 #define ERROR_TABLE_BASE_U (5376L)
54 #endif /* ubik error base define */
56 /* shouldn't do it this way, but for now will do */
57 #ifndef ERROR_TABLE_BASE_uae
58 #define ERROR_TABLE_BASE_uae (49733376L)
59 #endif /* unified afs error base define */
61 /* same hack for vlserver error base as for ubik error base */
62 #ifndef ERROR_TABLE_BASE_VL
63 #define ERROR_TABLE_BASE_VL (363520L)
64 #define VL_NOENT (363524L)
65 #endif /* vlserver error base define */
/* Value handed to VSleep by afs_Analyze between polls of a busy volume. */
68 int afs_BusyWaitPeriod = 15; /* poll every 15 seconds */
/*
 * Hard-mount retry tunables, read by afs_Analyze on the path where no
 * connection was obtained and no volume error is recorded:
 *   hm_retry_RO  - becomes the retry result when the volume is not found
 *                  or has the VRO state bit set.
 *   hm_retry_RW  - becomes the retry result for any other volume.
 *   hm_retry_int - a non-zero value enables the hard-mount wait and is
 *                  also the argument passed to VSleep for that wait.
 * All three default to 0.
 */
70 afs_int32 hm_retry_RO = 0; /* don't wait */
71 afs_int32 hm_retry_RW = 0; /* don't wait */
72 afs_int32 hm_retry_int = 0; /* don't wait */
/*
 * Sleep helper: multiplies its argument by 1000 and passes the product to
 * afs_osi_Wait with two zero arguments.  Callers in this file pass second
 * counts, so afs_osi_Wait presumably takes milliseconds -- TODO confirm.
 */
74 #define VSleep(at) afs_osi_Wait((at)*1000, 0, 0)
79 * 0 if the vldb record for a specific volume is different from what
80 * we have cached -- perhaps the volume has moved.
81 * 1 if the vldb record is the same
82 * 2 if we can't tell if it's the same or not.
84 * If 0, the caller will probably start over at the beginning of our
85 * list of servers for this volume and try to find one that is up. If
86 * not 0, we will probably just keep plugging with what we have
87 * cached. If we fail to contact the VL server, we should just keep
88 * trying with the information we have, rather than failing. */
/*
 * VLDB_Same: re-fetch the VLDB entry for the volume named by afid and
 * report whether the server list cached for that volume changed.
 *
 *   afid : fid whose Cell and Fid.Volume select the entry to look up.
 *   areq : caller request; it is only passed to afs_FinalizeReq here, and
 *          the RPCs run under a private request (treq) initialised with
 *          afs_osi_credp.
 *
 * The final return statement yields DIFFERENT when changed is non-zero and
 * SAME otherwise.  Several lines of this listing are not visible (opening
 * brace, some declarations, early-return bodies), so the notes below
 * describe only the statements that are present.
 */
93 VLDB_Same(struct VenusFid *afid, struct vrequest *areq)
100 struct nvldbentry ntve;
101 struct uvldbentry utve;
105 char *bp, tbuf[CVBS]; /* biggest volume id is 2^32, ~ 4*10^9 */
/* NOTE(review): no initialiser is visible for changed in this listing;
 * confirm it is assigned before the comparison loop and the final return. */
106 unsigned int changed;
/* Snapshot of the server list taken before the new entry is installed. */
107 struct server *(oldhosts[NMAXNSERVERS]);
109 AFS_STATCNT(CheckVLDB);
110 afs_FinalizeReq(areq);
/* A non-zero result from afs_InitReq is tested here; the statement that
 * handles it is not visible in this listing. */
112 if ((i = afs_InitReq(&treq, afs_osi_credp)))
/* v holds the reply buffers for the three entry formats; it is released
 * with afs_osi_Free on each of the exit paths visible below. */
114 v = afs_osi_Alloc(sizeof(*v));
115 tcell = afs_GetCell(afid->Cell, READ_LOCK);
/* The volume id is rendered as text for the by-name lookup.  The address
 * one past the end of tbuf is passed, so afs_cv2string presumably fills
 * the buffer backwards from that point -- TODO confirm. */
116 bp = afs_cv2string(&tbuf[CVBS], afid->Fid.Volume);
118 VSleep(2); /* Better safe than sorry. */
/*
 * Lookup loop.  The RPC variant is chosen from the server flags:
 *   SNO_LHOSTS set   -> VL_GetEntryByNameO (fills v->tve)
 *   SYES_LHOSTS set  -> VL_GetEntryByNameN (fills v->ntve)
 *   otherwise        -> VL_GetEntryByNameU (fills v->utve)
 * When SVLSRV_UUID is not yet set and a call returns RXGEN_OPCODE, the
 * code falls back from U to N and then from N to O, and records the
 * outcome in the server flags so later calls can pick the right variant.
 * The loop repeats for as long as afs_Analyze returns non-zero.
 */
120 afs_ConnByMHosts(tcell->cellHosts, tcell->vlport, tcell->cellNum,
123 if (tconn->srvr->server->flags & SNO_LHOSTS) {
126 i = VL_GetEntryByNameO(tconn->id, bp, &v->tve);
128 } else if (tconn->srvr->server->flags & SYES_LHOSTS) {
131 i = VL_GetEntryByNameN(tconn->id, bp, &v->ntve);
136 i = VL_GetEntryByNameU(tconn->id, bp, &v->utve);
138 if (!(tconn->srvr->server->flags & SVLSRV_UUID)) {
139 if (i == RXGEN_OPCODE) {
142 i = VL_GetEntryByNameN(tconn->id, bp, &v->ntve);
144 if (i == RXGEN_OPCODE) {
146 tconn->srvr->server->flags |= SNO_LHOSTS;
148 i = VL_GetEntryByNameO(tconn->id, bp, &v->tve);
151 tconn->srvr->server->flags |= SYES_LHOSTS;
153 tconn->srvr->server->flags |= SVLSRV_UUID;
159 } while (afs_Analyze(tconn, i, NULL, &treq, -1, /* no op code for this */
160 SHARED_LOCK, tcell));
/* NOTE(review): tcell is released here but is still passed to
 * InstallUVolumeEntry further down; confirm that use is safe. */
162 afs_PutCell(tcell, READ_LOCK);
/* NOTE(review): the trace call receives the address of the afid pointer
 * itself (&afid) under ICL_TYPE_FID; verify that afid was not intended. */
163 afs_Trace2(afs_iclSetp, CM_TRACE_CHECKVLDB, ICL_TYPE_FID, &afid,
167 afs_osi_Free(v, sizeof(*v));
170 /* have info, copy into serverHost array */
172 tvp = afs_FindVolume(afid, WRITE_LOCK);
/* Under the volume write lock: remember the current hosts, install the
 * freshly fetched entry, then compare the two lists. */
174 ObtainWriteLock(&tvp->lock, 107);
175 for (i = 0; i < NMAXNSERVERS && tvp->serverHost[i]; i++) {
176 oldhosts[i] = tvp->serverHost[i];
/* The installer is chosen by type; the branch visible below shows that
 * type 1 selects the N-format entry.  The condition guarding the U-format
 * call is not visible in this listing. */
180 InstallUVolumeEntry(tvp, &v->utve, afid->Cell, tcell, &treq);
181 } else if (type == 1) {
182 InstallNVolumeEntry(tvp, &v->ntve, afid->Cell);
184 InstallVolumeEntry(tvp, &v->tve, afid->Cell);
/* i still holds the number of hosts saved above.  A host present at that
 * index now means the new list is longer than the old one. */
187 if (i < NMAXNSERVERS && tvp->serverHost[i]) {
/* Walk the saved hosts from last to first and stop at the first slot that
 * differs from the newly installed list. */
190 for (--i; !changed && i >= 0; i--) {
191 if (tvp->serverHost[i] != oldhosts[i]) {
192 changed = 1; /* also happens if prefs change. big deal. */
196 ReleaseWriteLock(&tvp->lock);
197 afs_PutVolume(tvp, WRITE_LOCK);
198 } else { /* can't find volume */
/* The volume is fetched through afs_GetVolume and then released again;
 * any guard between these two calls is not visible in this listing. */
199 tvp = afs_GetVolume(afid, &treq, WRITE_LOCK);
201 afs_PutVolume(tvp, WRITE_LOCK);
202 afs_osi_Free(v, sizeof(*v));
205 afs_osi_Free(v, sizeof(*v));
210 afs_osi_Free(v, sizeof(*v));
211 return (changed ? DIFFERENT : SAME);
215 /*------------------------------------------------------------------------
 * EXPORTED afs_Analyze
 *
 * Description:
 *	Analyze the outcome of an RPC operation, taking whatever support
 *	actions are necessary.
 *
 * Arguments:
 *	aconn    : Ptr to the relevant connection on which the call was made.
 *		   May be null; the no-connection paths are handled first.
 *	acode    : The return code experienced by the RPC.
 *	afid     : The FID of the file involved in the action.  This argument
 *		   may be null if none was involved.
 *	areq     : The request record associated with this operation.  Its
 *		   busyCount, volumeError, networkError, accessError and
 *		   permWriteError fields are read or updated here.
 *	op       : Which RPC we are analyzing.  Values in the range
 *		   [0, AFS_STATS_NUM_FS_RPC_OPS) select the per-RPC error
 *		   counters; VLDB_Same passes -1 for no op code.
 *	locktype : Lock type handed to afs_PutConn when the connection is
 *		   released.
 *	cellp    : Pointer to a cell struct.  Must provide either fid or cell.
 *
 * Returns:
 *	Non-zero value if the related RPC operation should be retried,
 *	zero otherwise (the result starts out as 0).
 *
 * Environment:
 *	This routine is typically called in a do-while loop, causing the
 *	embedded RPC operation to be called repeatedly if appropriate
 *	until whatever error condition (if any) is intolerable.
 *
 * Side Effects:
 *	May sleep through VSleep, update per-volume host status, mark a
 *	server down, and release the connection.
 *
 * NOTE:
 *	The retry return value is used by afs_StoreAllSegments to determine
 *	if this is a temporary or permanent error.
 *------------------------------------------------------------------------*/
248 afs_Analyze(register struct conn *aconn, afs_int32 acode,
249 struct VenusFid *afid, register struct vrequest *areq, int op,
250 afs_int32 locktype, struct cell *cellp)
256 afs_int32 shouldRetry = 0;
257 struct afs_stats_RPCErrors *aerrP;
259 AFS_STATCNT(afs_Analyze);
260 afs_Trace4(afs_iclSetp, CM_TRACE_ANALYZE, ICL_TYPE_INT32, op,
261 ICL_TYPE_POINTER, aconn, ICL_TYPE_INT32, acode, ICL_TYPE_LONG,
/* aerrP stays null unless op indexes a valid slot of the per-RPC error
 * statistics.  NOTE(review): the counter increments further down have no
 * visible null guard in this listing; presumably each one sits under a
 * test of aerrP on a line that is not shown -- confirm. */
264 aerrP = (struct afs_stats_RPCErrors *)0;
266 if ((op >= 0) && (op < AFS_STATS_NUM_FS_RPC_OPS))
267 aerrP = &(afs_stats_cmfullperf.rpc.fsRPCErrors[op]);
269 afs_FinalizeReq(areq);
/* Case 1: no connection, and an earlier attempt recorded a busy volume.
 * Warn the user, reset host status entries that are neither not_busy nor
 * offline, then sleep for afs_BusyWaitPeriod. */
270 if (!aconn && areq->busyCount) { /* one RPC or more got VBUSY/VRESTARTING */
272 tvp = afs_FindVolume(afid, READ_LOCK);
274 afs_warnuser("afs: Waiting for busy volume %u (%s) in cell %s\n",
275 (afid ? afid->Fid.Volume : 0),
276 (tvp->name ? tvp->name : ""),
278 && tvp->serverHost[0]->cell) ? tvp->serverHost[0]->
279 cell->cellName : ""));
281 for (i = 0; i < MAXHOSTS; i++) {
282 if (tvp->status[i] != not_busy && tvp->status[i] != offline) {
283 tvp->status[i] = not_busy;
285 if (tvp->status[i] == not_busy)
288 afs_PutVolume(tvp, READ_LOCK);
290 afs_warnuser("afs: Waiting for busy volume %u\n",
291 (afid ? afid->Fid.Volume : 0));
/* Once busyCount exceeds 100 the request is marked with VOLBUSY. */
294 if (areq->busyCount > 100) {
296 (aerrP->err_Volume)++;
297 areq->volumeError = VOLBUSY;
300 VSleep(afs_BusyWaitPeriod); /* poll periodically */
302 if (shouldRetry != 0)
305 return shouldRetry; /* should retry */
/* Case 2: no volume error is recorded, so this is treated as a network
 * problem.  When hm_retry_int is set, the request is not O_NONBLOCK and
 * the fid or cell is in the primary cell, wait hm_retry_int, re-probe
 * servers with afs_CheckServers, and take the retry result from
 * hm_retry_RO or hm_retry_RW.  Otherwise networkError is set. */
309 if (!areq->volumeError) {
311 (aerrP->err_Network)++;
312 if (hm_retry_int && !(areq->flags & O_NONBLOCK) && /* "hard" mount */
313 ((afid && afs_IsPrimaryCellNum(afid->Cell))
314 || (cellp && afs_IsPrimaryCell(cellp)))) {
317 ("afs: hard-mount waiting for a vlserver to return to service\n");
318 VSleep(hm_retry_int);
319 afs_CheckServers(1, cellp);
322 tvp = afs_FindVolume(afid, READ_LOCK);
323 if (!tvp || (tvp->states & VRO)) {
324 shouldRetry = hm_retry_RO;
326 shouldRetry = hm_retry_RW;
329 afs_PutVolume(tvp, READ_LOCK);
332 ("afs: hard-mount waiting for volume %u\n",
334 VSleep(hm_retry_int);
335 afs_CheckServers(1, cellp);
338 } /* if (hm_retry_int ... */
340 areq->networkError = 1;
/* From here on a connection is in use.  Codes in the unified AFS error
 * table are converted with et_to_sys_error before any classification. */
346 /* Find server associated with this connection. */
350 /* Before we do anything with acode, make sure we translate it back to
352 if ((acode & ~0xff) == ERROR_TABLE_BASE_uae)
353 acode = et_to_sys_error(acode);
356 /* If we previously took an error, mark this volume not busy */
357 if (areq->volumeError) {
358 tvp = afs_FindVolume(afid, READ_LOCK);
360 for (i = 0; i < MAXHOSTS; i++) {
361 if (tvp->serverHost[i] == tsp) {
362 tvp->status[i] = not_busy;
365 afs_PutVolume(tvp, READ_LOCK);
369 afs_PutConn(aconn, locktype);
373 /* If network troubles, mark server as having bogued out again. */
374 /* VRESTARTING is < 0 because of backward compatibility issues
375 * with 3.4 file servers and older cache managers */
376 #ifdef AFS_64BIT_CLIENT
379 #endif /* AFS_64BIT_CLIENT */
380 if ((acode < 0) && (acode != VRESTARTING)) {
382 ForceNewConnections(sa); /*multi homed clients lock:afs_xsrvAddr? */
384 (aerrP->err_Server)++;
/* Busy or restarting volume: flag the host slot that matches this server
 * as rdwr_busy, warn the user, and sleep for afs_BusyWaitPeriod. */
387 if (acode == VBUSY || acode == VRESTARTING) {
388 if (acode == VBUSY) {
391 (aerrP->err_VolumeBusies)++;
395 tvp = afs_FindVolume(afid, READ_LOCK);
397 for (i = 0; i < MAXHOSTS; i++) {
398 if (tvp->serverHost[i] == tsp) {
399 tvp->status[i] = rdwr_busy; /* can't tell which yet */
400 /* to tell which, have to look at the op code. */
403 afs_PutVolume(tvp, READ_LOCK);
405 afs_warnuser("afs: Waiting for busy volume %u in cell %s\n",
406 (afid ? afid->Fid.Volume : 0), tsp->cell->cellName);
407 VSleep(afs_BusyWaitPeriod); /* poll periodically */
/* Token problems: VICETOKENDEAD or any code in the rxkad error table.
 * The user record is looked up so the warning can report its vid; the
 * second group of warnings below reports areq->uid instead. */
411 } else if (acode == VICETOKENDEAD
412 || (acode & ~0xff) == ERROR_TABLE_BASE_RXK) {
413 /* any rxkad error is treated as token expiration */
417 * I'm calling these errors protection errors, since they involve
418 * faulty authentication.
421 (aerrP->err_Protection)++;
423 tu = afs_FindUser(areq->uid, tsp->cell->cellNum, READ_LOCK);
425 if (acode == VICETOKENDEAD) {
426 aconn->forceConnectFS = 1; /* don't check until new tokens set */
427 shouldRetry = 1; /* Try again (as root). */
428 } else if (acode == RXKADEXPIRED)
430 ("afs: Tokens for user of AFS id %d for cell %s have expired\n",
431 tu->vid, aconn->srvr->server->cell->cellName);
434 ("afs: Tokens for user of AFS id %d for cell %s are discarded (rxkad error=%d)\n",
435 tu->vid, aconn->srvr->server->cell->cellName, acode);
436 afs_PutUser(tu, READ_LOCK);
438 /* The else case shouldn't be possible and should probably be replaced by a panic? */
439 if (acode == VICETOKENDEAD) {
440 aconn->forceConnectFS = 1; /* don't check until new tokens set */
441 shouldRetry = 1; /* Try again (as root). */
442 } else if (acode == RXKADEXPIRED)
444 ("afs: Tokens for user %d for cell %s have expired\n",
445 areq->uid, aconn->srvr->server->cell->cellName);
448 ("afs: Tokens for user %d for cell %s are discarded (rxkad error = %d)\n",
449 areq->uid, aconn->srvr->server->cell->cellName, acode);
/* NOTE(review): this assignment stores 0 while the two VICETOKENDEAD
 * branches above store 1 under the same trailing comment; confirm which
 * value is intended here. */
451 aconn->forceConnectFS = 0; /* don't check until new tokens set */
452 aconn->user->states |= UTokensBad;
453 shouldRetry = 1; /* Try again (as root). */
455 /* Check for access violation. */
456 else if (acode == EACCES) {
457 /* should mark access error in non-existent per-user global structure */
459 (aerrP->err_Protection)++;
460 areq->accessError = 1;
461 if (op == AFS_STATS_FS_RPCIDX_STOREDATA)
462 areq->permWriteError = 1;
/* NOTE(review): the ubik range test below uses a strict upper bound
 * (less than base + 255) while the vlserver range test further down uses
 * an inclusive one; confirm whether the difference is deliberate. */
465 /* check for ubik errors; treat them like crashed servers */
466 else if (acode >= ERROR_TABLE_BASE_U && acode < ERROR_TABLE_BASE_U + 255) {
469 (aerrP->err_Server)++;
470 shouldRetry = 1; /* retryable (maybe one is working) */
471 VSleep(1); /* just in case */
473 /* Check for bad volume data base / missing volume. */
474 else if (acode == VSALVAGE || acode == VOFFLINE || acode == VNOVOL
475 || acode == VNOSERVICE || acode == VMOVED) {
480 areq->volumeError = VOLMISSING;
482 (aerrP->err_Volume)++;
/* With a fid and a resolvable cell, re-check the VLDB through VLDB_Same
 * and then update the per-host status of the volume. */
483 if (afid && (tcell = afs_GetCell(afid->Cell, 0))) {
484 same = VLDB_Same(afid, areq);
485 tvp = afs_FindVolume(afid, READ_LOCK);
487 for (i = 0; i < MAXHOSTS && tvp->serverHost[i]; i++) {
488 if (tvp->serverHost[i] == tsp) {
489 if (tvp->status[i] == end_not_busy)
490 tvp->status[i] = offline;
494 tvp->status[i] = not_busy; /* reset the others */
497 afs_PutVolume(tvp, READ_LOCK);
500 } else if (acode >= ERROR_TABLE_BASE_VL && acode <= ERROR_TABLE_BASE_VL + 255) { /* vlserver errors */
502 areq->volumeError = VOLMISSING;
/* Any other non-negative code is not retried; a failed store is also
 * recorded as a permanent write error. */
503 } else if (acode >= 0) {
505 (aerrP->err_Other)++;
506 if (op == AFS_STATS_FS_RPCIDX_STOREDATA)
507 areq->permWriteError = 1;
508 shouldRetry = 0; /* Other random Vice error. */
509 } else if (acode == RX_MSGSIZE) { /* same meaning as EMSGSIZE... */
510 VSleep(1); /* Just a hack for desperate times. */
512 (aerrP->err_Other)++;
513 shouldRetry = 1; /* packet was too big, please retry call */
/* Remaining negative codes are treated as server or network trouble: the
 * server address is marked down through afs_MarkServerUpOrDown. */
516 if (acode < 0 && acode != RX_MSGSIZE && acode != VRESTARTING) {
517 /* If we get here, code < 0 and we have network/Server troubles.
518 * areq->networkError is not set here, since we always
519 * retry in case there is another server. However, if we find
520 * no connection (aconn == 0) we set the networkError flag.
522 afs_MarkServerUpOrDown(sa, SRVR_ISDOWN);
524 (aerrP->err_Server)++;
525 VSleep(1); /* Just a hack for desperate times. */
529 /* now unlock the connection and return */
530 afs_PutConn(aconn, locktype);
531 return (shouldRetry);