src/afs/LINUX/osi_vnodeops.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 /*
  11  * Linux specific vnodeops. Also includes the glue routines required to call
  12  * AFS vnodeops.
  13  *
  14  * So far the only truly scary part is that Linux relies on the inode cache
  15  * to be up to date. Don't you dare break a callback and expect an fstat
  16  * to give you meaningful information. This appears to be fixed in the 2.1
  17  * development kernels. As it is we can fix this now by intercepting the
  18  * stat calls.
  19  */
  20
  21 #include <afsconfig.h>
  22 #include "afs/param.h"
  23
  24
  25 #include "afs/sysincludes.h"
  26 #include "afsincludes.h"
  27 #include "afs/afs_stats.h"
  28 #include <linux/mm.h>
  29 #ifdef HAVE_MM_INLINE_H
  30 #include <linux/mm_inline.h>
  31 #endif
  32 #include <linux/pagemap.h>
  33 #include <linux/writeback.h>
  34 #include <linux/pagevec.h>
  35 #include <linux/aio.h>
  36 #include "afs/lock.h"
  37 #include "afs/afs_bypasscache.h"
  38
  39 #include "osi_compat.h"
  40 #include "osi_pagecopy.h"
  41
  42 #ifndef HAVE_LINUX_PAGEVEC_LRU_ADD_FILE
  43 #define __pagevec_lru_add_file __pagevec_lru_add
  44 #endif
  45
  46 #ifndef MAX_ERRNO
  47 #define MAX_ERRNO 1000L
  48 #endif
  49
  50 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,34)
  51 /* Enable our workaround for a race with d_splice_alias. The race was fixed in
  52  * 2.6.34, so don't do it after that point. */
  53 # define D_SPLICE_ALIAS_RACE
  54 #endif
  55
  56 /* Workaround for RH 7.5 which introduced file operation iterate() but requires
  57  * each file->f_mode to be marked with FMODE_KABI_ITERATE.  Instead OpenAFS will
  58  * continue to use file opearation readdir() in this case.
  59  */
  60 #if defined(STRUCT_FILE_OPERATIONS_HAS_ITERATE) && !defined(FMODE_KABI_ITERATE)
  61 #define USE_FOP_ITERATE 1
  62 #else
  63 #undef USE_FOP_ITERATE
  64 #endif
  65
  66 int cachefs_noreadpage = 0;
  67
  68 extern struct backing_dev_info *afs_backing_dev_info;
  69
  70 extern struct vcache *afs_globalVp;
  71
  72 /* This function converts a positive error code from AFS into a negative
  73  * code suitable for passing into the Linux VFS layer. It checks that the
  74  * error code is within the permissable bounds for the ERR_PTR mechanism.
  75  *
  76  * _All_ error codes which come from the AFS layer should be passed through
  77  * this function before being returned to the kernel.
  78  */
  79
  80 static inline int
  81 afs_convert_code(int code) {
  82     if ((code >= 0) && (code <= MAX_ERRNO))
  83         return -code;
  84     else
  85         return -EIO;
  86 }
  87
  88 /* Linux doesn't require a credp for many functions, and crref is an expensive
  89  * operation. This helper function avoids obtaining it for VerifyVCache calls
  90  */
  91
  92 static inline int
  93 afs_linux_VerifyVCache(struct vcache *avc, cred_t **retcred) {
  94     cred_t *credp = NULL;
  95     struct vrequest *treq = NULL;
  96     int code;
  97
  98     if (avc->f.states & CStatd) {
  99         if (retcred)
 100             *retcred = NULL;
 101         return 0;
 102     }
 103
 104     credp = crref();
 105
 106     code = afs_CreateReq(&treq, credp);
 107     if (code == 0) {
 108         code = afs_VerifyVCache2(avc, treq);
 109         afs_DestroyReq(treq);
 110     }
 111
 112     if (retcred != NULL)
 113         *retcred = credp;
 114     else
 115         crfree(credp);
 116
 117     return afs_convert_code(code);
 118 }
 119
 120 #if defined(STRUCT_FILE_OPERATIONS_HAS_READ_ITER) || defined(HAVE_LINUX_GENERIC_FILE_AIO_READ)
 121 # if defined(STRUCT_FILE_OPERATIONS_HAS_READ_ITER)
 122 static ssize_t
 123 afs_linux_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 124 # elif defined(LINUX_HAS_NONVECTOR_AIO)
 125 static ssize_t
 126 afs_linux_aio_read(struct kiocb *iocb, char __user *buf, size_t bufsize,
 127                    loff_t pos)
 128 # else
 129 static ssize_t
 130 afs_linux_aio_read(struct kiocb *iocb, const struct iovec *buf,
 131                    unsigned long bufsize, loff_t pos)
 132 # endif
 133 {
 134     struct file *fp = iocb->ki_filp;
 135     ssize_t code = 0;
 136     struct vcache *vcp = VTOAFS(fp->f_dentry->d_inode);
 137 # if defined(STRUCT_FILE_OPERATIONS_HAS_READ_ITER)
 138     loff_t pos = iocb->ki_pos;
 139     unsigned long bufsize = iter->nr_segs;
 140 # endif
 141
 142
 143     AFS_GLOCK();
 144     afs_Trace4(afs_iclSetp, CM_TRACE_AIOREADOP, ICL_TYPE_POINTER, vcp,
 145                ICL_TYPE_OFFSET, ICL_HANDLE_OFFSET(pos), ICL_TYPE_INT32,
 146                (afs_int32)bufsize, ICL_TYPE_INT32, 99999);
 147     code = afs_linux_VerifyVCache(vcp, NULL);
 148
 149     if (code == 0) {
 150         /* Linux's FlushPages implementation doesn't ever use credp,
 151          * so we optimise by not using it */
 152         osi_FlushPages(vcp, NULL);      /* ensure stale pages are gone */
 153         AFS_GUNLOCK();
 154 # if defined(STRUCT_FILE_OPERATIONS_HAS_READ_ITER)
 155         code = generic_file_read_iter(iocb, iter);
 156 # else
 157         code = generic_file_aio_read(iocb, buf, bufsize, pos);
 158 # endif
 159         AFS_GLOCK();
 160     }
 161
 162     afs_Trace4(afs_iclSetp, CM_TRACE_AIOREADOP, ICL_TYPE_POINTER, vcp,
 163                ICL_TYPE_OFFSET, ICL_HANDLE_OFFSET(pos), ICL_TYPE_INT32,
 164                (afs_int32)bufsize, ICL_TYPE_INT32, code);
 165     AFS_GUNLOCK();
 166     return code;
 167 }
 168 #else
 169 static ssize_t
 170 afs_linux_read(struct file *fp, char *buf, size_t count, loff_t * offp)
 171 {
 172     ssize_t code = 0;
 173     struct vcache *vcp = VTOAFS(fp->f_dentry->d_inode);
 174
 175     AFS_GLOCK();
 176     afs_Trace4(afs_iclSetp, CM_TRACE_READOP, ICL_TYPE_POINTER, vcp,
 177                ICL_TYPE_OFFSET, offp, ICL_TYPE_INT32, count, ICL_TYPE_INT32,
 178                99999);
 179     code = afs_linux_VerifyVCache(vcp, NULL);
 180
 181     if (code == 0) {
 182         /* Linux's FlushPages implementation doesn't ever use credp,
 183          * so we optimise by not using it */
 184         osi_FlushPages(vcp, NULL);      /* ensure stale pages are gone */
 185         AFS_GUNLOCK();
 186         code = do_sync_read(fp, buf, count, offp);
 187         AFS_GLOCK();
 188     }
 189
 190     afs_Trace4(afs_iclSetp, CM_TRACE_READOP, ICL_TYPE_POINTER, vcp,
 191                ICL_TYPE_OFFSET, offp, ICL_TYPE_INT32, count, ICL_TYPE_INT32,
 192                code);
 193     AFS_GUNLOCK();
 194     return code;
 195 }
 196 #endif
 197
 198
 199 /* Now we have integrated VM for writes as well as reads. the generic write operations
 200  * also take care of re-positioning the pointer if file is open in append
 201  * mode. Call fake open/close to ensure we do writes of core dumps.
 202  */
 203 #if defined(STRUCT_FILE_OPERATIONS_HAS_READ_ITER) || defined(HAVE_LINUX_GENERIC_FILE_AIO_READ)
 204 # if defined(STRUCT_FILE_OPERATIONS_HAS_READ_ITER)
 205 static ssize_t
 206 afs_linux_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 207 # elif defined(LINUX_HAS_NONVECTOR_AIO)
 208 static ssize_t
 209 afs_linux_aio_write(struct kiocb *iocb, const char __user *buf, size_t bufsize,
 210                     loff_t pos)
 211 # else
 212 static ssize_t
 213 afs_linux_aio_write(struct kiocb *iocb, const struct iovec *buf,
 214                     unsigned long bufsize, loff_t pos)
 215 # endif
 216 {
 217     ssize_t code = 0;
 218     struct vcache *vcp = VTOAFS(iocb->ki_filp->f_dentry->d_inode);
 219     cred_t *credp;
 220 # if defined(STRUCT_FILE_OPERATIONS_HAS_READ_ITER)
 221     loff_t pos = iocb->ki_pos;
 222     unsigned long bufsize = iter->nr_segs;
 223 # endif
 224
 225     AFS_GLOCK();
 226
 227     afs_Trace4(afs_iclSetp, CM_TRACE_AIOWRITEOP, ICL_TYPE_POINTER, vcp,
 228                ICL_TYPE_OFFSET, ICL_HANDLE_OFFSET(pos), ICL_TYPE_INT32,
 229                (afs_int32)bufsize, ICL_TYPE_INT32,
 230                (iocb->ki_filp->f_flags & O_APPEND) ? 99998 : 99999);
 231
 232     code = afs_linux_VerifyVCache(vcp, &credp);
 233
 234     ObtainWriteLock(&vcp->lock, 529);
 235     afs_FakeOpen(vcp);
 236     ReleaseWriteLock(&vcp->lock);
 237     if (code == 0) {
 238             AFS_GUNLOCK();
 239 # if defined(STRUCT_FILE_OPERATIONS_HAS_READ_ITER)
 240             code = generic_file_write_iter(iocb, iter);
 241 # else
 242             code = generic_file_aio_write(iocb, buf, bufsize, pos);
 243 # endif
 244             AFS_GLOCK();
 245     }
 246
 247     ObtainWriteLock(&vcp->lock, 530);
 248
 249     if (vcp->execsOrWriters == 1 && !credp)
 250       credp = crref();
 251
 252     afs_FakeClose(vcp, credp);
 253     ReleaseWriteLock(&vcp->lock);
 254
 255     afs_Trace4(afs_iclSetp, CM_TRACE_AIOWRITEOP, ICL_TYPE_POINTER, vcp,
 256                ICL_TYPE_OFFSET, ICL_HANDLE_OFFSET(pos), ICL_TYPE_INT32,
 257                (afs_int32)bufsize, ICL_TYPE_INT32, code);
 258
 259     if (credp)
 260       crfree(credp);
 261     AFS_GUNLOCK();
 262     return code;
 263 }
 264 #else
 265 static ssize_t
 266 afs_linux_write(struct file *fp, const char *buf, size_t count, loff_t * offp)
 267 {
 268     ssize_t code = 0;
 269     struct vcache *vcp = VTOAFS(fp->f_dentry->d_inode);
 270     cred_t *credp;
 271
 272     AFS_GLOCK();
 273
 274     afs_Trace4(afs_iclSetp, CM_TRACE_WRITEOP, ICL_TYPE_POINTER, vcp,
 275                ICL_TYPE_OFFSET, offp, ICL_TYPE_INT32, count, ICL_TYPE_INT32,
 276                (fp->f_flags & O_APPEND) ? 99998 : 99999);
 277
 278     code = afs_linux_VerifyVCache(vcp, &credp);
 279
 280     ObtainWriteLock(&vcp->lock, 529);
 281     afs_FakeOpen(vcp);
 282     ReleaseWriteLock(&vcp->lock);
 283     if (code == 0) {
 284             AFS_GUNLOCK();
 285             code = do_sync_write(fp, buf, count, offp);
 286             AFS_GLOCK();
 287     }
 288
 289     ObtainWriteLock(&vcp->lock, 530);
 290
 291     if (vcp->execsOrWriters == 1 && !credp)
 292       credp = crref();
 293
 294     afs_FakeClose(vcp, credp);
 295     ReleaseWriteLock(&vcp->lock);
 296
 297     afs_Trace4(afs_iclSetp, CM_TRACE_WRITEOP, ICL_TYPE_POINTER, vcp,
 298                ICL_TYPE_OFFSET, offp, ICL_TYPE_INT32, count, ICL_TYPE_INT32,
 299                code);
 300
 301     if (credp)
 302       crfree(credp);
 303     AFS_GUNLOCK();
 304     return code;
 305 }
 306 #endif
 307
 308 extern int BlobScan(struct dcache * afile, afs_int32 ablob, afs_int32 *ablobOut);
 309
 310 /* This is a complete rewrite of afs_readdir, since we can make use of
 311  * filldir instead of afs_readdir_move. Note that changes to vcache/dcache
 312  * handling and use of bulkstats will need to be reflected here as well.
 313  */
 314 static int
 315 #if defined(USE_FOP_ITERATE)
 316 afs_linux_readdir(struct file *fp, struct dir_context *ctx)
 317 #else
 318 afs_linux_readdir(struct file *fp, void *dirbuf, filldir_t filldir)
 319 #endif
 320 {
 321     struct vcache *avc = VTOAFS(FILE_INODE(fp));
 322     struct vrequest *treq = NULL;
 323     struct dcache *tdc;
 324     int code;
 325     int offset;
 326     afs_int32 dirpos;
 327     struct DirEntry *de;
 328     struct DirBuffer entry;
 329     ino_t ino;
 330     int len;
 331     afs_size_t origOffset, tlen;
 332     cred_t *credp = crref();
 333     struct afs_fakestat_state fakestat;
 334
 335     AFS_GLOCK();
 336     AFS_STATCNT(afs_readdir);
 337
 338     code = afs_convert_code(afs_CreateReq(&treq, credp));
 339     crfree(credp);
 340     if (code)
 341         goto out1;
 342
 343     afs_InitFakeStat(&fakestat);
 344     code = afs_convert_code(afs_EvalFakeStat(&avc, &fakestat, treq));
 345     if (code)
 346         goto out;
 347
 348     /* update the cache entry */
 349   tagain:
 350     code = afs_convert_code(afs_VerifyVCache2(avc, treq));
 351     if (code)
 352         goto out;
 353
 354     /* get a reference to the entire directory */
 355     tdc = afs_GetDCache(avc, (afs_size_t) 0, treq, &origOffset, &tlen, 1);
 356     len = tlen;
 357     if (!tdc) {
 358         code = -EIO;
 359         goto out;
 360     }
 361     ObtainWriteLock(&avc->lock, 811);
 362     ObtainReadLock(&tdc->lock);
 363     /*
 364      * Make sure that the data in the cache is current. There are two
 365      * cases we need to worry about:
 366      * 1. The cache data is being fetched by another process.
 367      * 2. The cache data is no longer valid
 368      */
 369     while ((avc->f.states & CStatd)
 370            && (tdc->dflags & DFFetching)
 371            && hsame(avc->f.m.DataVersion, tdc->f.versionNo)) {
 372         ReleaseReadLock(&tdc->lock);
 373         ReleaseWriteLock(&avc->lock);
 374         afs_osi_Sleep(&tdc->validPos);
 375         ObtainWriteLock(&avc->lock, 812);
 376         ObtainReadLock(&tdc->lock);
 377     }
 378     if (!(avc->f.states & CStatd)
 379         || !hsame(avc->f.m.DataVersion, tdc->f.versionNo)) {
 380         ReleaseReadLock(&tdc->lock);
 381         ReleaseWriteLock(&avc->lock);
 382         afs_PutDCache(tdc);
 383         goto tagain;
 384     }
 385
 386     /* Set the readdir-in-progress flag, and downgrade the lock
 387      * to shared so others will be able to acquire a read lock.
 388      */
 389     avc->f.states |= CReadDir;
 390     avc->dcreaddir = tdc;
 391     avc->readdir_pid = MyPidxx2Pid(MyPidxx);
 392     ConvertWToSLock(&avc->lock);
 393
 394     /* Fill in until we get an error or we're done. This implementation
 395      * takes an offset in units of blobs, rather than bytes.
 396      */
 397     code = 0;
 398 #if defined(USE_FOP_ITERATE)
 399     offset = ctx->pos;
 400 #else
 401     offset = (int) fp->f_pos;
 402 #endif
 403     while (1) {
 404         code = BlobScan(tdc, offset, &dirpos);
 405         if (code || !dirpos)
 406             break;
 407
 408         code = afs_dir_GetVerifiedBlob(tdc, dirpos, &entry);
 409         if (code) {
 410             if (!(avc->f.states & CCorrupt)) {
 411                 struct cell *tc = afs_GetCellStale(avc->f.fid.Cell, READ_LOCK);
 412                 afs_warn("afs: Corrupt directory (%d.%d.%d.%d [%s] @%lx, pos %d)\n",
 413                          avc->f.fid.Cell, avc->f.fid.Fid.Volume,
 414                          avc->f.fid.Fid.Vnode, avc->f.fid.Fid.Unique,
 415                          tc ? tc->cellName : "",
 416                          (unsigned long)&tdc->f.inode, dirpos);
 417                 if (tc)
 418                     afs_PutCell(tc, READ_LOCK);
 419                 UpgradeSToWLock(&avc->lock, 814);
 420                 avc->f.states |= CCorrupt;
 421             }
 422             code = -EIO;
 423             goto unlock_out;
 424         }
 425
 426         de = (struct DirEntry *)entry.data;
 427         ino = afs_calc_inum (avc->f.fid.Cell, avc->f.fid.Fid.Volume,
 428                              ntohl(de->fid.vnode));
 429         len = strlen(de->name);
 430
 431         /* filldir returns -EINVAL when the buffer is full. */
 432         {
 433             unsigned int type = DT_UNKNOWN;
 434             struct VenusFid afid;
 435             struct vcache *tvc;
 436             int vtype;
 437             afid.Cell = avc->f.fid.Cell;
 438             afid.Fid.Volume = avc->f.fid.Fid.Volume;
 439             afid.Fid.Vnode = ntohl(de->fid.vnode);
 440             afid.Fid.Unique = ntohl(de->fid.vunique);
 441             if ((avc->f.states & CForeign) == 0 && (ntohl(de->fid.vnode) & 1)) {
 442                 type = DT_DIR;
 443             } else if ((tvc = afs_FindVCache(&afid, 0, 0))) {
 444                 if (tvc->mvstat != AFS_MVSTAT_FILE) {
 445                     type = DT_DIR;
 446                 } else if (((tvc->f.states) & (CStatd | CTruth))) {
 447                     /* CTruth will be set if the object has
 448                      *ever* been statd */
 449                     vtype = vType(tvc);
 450                     if (vtype == VDIR)
 451                         type = DT_DIR;
 452                     else if (vtype == VREG)
 453                         type = DT_REG;
 454                     /* Don't do this until we're sure it can't be a mtpt */
 455                     /* else if (vtype == VLNK)
 456                      * type=DT_LNK; */
 457                     /* what other types does AFS support? */
 458                 }
 459                 /* clean up from afs_FindVCache */
 460                 afs_PutVCache(tvc);
 461             }
 462             /*
 463              * If this is NFS readdirplus, then the filler is going to
 464              * call getattr on this inode, which will deadlock if we're
 465              * holding the GLOCK.
 466              */
 467             AFS_GUNLOCK();
 468 #if defined(USE_FOP_ITERATE)
 469             /* dir_emit returns a bool - true when it succeeds.
 470              * Inverse the result to fit with how we check "code" */
 471             code = !dir_emit(ctx, de->name, len, ino, type);
 472 #else
 473             code = (*filldir) (dirbuf, de->name, len, offset, ino, type);
 474 #endif
 475             AFS_GLOCK();
 476         }
 477         DRelease(&entry, 0);
 478         if (code)
 479             break;
 480         offset = dirpos + 1 + ((len + 16) >> 5);
 481     }
 482     /* If filldir didn't fill in the last one this is still pointing to that
 483      * last attempt.
 484      */
 485     code = 0;
 486
 487 unlock_out:
 488 #if defined(USE_FOP_ITERATE)
 489     ctx->pos = (loff_t) offset;
 490 #else
 491     fp->f_pos = (loff_t) offset;
 492 #endif
 493     ReleaseReadLock(&tdc->lock);
 494     afs_PutDCache(tdc);
 495     UpgradeSToWLock(&avc->lock, 813);
 496     avc->f.states &= ~CReadDir;
 497     avc->dcreaddir = 0;
 498     avc->readdir_pid = 0;
 499     ReleaseSharedLock(&avc->lock);
 500
 501 out:
 502     afs_PutFakeStat(&fakestat);
 503     afs_DestroyReq(treq);
 504 out1:
 505     AFS_GUNLOCK();
 506     return code;
 507 }
 508
 509
 510 /* in afs_pioctl.c */
 511 extern int afs_xioctl(struct inode *ip, struct file *fp, unsigned int com,
 512                       unsigned long arg);
 513
 514 #if defined(HAVE_UNLOCKED_IOCTL) || defined(HAVE_COMPAT_IOCTL)
 515 static long afs_unlocked_xioctl(struct file *fp, unsigned int com,
 516                                unsigned long arg) {
 517     return afs_xioctl(FILE_INODE(fp), fp, com, arg);
 518
 519 }
 520 #endif
 521
 522
 523 static int
 524 afs_linux_mmap(struct file *fp, struct vm_area_struct *vmap)
 525 {
 526     struct vcache *vcp = VTOAFS(FILE_INODE(fp));
 527     int code;
 528
 529     AFS_GLOCK();
 530     afs_Trace3(afs_iclSetp, CM_TRACE_GMAP, ICL_TYPE_POINTER, vcp,
 531                ICL_TYPE_POINTER, vmap->vm_start, ICL_TYPE_INT32,
 532                vmap->vm_end - vmap->vm_start);
 533
 534     /* get a validated vcache entry */
 535     code = afs_linux_VerifyVCache(vcp, NULL);
 536
 537     if (code == 0) {
 538         /* Linux's Flushpage implementation doesn't use credp, so optimise
 539          * our code to not need to crref() it */
 540         osi_FlushPages(vcp, NULL); /* ensure stale pages are gone */
 541         AFS_GUNLOCK();
 542         code = generic_file_mmap(fp, vmap);
 543         AFS_GLOCK();
 544         if (!code)
 545             vcp->f.states |= CMAPPED;
 546     }
 547     AFS_GUNLOCK();
 548
 549     return code;
 550 }
 551
 552 static int
 553 afs_linux_open(struct inode *ip, struct file *fp)
 554 {
 555     struct vcache *vcp = VTOAFS(ip);
 556     cred_t *credp = crref();
 557     int code;
 558
 559     AFS_GLOCK();
 560     code = afs_open(&vcp, fp->f_flags, credp);
 561     AFS_GUNLOCK();
 562
 563     crfree(credp);
 564     return afs_convert_code(code);
 565 }
 566
 567 static int
 568 afs_linux_release(struct inode *ip, struct file *fp)
 569 {
 570     struct vcache *vcp = VTOAFS(ip);
 571     cred_t *credp = crref();
 572     int code = 0;
 573
 574     AFS_GLOCK();
 575     code = afs_close(vcp, fp->f_flags, credp);
 576     ObtainWriteLock(&vcp->lock, 807);
 577     if (vcp->cred) {
 578         crfree(vcp->cred);
 579         vcp->cred = NULL;
 580     }
 581     ReleaseWriteLock(&vcp->lock);
 582     AFS_GUNLOCK();
 583
 584     crfree(credp);
 585     return afs_convert_code(code);
 586 }
 587
 588 static int
 589 #if defined(FOP_FSYNC_TAKES_DENTRY)
 590 afs_linux_fsync(struct file *fp, struct dentry *dp, int datasync)
 591 #elif defined(FOP_FSYNC_TAKES_RANGE)
 592 afs_linux_fsync(struct file *fp, loff_t start, loff_t end, int datasync)
 593 #else
 594 afs_linux_fsync(struct file *fp, int datasync)
 595 #endif
 596 {
 597     int code;
 598     struct inode *ip = FILE_INODE(fp);
 599     cred_t *credp = crref();
 600
 601 #if defined(FOP_FSYNC_TAKES_RANGE)
 602     afs_linux_lock_inode(ip);
 603 #endif
 604     AFS_GLOCK();
 605     code = afs_fsync(VTOAFS(ip), credp);
 606     AFS_GUNLOCK();
 607 #if defined(FOP_FSYNC_TAKES_RANGE)
 608     afs_linux_unlock_inode(ip);
 609 #endif
 610     crfree(credp);
 611     return afs_convert_code(code);
 612
 613 }
 614
 615
 616 static int
 617 afs_linux_lock(struct file *fp, int cmd, struct file_lock *flp)
 618 {
 619     int code = 0;
 620     struct vcache *vcp = VTOAFS(FILE_INODE(fp));
 621     cred_t *credp = crref();
 622     struct AFS_FLOCK flock;
 623
 624     /* Convert to a lock format afs_lockctl understands. */
 625     memset(&flock, 0, sizeof(flock));
 626     flock.l_type = flp->fl_type;
 627     flock.l_pid = flp->fl_pid;
 628     flock.l_whence = 0;
 629     flock.l_start = flp->fl_start;
 630     if (flp->fl_end == OFFSET_MAX)
 631         flock.l_len = 0; /* Lock to end of file */
 632     else
 633         flock.l_len = flp->fl_end - flp->fl_start + 1;
 634
 635     /* Safe because there are no large files, yet */
 636 #if defined(F_GETLK64) && (F_GETLK != F_GETLK64)
 637     if (cmd == F_GETLK64)
 638         cmd = F_GETLK;
 639     else if (cmd == F_SETLK64)
 640         cmd = F_SETLK;
 641     else if (cmd == F_SETLKW64)
 642         cmd = F_SETLKW;
 643 #endif /* F_GETLK64 && F_GETLK != F_GETLK64 */
 644
 645     AFS_GLOCK();
 646     code = afs_convert_code(afs_lockctl(vcp, &flock, cmd, credp));
 647     AFS_GUNLOCK();
 648
 649     if ((code == 0 || flp->fl_type == F_UNLCK) &&
 650         (cmd == F_SETLK || cmd == F_SETLKW)) {
 651         code = afs_posix_lock_file(fp, flp);
 652         if (code && flp->fl_type != F_UNLCK) {
 653             struct AFS_FLOCK flock2;
 654             flock2 = flock;
 655             flock2.l_type = F_UNLCK;
 656             AFS_GLOCK();
 657             afs_lockctl(vcp, &flock2, F_SETLK, credp);
 658             AFS_GUNLOCK();
 659         }
 660     }
 661     /* If lockctl says there are no conflicting locks, then also check with the
 662      * kernel, as lockctl knows nothing about byte range locks
 663      */
 664     if (code == 0 && cmd == F_GETLK && flock.l_type == F_UNLCK) {
 665         afs_posix_test_lock(fp, flp);
 666         /* If we found a lock in the kernel's structure, return it */
 667         if (flp->fl_type != F_UNLCK) {
 668             crfree(credp);
 669             return 0;
 670         }
 671     }
 672
 673     /* Convert flock back to Linux's file_lock */
 674     flp->fl_type = flock.l_type;
 675     flp->fl_pid = flock.l_pid;
 676     flp->fl_start = flock.l_start;
 677     if (flock.l_len == 0)
 678         flp->fl_end = OFFSET_MAX; /* Lock to end of file */
 679     else
 680         flp->fl_end = flock.l_start + flock.l_len - 1;
 681
 682     crfree(credp);
 683     return code;
 684 }
 685
 686 #ifdef STRUCT_FILE_OPERATIONS_HAS_FLOCK
 687 static int
 688 afs_linux_flock(struct file *fp, int cmd, struct file_lock *flp) {
 689     int code = 0;
 690     struct vcache *vcp = VTOAFS(FILE_INODE(fp));
 691     cred_t *credp = crref();
 692     struct AFS_FLOCK flock;
 693     /* Convert to a lock format afs_lockctl understands. */
 694     memset(&flock, 0, sizeof(flock));
 695     flock.l_type = flp->fl_type;
 696     flock.l_pid = flp->fl_pid;
 697     flock.l_whence = 0;
 698     flock.l_start = 0;
 699     flock.l_len = 0;
 700
 701     /* Safe because there are no large files, yet */
 702 #if defined(F_GETLK64) && (F_GETLK != F_GETLK64)
 703     if (cmd == F_GETLK64)
 704         cmd = F_GETLK;
 705     else if (cmd == F_SETLK64)
 706         cmd = F_SETLK;
 707     else if (cmd == F_SETLKW64)
 708         cmd = F_SETLKW;
 709 #endif /* F_GETLK64 && F_GETLK != F_GETLK64 */
 710
 711     AFS_GLOCK();
 712     code = afs_convert_code(afs_lockctl(vcp, &flock, cmd, credp));
 713     AFS_GUNLOCK();
 714
 715     if ((code == 0 || flp->fl_type == F_UNLCK) &&
 716         (cmd == F_SETLK || cmd == F_SETLKW)) {
 717         flp->fl_flags &=~ FL_SLEEP;
 718         code = flock_lock_file_wait(fp, flp);
 719         if (code && flp->fl_type != F_UNLCK) {
 720             struct AFS_FLOCK flock2;
 721             flock2 = flock;
 722             flock2.l_type = F_UNLCK;
 723             AFS_GLOCK();
 724             afs_lockctl(vcp, &flock2, F_SETLK, credp);
 725             AFS_GUNLOCK();
 726         }
 727     }
 728     /* Convert flock back to Linux's file_lock */
 729     flp->fl_type = flock.l_type;
 730     flp->fl_pid = flock.l_pid;
 731
 732     crfree(credp);
 733     return code;
 734 }
 735 #endif
 736
 737 /* afs_linux_flush
 738  * essentially the same as afs_fsync() but we need to get the return
 739  * code for the sys_close() here, not afs_linux_release(), so call
 740  * afs_StoreAllSegments() with AFS_LASTSTORE
 741  */
 742 static int
 743 #if defined(FOP_FLUSH_TAKES_FL_OWNER_T)
 744 afs_linux_flush(struct file *fp, fl_owner_t id)
 745 #else
 746 afs_linux_flush(struct file *fp)
 747 #endif
 748 {
 749     struct vrequest *treq = NULL;
 750     struct vcache *vcp;
 751     cred_t *credp;
 752     int code;
 753     int bypasscache = 0;
 754
 755     AFS_GLOCK();
 756
 757     if ((fp->f_flags & O_ACCMODE) == O_RDONLY) { /* readers dont flush */
 758         AFS_GUNLOCK();
 759         return 0;
 760     }
 761
 762     AFS_DISCON_LOCK();
 763
 764     credp = crref();
 765     vcp = VTOAFS(FILE_INODE(fp));
 766
 767     code = afs_CreateReq(&treq, credp);
 768     if (code)
 769         goto out;
 770     /* If caching is bypassed for this file, or globally, just return 0 */
 771     if (cache_bypass_strategy == ALWAYS_BYPASS_CACHE)
 772         bypasscache = 1;
 773     else {
 774         ObtainReadLock(&vcp->lock);
 775         if (vcp->cachingStates & FCSBypass)
 776             bypasscache = 1;
 777         ReleaseReadLock(&vcp->lock);
 778     }
 779     if (bypasscache) {
 780         /* future proof: don't rely on 0 return from afs_InitReq */
 781         code = 0;
 782         goto out;
 783     }
 784
 785     ObtainSharedLock(&vcp->lock, 535);
 786     if ((vcp->execsOrWriters > 0) && (file_count(fp) == 1)) {
 787         UpgradeSToWLock(&vcp->lock, 536);
 788         if (!AFS_IS_DISCONNECTED) {
 789                 code = afs_StoreAllSegments(vcp,
 790                                 treq,
 791                                 AFS_SYNC | AFS_LASTSTORE);
 792         } else {
 793                 afs_DisconAddDirty(vcp, VDisconWriteOsiFlush, 1);
 794         }
 795         ConvertWToSLock(&vcp->lock);
 796     }
 797     code = afs_CheckCode(code, treq, 54);
 798     ReleaseSharedLock(&vcp->lock);
 799
 800 out:
 801     afs_DestroyReq(treq);
 802     AFS_DISCON_UNLOCK();
 803     AFS_GUNLOCK();
 804
 805     crfree(credp);
 806     return afs_convert_code(code);
 807 }
 808
 809 struct file_operations afs_dir_fops = {
 810   .read =       generic_read_dir,
 811 #if defined(USE_FOP_ITERATE)
 812   .iterate =    afs_linux_readdir,
 813 #else
 814   .readdir =    afs_linux_readdir,
 815 #endif
 816 #ifdef HAVE_UNLOCKED_IOCTL
 817   .unlocked_ioctl = afs_unlocked_xioctl,
 818 #else
 819   .ioctl =      afs_xioctl,
 820 #endif
 821 #ifdef HAVE_COMPAT_IOCTL
 822   .compat_ioctl = afs_unlocked_xioctl,
 823 #endif
 824   .open =       afs_linux_open,
 825   .release =    afs_linux_release,
 826   .llseek =     default_llseek,
 827 #ifdef HAVE_LINUX_NOOP_FSYNC
 828   .fsync =      noop_fsync,
 829 #else
 830   .fsync =      simple_sync_file,
 831 #endif
 832 };
 833
 834 struct file_operations afs_file_fops = {
 835 #ifdef STRUCT_FILE_OPERATIONS_HAS_READ_ITER
 836   .read_iter =  afs_linux_read_iter,
 837   .write_iter = afs_linux_write_iter,
 838 # if !defined(HAVE_LINUX___VFS_WRITE) && !defined(HAVE_LINUX_KERNEL_WRITE)
 839   .read =       new_sync_read,
 840   .write =      new_sync_write,
 841 # endif
 842 #elif defined(HAVE_LINUX_GENERIC_FILE_AIO_READ)
 843   .aio_read =   afs_linux_aio_read,
 844   .aio_write =  afs_linux_aio_write,
 845   .read =       do_sync_read,
 846   .write =      do_sync_write,
 847 #else
 848   .read =       afs_linux_read,
 849   .write =      afs_linux_write,
 850 #endif
 851 #ifdef HAVE_UNLOCKED_IOCTL
 852   .unlocked_ioctl = afs_unlocked_xioctl,
 853 #else
 854   .ioctl =      afs_xioctl,
 855 #endif
 856 #ifdef HAVE_COMPAT_IOCTL
 857   .compat_ioctl = afs_unlocked_xioctl,
 858 #endif
 859   .mmap =       afs_linux_mmap,
 860   .open =       afs_linux_open,
 861   .flush =      afs_linux_flush,
 862 #if defined(STRUCT_FILE_OPERATIONS_HAS_SENDFILE)
 863   .sendfile =   generic_file_sendfile,
 864 #endif
 865 #if defined(STRUCT_FILE_OPERATIONS_HAS_SPLICE) && !defined(HAVE_LINUX_DEFAULT_FILE_SPLICE_READ)
 866 # if defined(HAVE_LINUX_ITER_FILE_SPLICE_WRITE)
 867   .splice_write = iter_file_splice_write,
 868 # else
 869   .splice_write = generic_file_splice_write,
 870 # endif
 871   .splice_read = generic_file_splice_read,
 872 #endif
 873   .release =    afs_linux_release,
 874   .fsync =      afs_linux_fsync,
 875   .lock =       afs_linux_lock,
 876 #ifdef STRUCT_FILE_OPERATIONS_HAS_FLOCK
 877   .flock =      afs_linux_flock,
 878 #endif
 879   .llseek =     default_llseek,
 880 };
 881
 882 static struct dentry *
 883 canonical_dentry(struct inode *ip)
 884 {
 885     struct vcache *vcp = VTOAFS(ip);
 886     struct dentry *first = NULL, *ret = NULL, *cur;
 887 #if defined(D_ALIAS_IS_HLIST) && !defined(HLIST_ITERATOR_NO_NODE)
 888     struct hlist_node *p;
 889 #endif
 890
 891     /* general strategy:
 892      * if vcp->target_link is set, and can be found in ip->i_dentry, use that.
 893      * otherwise, use the first dentry in ip->i_dentry.
 894      * if ip->i_dentry is empty, use the 'dentry' argument we were given.
 895      */
 896     /* note that vcp->target_link specifies which dentry to use, but we have
 897      * no reference held on that dentry. so, we cannot use or dereference
 898      * vcp->target_link itself, since it may have been freed. instead, we only
 899      * use it to compare to pointers in the ip->i_dentry list. */
 900
 901     d_prune_aliases(ip);
 902
 903     afs_d_alias_lock(ip);
 904
 905 #if defined(D_ALIAS_IS_HLIST)
 906 # if defined(HLIST_ITERATOR_NO_NODE)
 907     hlist_for_each_entry(cur, &ip->i_dentry, d_alias) {
 908 # else
 909     hlist_for_each_entry(cur, p, &ip->i_dentry, d_alias) {
 910 # endif
 911 #else
 912     list_for_each_entry_reverse(cur, &ip->i_dentry, d_alias) {
 913 #endif
 914
 915         if (!vcp->target_link || cur == vcp->target_link) {
 916             ret = cur;
 917             break;
 918         }
 919
 920         if (!first) {
 921             first = cur;
 922         }
 923     }
 924     if (!ret && first) {
 925         ret = first;
 926     }
 927
 928     vcp->target_link = ret;
 929
 930     if (ret) {
 931         afs_linux_dget(ret);
 932     }
 933     afs_d_alias_unlock(ip);
 934
 935     return ret;
 936 }
 937
 938 /**********************************************************************
 939  * AFS Linux dentry operations
 940  **********************************************************************/
 941
 942 /* afs_linux_revalidate
 943  * Ensure vcache is stat'd before use. Return 0 if entry is valid.
 944  */
 945 static int
 946 afs_linux_revalidate(struct dentry *dp)
 947 {
 948     struct vattr *vattr = NULL;
 949     struct vcache *vcp = VTOAFS(dp->d_inode);
 950     cred_t *credp;
 951     int code;
 952
 953     if (afs_shuttingdown != AFS_RUNNING)
 954         return EIO;
 955
 956     AFS_GLOCK();
 957
 958     code = afs_CreateAttr(&vattr);
 959     if (code) {
 960         goto out;
 961     }
 962
 963     /* This avoids the crref when we don't have to do it. Watch for
 964      * changes in afs_getattr that don't get replicated here!
 965      */
 966     if (vcp->f.states & CStatd &&
 967         (!afs_fakestat_enable || vcp->mvstat != AFS_MVSTAT_MTPT) &&
 968         !afs_nfsexporter &&
 969         (vType(vcp) == VDIR || vType(vcp) == VLNK)) {
 970         code = afs_CopyOutAttrs(vcp, vattr);
 971     } else {
 972         credp = crref();
 973         code = afs_getattr(vcp, vattr, credp);
 974         crfree(credp);
 975     }
 976
 977     if (!code)
 978         afs_fill_inode(AFSTOV(vcp), vattr);
 979
 980     afs_DestroyAttr(vattr);
 981
 982 out:
 983     AFS_GUNLOCK();
 984
 985     return afs_convert_code(code);
 986 }
 987
 988 /* vattr_setattr
 989  * Set iattr data into vattr. Assume vattr cleared before call.
 990  */
 991 static void
 992 iattr2vattr(struct vattr *vattrp, struct iattr *iattrp)
 993 {
 994     vattrp->va_mask = iattrp->ia_valid;
 995     if (iattrp->ia_valid & ATTR_MODE)
 996         vattrp->va_mode = iattrp->ia_mode;
 997     if (iattrp->ia_valid & ATTR_UID)
 998         vattrp->va_uid = afs_from_kuid(iattrp->ia_uid);
 999     if (iattrp->ia_valid & ATTR_GID)
1000         vattrp->va_gid = afs_from_kgid(iattrp->ia_gid);
1001     if (iattrp->ia_valid & ATTR_SIZE)
1002         vattrp->va_size = iattrp->ia_size;
1003     if (iattrp->ia_valid & ATTR_ATIME) {
1004         vattrp->va_atime.tv_sec = iattrp->ia_atime.tv_sec;
1005         vattrp->va_atime.tv_usec = 0;
1006     }
1007     if (iattrp->ia_valid & ATTR_MTIME) {
1008         vattrp->va_mtime.tv_sec = iattrp->ia_mtime.tv_sec;
1009         vattrp->va_mtime.tv_usec = 0;
1010     }
1011     if (iattrp->ia_valid & ATTR_CTIME) {
1012         vattrp->va_ctime.tv_sec = iattrp->ia_ctime.tv_sec;
1013         vattrp->va_ctime.tv_usec = 0;
1014     }
1015 }
1016
1017 /* vattr2inode
1018  * Rewrite the inode cache from the attr. Assumes all vattr fields are valid.
1019  */
1020 void
1021 vattr2inode(struct inode *ip, struct vattr *vp)
1022 {
1023     ip->i_ino = vp->va_nodeid;
1024 #ifdef HAVE_LINUX_SET_NLINK
1025     set_nlink(ip, vp->va_nlink);
1026 #else
1027     ip->i_nlink = vp->va_nlink;
1028 #endif
1029     ip->i_blocks = vp->va_blocks;
1030 #ifdef STRUCT_INODE_HAS_I_BLKBITS
1031     ip->i_blkbits = AFS_BLKBITS;
1032 #endif
1033 #ifdef STRUCT_INODE_HAS_I_BLKSIZE
1034     ip->i_blksize = vp->va_blocksize;
1035 #endif
1036     ip->i_rdev = vp->va_rdev;
1037     ip->i_mode = vp->va_mode;
1038     ip->i_uid = afs_make_kuid(vp->va_uid);
1039     ip->i_gid = afs_make_kgid(vp->va_gid);
1040     i_size_write(ip, vp->va_size);
1041     ip->i_atime.tv_sec = vp->va_atime.tv_sec;
1042     ip->i_atime.tv_nsec = 0;
1043     ip->i_mtime.tv_sec = vp->va_mtime.tv_sec;
1044     /* Set the mtime nanoseconds to the sysname generation number.
1045      * This convinces NFS clients that all directories have changed
1046      * any time the sysname list changes.
1047      */
1048     ip->i_mtime.tv_nsec = afs_sysnamegen;
1049     ip->i_ctime.tv_sec = vp->va_ctime.tv_sec;
1050     ip->i_ctime.tv_nsec = 0;
1051 }
1052
1053 /* afs_notify_change
1054  * Linux version of setattr call. What to change is in the iattr struct.
1055  * We need to set bits in both the Linux inode as well as the vcache.
1056  */
1057 static int
1058 afs_notify_change(struct dentry *dp, struct iattr *iattrp)
1059 {
1060     struct vattr *vattr = NULL;
1061     cred_t *credp = crref();
1062     struct inode *ip = dp->d_inode;
1063     int code;
1064
1065     AFS_GLOCK();
1066     code = afs_CreateAttr(&vattr);
1067     if (code) {
1068         goto out;
1069     }
1070
1071     iattr2vattr(vattr, iattrp); /* Convert for AFS vnodeops call. */
1072
1073     code = afs_setattr(VTOAFS(ip), vattr, credp);
1074     if (!code) {
1075         afs_getattr(VTOAFS(ip), vattr, credp);
1076         vattr2inode(ip, vattr);
1077     }
1078     afs_DestroyAttr(vattr);
1079
1080 out:
1081     AFS_GUNLOCK();
1082     crfree(credp);
1083     return afs_convert_code(code);
1084 }
1085
1086 #if defined(IOP_GETATTR_TAKES_PATH_STRUCT)
1087 static int
1088 afs_linux_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int sync_mode)
1089 {
1090         int err = afs_linux_revalidate(path->dentry);
1091         if (!err) {
1092                 generic_fillattr(path->dentry->d_inode, stat);
1093         }
1094         return err;
1095 }
1096 #else
1097 static int
1098 afs_linux_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1099 {
1100         int err = afs_linux_revalidate(dentry);
1101         if (!err) {
1102                 generic_fillattr(dentry->d_inode, stat);
1103         }
1104         return err;
1105 }
1106 #endif
1107
1108 static afs_uint32
1109 parent_vcache_dv(struct inode *inode, cred_t *credp)
1110 {
1111     int free_cred = 0;
1112     struct vcache *pvcp;
1113
1114     /*
1115      * If parent is a mount point and we are using fakestat, we may need
1116      * to look at the fake vcache entry instead of what the vfs is giving
1117      * us.  The fake entry is the one with the useful DataVersion.
1118      */
1119     pvcp = VTOAFS(inode);
1120     if (pvcp->mvstat == AFS_MVSTAT_MTPT && afs_fakestat_enable) {
1121         struct vrequest treq;
1122         struct afs_fakestat_state fakestate;
1123
1124         if (!credp) {
1125             credp = crref();
1126             free_cred = 1;
1127         }
1128         afs_InitReq(&treq, credp);
1129         afs_InitFakeStat(&fakestate);
1130         afs_TryEvalFakeStat(&pvcp, &fakestate, &treq);
1131         if (free_cred)
1132             crfree(credp);
1133         afs_PutFakeStat(&fakestate);
1134     }
1135     return hgetlo(pvcp->f.m.DataVersion);
1136 }
1137
#ifdef D_SPLICE_ALIAS_RACE

/* Serializes our d_splice_alias() calls against check_dentry_race(). */
# if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
static DEFINE_MUTEX(dentry_race_sem);
# else
static DECLARE_MUTEX(dentry_race_sem);
# endif

static inline void
dentry_race_lock(void)
{
    mutex_lock(&dentry_race_sem);
}

static inline void
dentry_race_unlock(void)
{
    mutex_unlock(&dentry_race_sem);
}

/* Leave some trace that this code is enabled; otherwise it's pretty hard to
 * tell. */
static __attribute__((used)) const char dentry_race_marker[] = "d_splice_alias race workaround enabled";

/* check_dentry_race
 * Return 1 if 'dp' is a negative dentry that is currently unhashed, i.e. it
 * was handed to us while d_splice_alias had only momentarily hashed it;
 * return 0 otherwise.
 */
static int
check_dentry_race(struct dentry *dp)
{
    int unhashed;

    /* Only dentries without an inode can be affected. */
    if (dp->d_inode)
        return 0;

    /* In Linux, before commit 4919c5e45a91b5db5a41695fe0357fbdff0d5767,
     * d_splice_alias can momentarily hash a dentry before it's fully
     * populated. This only happens for a moment, since it's unhashed again
     * right after (in d_move), but this can make the dentry be found by
     * __d_lookup, and then given to us.
     *
     * So check if the dentry is unhashed; if it is, then the dentry is not
     * valid. We lock dentry_race_lock() to ensure that d_splice_alias is
     * no longer running. Locking d_lock is required to check the dentry's
     * flags, so lock that, too.
     */
    dentry_race_lock();
    spin_lock(&dp->d_lock);
    unhashed = d_unhashed(dp);
    spin_unlock(&dp->d_lock);
    dentry_race_unlock();

    return unhashed ? 1 : 0;
}

#else /* !D_SPLICE_ALIAS_RACE */

/* The workaround is disabled: the lock helpers are no-ops. */
static inline void dentry_race_lock(void) {}
static inline void dentry_race_unlock(void) {}

#endif /* D_SPLICE_ALIAS_RACE */
1193
/* Validate a dentry. Return 1 if unchanged, 0 if VFS layer should re-evaluate.
 * In kernels 2.2.10 and above, we are passed an additional flags var which
 * may have either the LOOKUP_FOLLOW OR LOOKUP_DIRECTORY set in which case
 * we are advised to follow the entry if it is a link or to make sure that
 * it is a directory. But since the kernel itself checks these possibilities
 * later on, we shouldn't have to do it until later. Perhaps in the future..
 *
 * When LOOKUP_RCU is defined and set in the flags, -ECHILD is returned
 * instead, since RCU path walking is not supported here.
 *
 * The code here assumes that on entry the global lock is not held
 */
static int
#if defined(DOP_REVALIDATE_TAKES_UNSIGNED)
afs_linux_dentry_revalidate(struct dentry *dp, unsigned int flags)
#elif defined(DOP_REVALIDATE_TAKES_NAMEIDATA)
afs_linux_dentry_revalidate(struct dentry *dp, struct nameidata *nd)
#else
afs_linux_dentry_revalidate(struct dentry *dp, int flags)
#endif
{
    cred_t *credp = NULL;       /* taken lazily; released at 'done' if set */
    struct vcache *vcp, *pvcp, *tvc = NULL;
    struct dentry *parent;
    int valid;
    struct afs_fakestat_state fakestate;
    int force_drop = 0;         /* 1: unhash the dentry even if still in use */
    afs_uint32 parent_dv;

#ifdef LOOKUP_RCU
    /* We don't support RCU path walking */
# if defined(DOP_REVALIDATE_TAKES_UNSIGNED)
    if (flags & LOOKUP_RCU)
# else
    if (nd->flags & LOOKUP_RCU)
# endif
       return -ECHILD;
#endif

#ifdef D_SPLICE_ALIAS_RACE
    /* Checked before taking the global lock; nothing to clean up yet. */
    if (check_dentry_race(dp)) {
        valid = 0;
        return valid;
    }
#endif

    AFS_GLOCK();
    afs_InitFakeStat(&fakestate);

    if (dp->d_inode) {
        vcp = VTOAFS(dp->d_inode);

        /* The AFS root vcache is always considered valid. */
        if (vcp == afs_globalVp)
            goto good_dentry;

        if (vcp->mvstat == AFS_MVSTAT_MTPT) {
            if (vcp->mvid.target_root && (vcp->f.states & CMValid)) {
                int tryEvalOnly = 0;
                int code = 0;
                struct vrequest *treq = NULL;

                credp = crref();

                code = afs_CreateReq(&treq, credp);
                if (code) {
                    goto bad_dentry;
                }
                /* For the name ".directory", only try the evaluation. */
                if ((strcmp(dp->d_name.name, ".directory") == 0)) {
                    tryEvalOnly = 1;
                }
                /* Either call may replace 'vcp' with the evaluated vcache. */
                if (tryEvalOnly)
                    code = afs_TryEvalFakeStat(&vcp, &fakestate, treq);
                else
                    code = afs_EvalFakeStat(&vcp, &fakestate, treq);
                afs_DestroyReq(treq);
                if ((tryEvalOnly && vcp->mvstat == AFS_MVSTAT_MTPT) || code) {
                    /* a mount point, not yet replaced by its directory */
                    goto bad_dentry;
                }
            }
        } else if (vcp->mvstat == AFS_MVSTAT_ROOT && *dp->d_name.name != '/') {
            osi_Assert(vcp->mvid.parent != NULL);
        }

#ifdef notdef
        /* If the last looker changes, we should make sure the current
         * looker still has permission to examine this file.  This would
         * always require a crref() which would be "slow".
         */
        if (vcp->last_looker != treq.uid) {
            if (!afs_AccessOK(vcp, (vType(vcp) == VREG) ? PRSFS_READ : PRSFS_LOOKUP, &treq, CHECK_MODE_BITS)) {
                goto bad_dentry;
            }

            vcp->last_looker = treq.uid;
        }
#endif

        /* Holds a reference on the parent; every exit below must dput() it. */
        parent = dget_parent(dp);
        pvcp = VTOAFS(parent->d_inode);
        parent_dv = parent_vcache_dv(parent->d_inode, credp);

        /* If the parent's DataVersion has changed or the vnode
         * is no longer valid, we need to do a full lookup.  VerifyVCache
         * isn't enough since the vnode may have been renamed.
         */

        if (parent_dv > dp->d_time || !(vcp->f.states & CStatd)) {
            struct vattr *vattr = NULL;
            int code;
            int lookup_good;

            if (credp == NULL) {
                credp = crref();
            }
            /* On success 'tvc' is released at 'done' via afs_PutVCache(). */
            code = afs_lookup(pvcp, (char *)dp->d_name.name, &tvc, credp);

            if (code) {
                /* We couldn't perform the lookup, so we're not okay. */
                lookup_good = 0;

            } else if (tvc == vcp) {
                /* We got back the same vcache, so we're good. */
                lookup_good = 1;

            } else if (tvc == VTOAFS(dp->d_inode)) {
                /* We got back the same vcache, so we're good. This is
                 * different from the above case, because sometimes 'vcp' is
                 * not the same as the vcache for dp->d_inode, if 'vcp' was a
                 * mtpt and we evaluated it to a root dir. In rare cases,
                 * afs_lookup might not evaluate the mtpt when we do, or vice
                 * versa, so the previous case will not succeed. But this is
                 * still 'correct', so make sure not to mark the dentry as
                 * invalid; it still points to the same thing! */
                lookup_good = 1;

            } else {
                /* We got back a different file, so we're definitely not
                 * okay. */
                lookup_good = 0;
            }

            if (!lookup_good) {
                dput(parent);
                /* Force unhash; the name doesn't point to this file
                 * anymore. */
                force_drop = 1;
                if (code && code != ENOENT) {
                    /* ...except if we couldn't perform the actual lookup,
                     * we don't know if the name points to this file or not. */
                    force_drop = 0;
                }
                goto bad_dentry;
            }

            code = afs_CreateAttr(&vattr);
            if (code) {
                dput(parent);
                goto bad_dentry;
            }

            if (afs_getattr(vcp, vattr, credp)) {
                dput(parent);
                afs_DestroyAttr(vattr);
                goto bad_dentry;
            }

            /* Refresh the inode and record the parent version we validated
             * against, so the full lookup is skipped next time. */
            vattr2inode(AFSTOV(vcp), vattr);
            dp->d_time = parent_dv;

            afs_DestroyAttr(vattr);
        }

        /* should we always update the attributes at this point? */
        /* unlikely--the vcache entry hasn't changed */

        dput(parent);

    } else {

        /* 'dp' represents a cached negative lookup. */

        parent = dget_parent(dp);
        pvcp = VTOAFS(parent->d_inode);
        parent_dv = parent_vcache_dv(parent->d_inode, credp);

        /* The negative entry is rejected if the parent changed, the parent
         * is not stat'd, or the parent is the dynroot. */
        if (parent_dv > dp->d_time || !(pvcp->f.states & CStatd)
            || afs_IsDynroot(pvcp)) {
            dput(parent);
            goto bad_dentry;
        }

        dput(parent);
    }

  good_dentry:
    valid = 1;
    goto done;

  bad_dentry:
    valid = 0;
#ifndef D_INVALIDATE_IS_VOID
    /* When (v3.18) d_invalidate was converted to void, it also started
     * being called automatically from revalidate, and automatically
     * handled:
     *  - shrink_dcache_parent
     *  - automatic detach of submounts
     *  - d_drop
     * Therefore, after that point, OpenAFS revalidate logic no longer needs
     * to do any of those things itself for invalid dentry structs.  We only need
     * to tell VFS it's invalid (by returning 0), and VFS will handle the rest.
     */
    /* Before that change: a dentry with submounts is reported as valid. */
    if (have_submounts(dp))
        valid = 1;
#endif

  done:
    /* Clean up */
    if (tvc)
        afs_PutVCache(tvc);
    afs_PutFakeStat(&fakestate);
    AFS_GUNLOCK();
    if (credp)
        crfree(credp);

#ifndef D_INVALIDATE_IS_VOID
    if (!valid) {
        /*
         * If we had a negative lookup for the name we want to forcibly
         * unhash the dentry.
         * Otherwise use d_invalidate which will not unhash it if still in use.
         */
        if (force_drop) {
            shrink_dcache_parent(dp);
            d_drop(dp);
        } else
            d_invalidate(dp);
    }
#endif
    return valid;

}
1433
1434 static void
1435 afs_dentry_iput(struct dentry *dp, struct inode *ip)
1436 {
1437     struct vcache *vcp = VTOAFS(ip);
1438
1439     AFS_GLOCK();
1440     if (!AFS_IS_DISCONNECTED || (vcp->f.states & CUnlinked)) {
1441         (void) afs_InactiveVCache(vcp, NULL);
1442     }
1443     AFS_GUNLOCK();
1444     afs_linux_clear_nfsfs_renamed(dp);
1445
1446     iput(ip);
1447 }
1448
1449 static int
1450 #if defined(DOP_D_DELETE_TAKES_CONST)
1451 afs_dentry_delete(const struct dentry *dp)
1452 #else
1453 afs_dentry_delete(struct dentry *dp)
1454 #endif
1455 {
1456     if (dp->d_inode && (VTOAFS(dp->d_inode)->f.states & CUnlinked))
1457         return 1;               /* bad inode? */
1458
1459     return 0;
1460 }
1461
#ifdef STRUCT_DENTRY_OPERATIONS_HAS_D_AUTOMOUNT
/* afs_dentry_automount
 * dentry_operations.d_automount hook. Redirects 'path' to the canonical
 * dentry for its inode when that differs from path->dentry; otherwise clears
 * DCACHE_NEED_AUTOMOUNT on path->dentry. Always returns NULL.
 */
static struct vfsmount *
afs_dentry_automount(afs_linux_path_t *path)
{
    struct dentry *canon;

    /*
     * Avoid symlink resolution limits when resolving; we cannot contribute to
     * an infinite symlink loop.
     *
     * On newer kernels the field has moved to the private nameidata structure
     * so we can't adjust it here.  This may cause ELOOP when using a path with
     * 40 or more directories that are not already in the dentry cache.
     */
#if defined(STRUCT_TASK_STRUCT_HAS_TOTAL_LINK_COUNT)
    current->total_link_count--;
#endif

    canon = canonical_dentry(path->dentry->d_inode);

    if (canon != NULL && canon != path->dentry) {
        /* Swap in the canonical dentry; its reference now belongs to
         * 'path', and the reference on the old dentry is released. */
        dput(path->dentry);
        path->dentry = canon;
        return NULL;
    }

    /* Either there is no canonical dentry, or it is the one we already
     * have. In the latter case drop the extra reference we were given. */
    if (canon != NULL)
        dput(canon);

    spin_lock(&path->dentry->d_lock);
    path->dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
    spin_unlock(&path->dentry->d_lock);

    return NULL;
}
#endif /* STRUCT_DENTRY_OPERATIONS_HAS_D_AUTOMOUNT */
1500
1501 struct dentry_operations afs_dentry_operations = {
1502   .d_revalidate =       afs_linux_dentry_revalidate,
1503   .d_delete =           afs_dentry_delete,
1504   .d_iput =             afs_dentry_iput,
1505 #ifdef STRUCT_DENTRY_OPERATIONS_HAS_D_AUTOMOUNT
1506   .d_automount =        afs_dentry_automount,
1507 #endif /* STRUCT_DENTRY_OPERATIONS_HAS_D_AUTOMOUNT */
1508 };
1509
1510 /**********************************************************************
1511  * AFS Linux inode operations
1512  **********************************************************************/
1513
1514 /* afs_linux_create
1515  *
1516  * Merely need to set enough of vattr to get us through the create. Note
1517  * that the higher level code (open_namei) will take care of any tuncation
1518  * explicitly. Exclusive open is also taken care of in open_namei.
1519  *
1520  * name is in kernel space at this point.
1521  */
1522 static int
1523 #if defined(IOP_CREATE_TAKES_BOOL)
1524 afs_linux_create(struct inode *dip, struct dentry *dp, umode_t mode,
1525                  bool excl)
1526 #elif defined(IOP_CREATE_TAKES_UMODE_T)
1527 afs_linux_create(struct inode *dip, struct dentry *dp, umode_t mode,
1528                  struct nameidata *nd)
1529 #elif defined(IOP_CREATE_TAKES_NAMEIDATA)
1530 afs_linux_create(struct inode *dip, struct dentry *dp, int mode,
1531                  struct nameidata *nd)
1532 #else
1533 afs_linux_create(struct inode *dip, struct dentry *dp, int mode)
1534 #endif
1535 {
1536     struct vattr *vattr = NULL;
1537     cred_t *credp = crref();
1538     const char *name = dp->d_name.name;
1539     struct vcache *vcp;
1540     int code;
1541
1542     AFS_GLOCK();
1543
1544     code = afs_CreateAttr(&vattr);
1545     if (code) {
1546         goto out;
1547     }
1548     vattr->va_mode = mode;
1549     vattr->va_type = mode & S_IFMT;
1550
1551     code = afs_create(VTOAFS(dip), (char *)name, vattr, NONEXCL, mode,
1552                       &vcp, credp);
1553
1554     if (!code) {
1555         struct inode *ip = AFSTOV(vcp);
1556
1557         afs_getattr(vcp, vattr, credp);
1558         afs_fill_inode(ip, vattr);
1559         insert_inode_hash(ip);
1560 #if !defined(STRUCT_SUPER_BLOCK_HAS_S_D_OP)
1561         dp->d_op = &afs_dentry_operations;
1562 #endif
1563         dp->d_time = parent_vcache_dv(dip, credp);
1564         d_instantiate(dp, ip);
1565     }
1566
1567     afs_DestroyAttr(vattr);
1568
1569 out:
1570     AFS_GUNLOCK();
1571
1572     crfree(credp);
1573     return afs_convert_code(code);
1574 }
1575
1576 /* afs_linux_lookup */
1577 static struct dentry *
1578 #if defined(IOP_LOOKUP_TAKES_UNSIGNED)
1579 afs_linux_lookup(struct inode *dip, struct dentry *dp,
1580                  unsigned flags)
1581 #elif defined(IOP_LOOKUP_TAKES_NAMEIDATA)
1582 afs_linux_lookup(struct inode *dip, struct dentry *dp,
1583                  struct nameidata *nd)
1584 #else
1585 afs_linux_lookup(struct inode *dip, struct dentry *dp)
1586 #endif
1587 {
1588     cred_t *credp = crref();
1589     struct vcache *vcp = NULL;
1590     const char *comp = dp->d_name.name;
1591     struct inode *ip = NULL;
1592     struct dentry *newdp = NULL;
1593     int code;
1594
1595     AFS_GLOCK();
1596
1597     code = afs_lookup(VTOAFS(dip), (char *)comp, &vcp, credp);
1598     if (code == ENOENT) {
1599         /* It's ok for the file to not be found. That's noted by the caller by
1600          * seeing that the dp->d_inode field is NULL (set by d_splice_alias or
1601          * d_add, below). */
1602         code = 0;
1603         osi_Assert(vcp == NULL);
1604     }
1605     if (code) {
1606         AFS_GUNLOCK();
1607         goto done;
1608     }
1609
1610     if (vcp) {
1611         struct vattr *vattr = NULL;
1612         struct vcache *parent_vc = VTOAFS(dip);
1613
1614         if (parent_vc == vcp) {
1615             /* This is possible if the parent dir is a mountpoint to a volume,
1616              * and the dir entry we looked up is a mountpoint to the same
1617              * volume. Linux cannot cope with this, so return an error instead
1618              * of risking a deadlock or panic. */
1619             afs_PutVCache(vcp);
1620             code = EDEADLK;
1621             AFS_GUNLOCK();
1622             goto done;
1623         }
1624
1625         code = afs_CreateAttr(&vattr);
1626         if (code) {
1627             afs_PutVCache(vcp);
1628             AFS_GUNLOCK();
1629             goto done;
1630         }
1631
1632         ip = AFSTOV(vcp);
1633         afs_getattr(vcp, vattr, credp);
1634         afs_fill_inode(ip, vattr);
1635         if (hlist_unhashed(&ip->i_hash))
1636             insert_inode_hash(ip);
1637
1638         afs_DestroyAttr(vattr);
1639     }
1640 #if !defined(STRUCT_SUPER_BLOCK_HAS_S_D_OP)
1641     dp->d_op = &afs_dentry_operations;
1642 #endif
1643     dp->d_time = parent_vcache_dv(dip, credp);
1644
1645     AFS_GUNLOCK();
1646
1647     if (ip && S_ISDIR(ip->i_mode)) {
1648         d_prune_aliases(ip);
1649
1650 #ifdef STRUCT_DENTRY_OPERATIONS_HAS_D_AUTOMOUNT
1651         /* Only needed if this is a volume root */
1652         if (vcp->mvstat == 2)
1653             ip->i_flags |= S_AUTOMOUNT;
1654 #endif
1655     }
1656     /*
1657      * Take an extra reference so the inode doesn't go away if
1658      * d_splice_alias drops our reference on error.
1659      */
1660     if (ip)
1661 #ifdef HAVE_LINUX_IHOLD
1662         ihold(ip);
1663 #else
1664         igrab(ip);
1665 #endif
1666
1667     dentry_race_lock();
1668     newdp = d_splice_alias(ip, dp);
1669     dentry_race_unlock();
1670
1671  done:
1672     crfree(credp);
1673
1674     if (IS_ERR(newdp)) {
1675         /* d_splice_alias can return an error (EIO) if there is an existing
1676          * connected directory alias for this dentry. Add our dentry manually
1677          * ourselves if this happens. */
1678         d_add(dp, ip);
1679
1680 #if defined(D_SPLICE_ALIAS_LEAK_ON_ERROR)
1681         /* Depending on the kernel version, d_splice_alias may or may not drop
1682          * the inode reference on error. If it didn't, do it here. */
1683         iput(ip);
1684 #endif
1685         return NULL;
1686     }
1687
1688     if (code) {
1689         if (ip)
1690             iput(ip);
1691         return ERR_PTR(afs_convert_code(code));
1692     }
1693
1694     iput(ip);
1695     return newdp;
1696 }
1697
1698 static int
1699 afs_linux_link(struct dentry *olddp, struct inode *dip, struct dentry *newdp)
1700 {
1701     int code;
1702     cred_t *credp = crref();
1703     const char *name = newdp->d_name.name;
1704     struct inode *oldip = olddp->d_inode;
1705
1706     /* If afs_link returned the vnode, we could instantiate the
1707      * dentry. Since it's not, we drop this one and do a new lookup.
1708      */
1709     d_drop(newdp);
1710
1711     AFS_GLOCK();
1712     code = afs_link(VTOAFS(oldip), VTOAFS(dip), (char *)name, credp);
1713
1714     AFS_GUNLOCK();
1715     crfree(credp);
1716     return afs_convert_code(code);
1717 }
1718
/* We have to have a Linux specific sillyrename function, because we
 * also have to keep the dcache up to date when we're doing a silly
 * rename - so we don't want the generic vnodeops doing this behind our
 * back.
 *
 * Renames 'dentry' (in directory 'dir') to a freshly generated name that
 * has no inode in the dcache, and marks the vcache CUnlinked. Returns 0 on
 * success, EBUSY if the dentry was already silly-renamed or the lookup of
 * the new name failed, or the code from afs_rename(). The value is an
 * unconverted, positive code; afs_linux_unlink() converts it.
 */

static int
afs_linux_sillyrename(struct inode *dir, struct dentry *dentry,
                      cred_t *credp)
{
    struct vcache *tvc = VTOAFS(dentry->d_inode);
    struct dentry *__dp = NULL;
    char *__name = NULL;
    int code;

    if (afs_linux_nfsfs_renamed(dentry))
        return EBUSY;

    /* Keep generating names until we find one with no inode attached. */
    do {
        /* Release the candidate from the previous iteration (NULL at first). */
        dput(__dp);

        AFS_GLOCK();
        if (__name)
            osi_FreeSmallSpace(__name);
        __name = afs_newname();
        AFS_GUNLOCK();

        __dp = lookup_one_len(__name, dentry->d_parent, strlen(__name));

        if (IS_ERR(__dp)) {
            osi_FreeSmallSpace(__name);
            return EBUSY;
        }
    } while (__dp->d_inode != NULL);

    AFS_GLOCK();
    code = afs_rename(VTOAFS(dir), (char *)dentry->d_name.name,
                      VTOAFS(dir), (char *)__dp->d_name.name,
                      credp);
    if (!code) {
        /* The vcache takes over '__name'; it is not freed here. */
        tvc->mvid.silly_name = __name;
        /* Hold the cred for the vcache, replacing any previous one. */
        crhold(credp);
        if (tvc->uncred) {
            crfree(tvc->uncred);
        }
        tvc->uncred = credp;
        tvc->f.states |= CUnlinked;
        afs_linux_set_nfsfs_renamed(dentry);

        __dp->d_time = 0;               /* force to revalidate */
        d_move(dentry, __dp);
    } else {
        osi_FreeSmallSpace(__name);
    }
    AFS_GUNLOCK();

    dput(__dp);

    return code;
}
1779
1780
1781 static int
1782 afs_linux_unlink(struct inode *dip, struct dentry *dp)
1783 {
1784     int code = EBUSY;
1785     cred_t *credp = crref();
1786     const char *name = dp->d_name.name;
1787     struct vcache *tvc = VTOAFS(dp->d_inode);
1788
1789     if (VREFCOUNT(tvc) > 1 && tvc->opens > 0
1790                                 && !(tvc->f.states & CUnlinked)) {
1791
1792         code = afs_linux_sillyrename(dip, dp, credp);
1793     } else {
1794         AFS_GLOCK();
1795         code = afs_remove(VTOAFS(dip), (char *)name, credp);
1796         AFS_GUNLOCK();
1797         if (!code)
1798             d_drop(dp);
1799     }
1800
1801     crfree(credp);
1802     return afs_convert_code(code);
1803 }
1804
1805
1806 static int
1807 afs_linux_symlink(struct inode *dip, struct dentry *dp, const char *target)
1808 {
1809     int code;
1810     cred_t *credp = crref();
1811     struct vattr *vattr = NULL;
1812     const char *name = dp->d_name.name;
1813
1814     /* If afs_symlink returned the vnode, we could instantiate the
1815      * dentry. Since it's not, we drop this one and do a new lookup.
1816      */
1817     d_drop(dp);
1818
1819     AFS_GLOCK();
1820     code = afs_CreateAttr(&vattr);
1821     if (code) {
1822         goto out;
1823     }
1824
1825     code = afs_symlink(VTOAFS(dip), (char *)name, vattr, (char *)target, NULL,
1826                         credp);
1827     afs_DestroyAttr(vattr);
1828
1829 out:
1830     AFS_GUNLOCK();
1831     crfree(credp);
1832     return afs_convert_code(code);
1833 }
1834
1835 static int
1836 #if defined(IOP_MKDIR_TAKES_UMODE_T)
1837 afs_linux_mkdir(struct inode *dip, struct dentry *dp, umode_t mode)
1838 #else
1839 afs_linux_mkdir(struct inode *dip, struct dentry *dp, int mode)
1840 #endif
1841 {
1842     int code;
1843     cred_t *credp = crref();
1844     struct vcache *tvcp = NULL;
1845     struct vattr *vattr = NULL;
1846     const char *name = dp->d_name.name;
1847
1848     AFS_GLOCK();
1849     code = afs_CreateAttr(&vattr);
1850     if (code) {
1851         goto out;
1852     }
1853
1854     vattr->va_mask = ATTR_MODE;
1855     vattr->va_mode = mode;
1856
1857     code = afs_mkdir(VTOAFS(dip), (char *)name, vattr, &tvcp, credp);
1858
1859     if (tvcp) {
1860         struct inode *ip = AFSTOV(tvcp);
1861
1862         afs_getattr(tvcp, vattr, credp);
1863         afs_fill_inode(ip, vattr);
1864
1865 #if !defined(STRUCT_SUPER_BLOCK_HAS_S_D_OP)
1866         dp->d_op = &afs_dentry_operations;
1867 #endif
1868         dp->d_time = parent_vcache_dv(dip, credp);
1869         d_instantiate(dp, ip);
1870     }
1871     afs_DestroyAttr(vattr);
1872
1873 out:
1874     AFS_GUNLOCK();
1875
1876     crfree(credp);
1877     return afs_convert_code(code);
1878 }
1879
1880 static int
1881 afs_linux_rmdir(struct inode *dip, struct dentry *dp)
1882 {
1883     int code;
1884     cred_t *credp = crref();
1885     const char *name = dp->d_name.name;
1886
1887     /* locking kernel conflicts with glock? */
1888
1889     AFS_GLOCK();
1890     code = afs_rmdir(VTOAFS(dip), (char *)name, credp);
1891     AFS_GUNLOCK();
1892
1893     /* Linux likes to see ENOTEMPTY returned from an rmdir() syscall
1894      * that failed because a directory is not empty. So, we map
1895      * EEXIST to ENOTEMPTY on linux.
1896      */
1897     if (code == EEXIST) {
1898         code = ENOTEMPTY;
1899     }
1900
1901     if (!code) {
1902         d_drop(dp);
1903     }
1904
1905     crfree(credp);
1906     return afs_convert_code(code);
1907 }
1908
1909
1910 static int
1911 afs_linux_rename(struct inode *oldip, struct dentry *olddp,
1912                  struct inode *newip, struct dentry *newdp
1913 #ifdef HAVE_LINUX_INODE_OPERATIONS_RENAME_TAKES_FLAGS
1914                  , unsigned int flags
1915 #endif
1916                 )
1917 {
1918     int code;
1919     cred_t *credp = crref();
1920     const char *oldname = olddp->d_name.name;
1921     const char *newname = newdp->d_name.name;
1922     struct dentry *rehash = NULL;
1923
1924 #ifdef HAVE_LINUX_INODE_OPERATIONS_RENAME_TAKES_FLAGS
1925     if (flags)
1926         return -EINVAL;         /* no support for new flags yet */
1927 #endif
1928
1929     /* Prevent any new references during rename operation. */
1930
1931     if (!d_unhashed(newdp)) {
1932         d_drop(newdp);
1933         rehash = newdp;
1934     }
1935
1936     afs_maybe_shrink_dcache(olddp);
1937
1938     AFS_GLOCK();
1939     code = afs_rename(VTOAFS(oldip), (char *)oldname, VTOAFS(newip), (char *)newname, credp);
1940     AFS_GUNLOCK();
1941
1942     if (!code)
1943         olddp->d_time = 0;      /* force to revalidate */
1944
1945     if (rehash)
1946         d_rehash(rehash);
1947
1948     crfree(credp);
1949     return afs_convert_code(code);
1950 }
1951
1952
1953 /* afs_linux_ireadlink
1954  * Internal readlink which can return link contents to user or kernel space.
1955  * Note that the buffer is NOT supposed to be null-terminated.
1956  */
1957 static int
1958 afs_linux_ireadlink(struct inode *ip, char *target, int maxlen, uio_seg_t seg)
1959 {
1960     int code;
1961     cred_t *credp = crref();
1962     struct uio tuio;
1963     struct iovec iov;
1964
1965     memset(&tuio, 0, sizeof(tuio));
1966     memset(&iov, 0, sizeof(iov));
1967
1968     setup_uio(&tuio, &iov, target, (afs_offs_t) 0, maxlen, UIO_READ, seg);
1969     code = afs_readlink(VTOAFS(ip), &tuio, credp);
1970     crfree(credp);
1971
1972     if (!code)
1973         return maxlen - tuio.uio_resid;
1974     else
1975         return afs_convert_code(code);
1976 }
1977
1978 #if !defined(USABLE_KERNEL_PAGE_SYMLINK_CACHE)
1979 /* afs_linux_readlink
1980  * Fill target (which is in user space) with contents of symlink.
1981  */
1982 static int
1983 afs_linux_readlink(struct dentry *dp, char *target, int maxlen)
1984 {
1985     int code;
1986     struct inode *ip = dp->d_inode;
1987
1988     AFS_GLOCK();
1989     code = afs_linux_ireadlink(ip, target, maxlen, AFS_UIOUSER);
1990     AFS_GUNLOCK();
1991     return code;
1992 }
1993
1994
1995 /* afs_linux_follow_link
1996  * a file system dependent link following routine.
1997  */
1998 #if defined(HAVE_LINUX_INODE_OPERATIONS_FOLLOW_LINK_NO_NAMEIDATA)
1999 static const char *afs_linux_follow_link(struct dentry *dentry, void **link_data)
2000 #else
2001 static int afs_linux_follow_link(struct dentry *dentry, struct nameidata *nd)
2002 #endif
2003 {
2004     int code;
2005     char *name;
2006
2007     name = kmalloc(PATH_MAX, GFP_NOFS);
2008     if (!name) {
2009 #if defined(HAVE_LINUX_INODE_OPERATIONS_FOLLOW_LINK_NO_NAMEIDATA)
2010         return ERR_PTR(-EIO);
2011 #else
2012         return -EIO;
2013 #endif
2014     }
2015
2016     AFS_GLOCK();
2017     code = afs_linux_ireadlink(dentry->d_inode, name, PATH_MAX - 1, AFS_UIOSYS);
2018     AFS_GUNLOCK();
2019
2020     if (code < 0) {
2021 #if defined(HAVE_LINUX_INODE_OPERATIONS_FOLLOW_LINK_NO_NAMEIDATA)
2022         return ERR_PTR(code);
2023 #else
2024         return code;
2025 #endif
2026     }
2027
2028     name[code] = '\0';
2029 #if defined(HAVE_LINUX_INODE_OPERATIONS_FOLLOW_LINK_NO_NAMEIDATA)
2030     return *link_data = name;
2031 #else
2032     nd_set_link(nd, name);
2033     return 0;
2034 #endif
2035 }
2036
/* afs_linux_put_link
 * Free a link name buffer. The pointer comes from link_data or from
 * nd_get_link(), depending on the kernel interface; NULL and error
 * pointers are ignored.
 */
#if defined(HAVE_LINUX_INODE_OPERATIONS_PUT_LINK_NO_NAMEIDATA)
static void
afs_linux_put_link(struct inode *inode, void *link_data)
{
    char *name = link_data;

    if (!name || IS_ERR(name))
        return;

    kfree(name);
}
#else
static void
afs_linux_put_link(struct dentry *dentry, struct nameidata *nd)
{
    char *name = nd_get_link(nd);

    if (!name || IS_ERR(name))
        return;

    kfree(name);
}
#endif /* HAVE_LINUX_INODE_OPERATIONS_PUT_LINK_NO_NAMEIDATA */
2056
2057 #endif /* USABLE_KERNEL_PAGE_SYMLINK_CACHE */
2058
2059 /* Populate a page by filling it from the cache file pointed at by cachefp
2060  * (which contains indicated chunk)
2061  * If task is NULL, the page copy occurs syncronously, and the routine
2062  * returns with page still locked. If task is non-NULL, then page copies
2063  * may occur in the background, and the page will be unlocked when it is
2064  * ready for use.
2065  */
2066 static int
2067 afs_linux_read_cache(struct file *cachefp, struct page *page,
2068                      int chunk, struct pagevec *lrupv,
2069                      struct afs_pagecopy_task *task) {
2070     loff_t offset = page_offset(page);
2071     struct inode *cacheinode = cachefp->f_dentry->d_inode;
2072     struct page *newpage, *cachepage;
2073     struct address_space *cachemapping;
2074     int pageindex;
2075     int code = 0;
2076
2077     cachemapping = cacheinode->i_mapping;
2078     newpage = NULL;
2079     cachepage = NULL;
2080
2081     /* If we're trying to read a page that's past the end of the disk
2082      * cache file, then just return a zeroed page */
2083     if (AFS_CHUNKOFFSET(offset) >= i_size_read(cacheinode)) {
2084         zero_user_segment(page, 0, PAGE_SIZE);
2085         SetPageUptodate(page);
2086         if (task)
2087             unlock_page(page);
2088         return 0;
2089     }
2090
2091     /* From our offset, we now need to work out which page in the disk
2092      * file it corresponds to. This will be fun ... */
2093     pageindex = (offset - AFS_CHUNKTOBASE(chunk)) >> PAGE_SHIFT;
2094
2095     while (cachepage == NULL) {
2096         cachepage = find_get_page(cachemapping, pageindex);
2097         if (!cachepage) {
2098             if (!newpage)
2099                 newpage = page_cache_alloc(cachemapping);
2100             if (!newpage) {
2101                 code = -ENOMEM;
2102                 goto out;
2103             }
2104
2105             code = add_to_page_cache(newpage, cachemapping,
2106                                      pageindex, GFP_KERNEL);
2107             if (code == 0) {
2108                 cachepage = newpage;
2109                 newpage = NULL;
2110
2111                 get_page(cachepage);
2112                 if (!pagevec_add(lrupv, cachepage))
2113                     __pagevec_lru_add_file(lrupv);
2114
2115             } else {
2116                 put_page(newpage);
2117                 newpage = NULL;
2118                 if (code != -EEXIST)
2119                     goto out;
2120             }
2121         } else {
2122             lock_page(cachepage);
2123         }
2124     }
2125
2126     if (!PageUptodate(cachepage)) {
2127         ClearPageError(cachepage);
2128         code = cachemapping->a_ops->readpage(NULL, cachepage);
2129         if (!code && !task) {
2130             wait_on_page_locked(cachepage);
2131         }
2132     } else {
2133         unlock_page(cachepage);
2134     }
2135
2136     if (!code) {
2137         if (PageUptodate(cachepage)) {
2138             copy_highpage(page, cachepage);
2139             flush_dcache_page(page);
2140             SetPageUptodate(page);
2141
2142             if (task)
2143                 unlock_page(page);
2144         } else if (task) {
2145             afs_pagecopy_queue_page(task, cachepage, page);
2146         } else {
2147             code = -EIO;
2148         }
2149     }
2150
2151     if (code && task) {
2152         unlock_page(page);
2153     }
2154
2155 out:
2156     if (cachepage)
2157         put_page(cachepage);
2158
2159     return code;
2160 }
2161
2162 static int inline
2163 afs_linux_readpage_fastpath(struct file *fp, struct page *pp, int *codep)
2164 {
2165     loff_t offset = page_offset(pp);
2166     struct inode *ip = FILE_INODE(fp);
2167     struct vcache *avc = VTOAFS(ip);
2168     struct dcache *tdc;
2169     struct file *cacheFp = NULL;
2170     int code;
2171     int dcLocked = 0;
2172     struct pagevec lrupv;
2173
2174     /* Not a UFS cache, don't do anything */
2175     if (cacheDiskType != AFS_FCACHE_TYPE_UFS)
2176         return 0;
2177
2178     /* No readpage (ex: tmpfs) , skip */
2179     if (cachefs_noreadpage)
2180         return 0;
2181
2182     /* Can't do anything if the vcache isn't statd , or if the read
2183      * crosses a chunk boundary.
2184      */
2185     if (!(avc->f.states & CStatd) ||
2186         AFS_CHUNK(offset) != AFS_CHUNK(offset + PAGE_SIZE)) {
2187         return 0;
2188     }
2189
2190     ObtainWriteLock(&avc->lock, 911);
2191
2192     /* XXX - See if hinting actually makes things faster !!! */
2193
2194     /* See if we have a suitable entry already cached */
2195     tdc = avc->dchint;
2196
2197     if (tdc) {
2198         /* We need to lock xdcache, then dcache, to handle situations where
2199          * the hint is on the free list. However, we can't safely do this
2200          * according to the locking hierarchy. So, use a non blocking lock.
2201          */
2202         ObtainReadLock(&afs_xdcache);
2203         dcLocked = ( 0 == NBObtainReadLock(&tdc->lock));
2204
2205         if (dcLocked && (tdc->index != NULLIDX)
2206             && !FidCmp(&tdc->f.fid, &avc->f.fid)
2207             && tdc->f.chunk == AFS_CHUNK(offset)
2208             && !(afs_indexFlags[tdc->index] & (IFFree | IFDiscarded))) {
2209             /* Bonus - the hint was correct */
2210             afs_RefDCache(tdc);
2211         } else {
2212             /* Only destroy the hint if its actually invalid, not if there's
2213              * just been a locking failure */
2214             if (dcLocked) {
2215                 ReleaseReadLock(&tdc->lock);
2216                 avc->dchint = NULL;
2217             }
2218
2219             tdc = NULL;
2220             dcLocked = 0;
2221         }
2222         ReleaseReadLock(&afs_xdcache);
2223     }
2224
2225     /* No hint, or hint is no longer valid - see if we can get something
2226      * directly from the dcache
2227      */
2228     if (!tdc)
2229         tdc = afs_FindDCache(avc, offset);
2230
2231     if (!tdc) {
2232         ReleaseWriteLock(&avc->lock);
2233         return 0;
2234     }
2235
2236     if (!dcLocked)
2237         ObtainReadLock(&tdc->lock);
2238
2239     /* Is the dcache we've been given currently up to date */
2240     if (!hsame(avc->f.m.DataVersion, tdc->f.versionNo) ||
2241         (tdc->dflags & DFFetching))
2242         goto out;
2243
2244     /* Update our hint for future abuse */
2245     avc->dchint = tdc;
2246
2247     /* Okay, so we've now got a cache file that is up to date */
2248
2249     /* XXX - I suspect we should be locking the inodes before we use them! */
2250     AFS_GUNLOCK();
2251     cacheFp = afs_linux_raw_open(&tdc->f.inode);
2252     if (!cacheFp->f_dentry->d_inode->i_mapping->a_ops->readpage) {
2253         cachefs_noreadpage = 1;
2254         AFS_GLOCK();
2255         goto out;
2256     }
2257 #if defined(PAGEVEC_INIT_COLD_ARG)
2258     pagevec_init(&lrupv, 0);
2259 #else
2260     pagevec_init(&lrupv);
2261 #endif
2262
2263     code = afs_linux_read_cache(cacheFp, pp, tdc->f.chunk, &lrupv, NULL);
2264
2265     if (pagevec_count(&lrupv))
2266        __pagevec_lru_add_file(&lrupv);
2267
2268     filp_close(cacheFp, NULL);
2269     AFS_GLOCK();
2270
2271     ReleaseReadLock(&tdc->lock);
2272     ReleaseWriteLock(&avc->lock);
2273     afs_PutDCache(tdc);
2274
2275     *codep = code;
2276     return 1;
2277
2278 out:
2279     ReleaseWriteLock(&avc->lock);
2280     ReleaseReadLock(&tdc->lock);
2281     afs_PutDCache(tdc);
2282     return 0;
2283 }
2284
2285 /* afs_linux_readpage
2286  *
2287  * This function is split into two, because prepare_write/begin_write
2288  * require a readpage call which doesn't unlock the resulting page upon
2289  * success.
2290  */
2291 static int
2292 afs_linux_fillpage(struct file *fp, struct page *pp)
2293 {
2294     afs_int32 code;
2295     char *address;
2296     struct uio *auio;
2297     struct iovec *iovecp;
2298     struct inode *ip = FILE_INODE(fp);
2299     afs_int32 cnt = page_count(pp);
2300     struct vcache *avc = VTOAFS(ip);
2301     afs_offs_t offset = page_offset(pp);
2302     cred_t *credp;
2303
2304     AFS_GLOCK();
2305     if (afs_linux_readpage_fastpath(fp, pp, &code)) {
2306         AFS_GUNLOCK();
2307         return code;
2308     }
2309     AFS_GUNLOCK();
2310
2311     credp = crref();
2312     address = kmap(pp);
2313     ClearPageError(pp);
2314
2315     auio = kmalloc(sizeof(struct uio), GFP_NOFS);
2316     iovecp = kmalloc(sizeof(struct iovec), GFP_NOFS);
2317
2318     setup_uio(auio, iovecp, (char *)address, offset, PAGE_SIZE, UIO_READ,
2319               AFS_UIOSYS);
2320
2321     AFS_GLOCK();
2322     AFS_DISCON_LOCK();
2323     afs_Trace4(afs_iclSetp, CM_TRACE_READPAGE, ICL_TYPE_POINTER, ip,
2324                ICL_TYPE_POINTER, pp, ICL_TYPE_INT32, cnt, ICL_TYPE_INT32,
2325                99999);  /* not a possible code value */
2326
2327     code = afs_rdwr(avc, auio, UIO_READ, 0, credp);
2328
2329     afs_Trace4(afs_iclSetp, CM_TRACE_READPAGE, ICL_TYPE_POINTER, ip,
2330                ICL_TYPE_POINTER, pp, ICL_TYPE_INT32, cnt, ICL_TYPE_INT32,
2331                code);
2332     AFS_DISCON_UNLOCK();
2333     AFS_GUNLOCK();
2334     if (!code) {
2335         /* XXX valid for no-cache also?  Check last bits of files... :)
2336          * Cognate code goes in afs_NoCacheFetchProc.  */
2337         if (auio->uio_resid)    /* zero remainder of page */
2338              memset((void *)(address + (PAGE_SIZE - auio->uio_resid)), 0,
2339                     auio->uio_resid);
2340
2341         flush_dcache_page(pp);
2342         SetPageUptodate(pp);
2343     } /* !code */
2344
2345     kunmap(pp);
2346
2347     kfree(auio);
2348     kfree(iovecp);
2349
2350     crfree(credp);
2351     return afs_convert_code(code);
2352 }
2353
2354 static int
2355 afs_linux_prefetch(struct file *fp, struct page *pp)
2356 {
2357     int code = 0;
2358     struct vcache *avc = VTOAFS(FILE_INODE(fp));
2359     afs_offs_t offset = page_offset(pp);
2360
2361     if (AFS_CHUNKOFFSET(offset) == 0) {
2362         struct dcache *tdc;
2363         struct vrequest *treq = NULL;
2364         cred_t *credp;
2365
2366         credp = crref();
2367         AFS_GLOCK();
2368         code = afs_CreateReq(&treq, credp);
2369         if (!code && !NBObtainWriteLock(&avc->lock, 534)) {
2370             tdc = afs_FindDCache(avc, offset);
2371             if (tdc) {
2372                 if (!(tdc->mflags & DFNextStarted))
2373                     afs_PrefetchChunk(avc, tdc, credp, treq);
2374                 afs_PutDCache(tdc);
2375             }
2376             ReleaseWriteLock(&avc->lock);
2377         }
2378         afs_DestroyReq(treq);
2379         AFS_GUNLOCK();
2380         crfree(credp);
2381     }
2382     return afs_convert_code(code);
2383
2384 }
2385
2386 static int
2387 afs_linux_bypass_readpages(struct file *fp, struct address_space *mapping,
2388                            struct list_head *page_list, unsigned num_pages)
2389 {
2390     afs_int32 page_ix;
2391     struct uio *auio;
2392     afs_offs_t offset;
2393     struct iovec* iovecp;
2394     struct nocache_read_request *ancr;
2395     struct page *pp;
2396     struct pagevec lrupv;
2397     afs_int32 code = 0;
2398
2399     cred_t *credp;
2400     struct inode *ip = FILE_INODE(fp);
2401     struct vcache *avc = VTOAFS(ip);
2402     afs_int32 base_index = 0;
2403     afs_int32 page_count = 0;
2404     afs_int32 isize;
2405
2406     /* background thread must free: iovecp, auio, ancr */
2407     iovecp = osi_Alloc(num_pages * sizeof(struct iovec));
2408
2409     auio = osi_Alloc(sizeof(struct uio));
2410     auio->uio_iov = iovecp;
2411     auio->uio_iovcnt = num_pages;
2412     auio->uio_flag = UIO_READ;
2413     auio->uio_seg = AFS_UIOSYS;
2414     auio->uio_resid = num_pages * PAGE_SIZE;
2415
2416     ancr = osi_Alloc(sizeof(struct nocache_read_request));
2417     ancr->auio = auio;
2418     ancr->offset = auio->uio_offset;
2419     ancr->length = auio->uio_resid;
2420
2421 #if defined(PAGEVEC_INIT_COLD_ARG)
2422     pagevec_init(&lrupv, 0);
2423 #else
2424     pagevec_init(&lrupv);
2425 #endif
2426
2427     for(page_ix = 0; page_ix < num_pages; ++page_ix) {
2428
2429         if(list_empty(page_list))
2430             break;
2431
2432         pp = list_entry(page_list->prev, struct page, lru);
2433         /* If we allocate a page and don't remove it from page_list,
2434          * the page cache gets upset. */
2435         list_del(&pp->lru);
2436         isize = (i_size_read(fp->f_mapping->host) - 1) >> PAGE_SHIFT;
2437         if(pp->index > isize) {
2438             if(PageLocked(pp))
2439                 unlock_page(pp);
2440             continue;
2441         }
2442
2443         if(page_ix == 0) {
2444             offset = page_offset(pp);
2445             ancr->offset = auio->uio_offset = offset;
2446             base_index = pp->index;
2447         }
2448         iovecp[page_ix].iov_len = PAGE_SIZE;
2449         code = add_to_page_cache(pp, mapping, pp->index, GFP_KERNEL);
2450         if(base_index != pp->index) {
2451             if(PageLocked(pp))
2452                  unlock_page(pp);
2453             put_page(pp);
2454             iovecp[page_ix].iov_base = (void *) 0;
2455             base_index++;
2456             ancr->length -= PAGE_SIZE;
2457             continue;
2458         }
2459         base_index++;
2460         if(code) {
2461             if(PageLocked(pp))
2462                 unlock_page(pp);
2463             put_page(pp);
2464             iovecp[page_ix].iov_base = (void *) 0;
2465         } else {
2466             page_count++;
2467             if(!PageLocked(pp)) {
2468                 lock_page(pp);
2469             }
2470
2471             /* increment page refcount--our original design assumed
2472              * that locking it would effectively pin it;  protect
2473              * ourselves from the possiblity that this assumption is
2474              * is faulty, at low cost (provided we do not fail to
2475              * do the corresponding decref on the other side) */
2476             get_page(pp);
2477
2478             /* save the page for background map */
2479             iovecp[page_ix].iov_base = (void*) pp;
2480
2481             /* and put it on the LRU cache */
2482             if (!pagevec_add(&lrupv, pp))
2483                 __pagevec_lru_add_file(&lrupv);
2484         }
2485     }
2486
2487     /* If there were useful pages in the page list, make sure all pages
2488      * are in the LRU cache, then schedule the read */
2489     if(page_count) {
2490         if (pagevec_count(&lrupv))
2491             __pagevec_lru_add_file(&lrupv);
2492         credp = crref();
2493         code = afs_ReadNoCache(avc, ancr, credp);
2494         crfree(credp);
2495     } else {
2496         /* If there is nothing for the background thread to handle,
2497          * it won't be freeing the things that we never gave it */
2498         osi_Free(iovecp, num_pages * sizeof(struct iovec));
2499         osi_Free(auio, sizeof(struct uio));
2500         osi_Free(ancr, sizeof(struct nocache_read_request));
2501     }
2502     /* we do not flush, release, or unmap pages--that will be
2503      * done for us by the background thread as each page comes in
2504      * from the fileserver */
2505     return afs_convert_code(code);
2506 }
2507
2508
2509 static int
2510 afs_linux_bypass_readpage(struct file *fp, struct page *pp)
2511 {
2512     cred_t *credp = NULL;
2513     struct uio *auio;
2514     struct iovec *iovecp;
2515     struct nocache_read_request *ancr;
2516     int code;
2517
2518     /*
2519      * Special case: if page is at or past end of file, just zero it and set
2520      * it as up to date.
2521      */
2522     if (page_offset(pp) >=  i_size_read(fp->f_mapping->host)) {
2523         zero_user_segment(pp, 0, PAGE_SIZE);
2524         SetPageUptodate(pp);
2525         unlock_page(pp);
2526         return 0;
2527     }
2528
2529     ClearPageError(pp);
2530
2531     /* receiver frees */
2532     auio = osi_Alloc(sizeof(struct uio));
2533     iovecp = osi_Alloc(sizeof(struct iovec));
2534
2535     /* address can be NULL, because we overwrite it with 'pp', below */
2536     setup_uio(auio, iovecp, NULL, page_offset(pp),
2537               PAGE_SIZE, UIO_READ, AFS_UIOSYS);
2538
2539     /* save the page for background map */
2540     get_page(pp); /* see above */
2541     auio->uio_iov->iov_base = (void*) pp;
2542     /* the background thread will free this */
2543     ancr = osi_Alloc(sizeof(struct nocache_read_request));
2544     ancr->auio = auio;
2545     ancr->offset = page_offset(pp);
2546     ancr->length = PAGE_SIZE;
2547
2548     credp = crref();
2549     code = afs_ReadNoCache(VTOAFS(FILE_INODE(fp)), ancr, credp);
2550     crfree(credp);
2551
2552     return afs_convert_code(code);
2553 }
2554
2555 static inline int
2556 afs_linux_can_bypass(struct inode *ip) {
2557
2558     switch(cache_bypass_strategy) {
2559         case NEVER_BYPASS_CACHE:
2560             return 0;
2561         case ALWAYS_BYPASS_CACHE:
2562             return 1;
2563         case LARGE_FILES_BYPASS_CACHE:
2564             if (i_size_read(ip) > cache_bypass_threshold)
2565                 return 1;
2566         default:
2567             return 0;
2568      }
2569 }
2570
2571 /* Check if a file is permitted to bypass the cache by policy, and modify
2572  * the cache bypass state recorded for that file */
2573
2574 static inline int
2575 afs_linux_bypass_check(struct inode *ip) {
2576     cred_t* credp;
2577
2578     int bypass = afs_linux_can_bypass(ip);
2579
2580     credp = crref();
2581     trydo_cache_transition(VTOAFS(ip), credp, bypass);
2582     crfree(credp);
2583
2584     return bypass;
2585 }
2586
2587
/* afs_linux_readpage
 * Read one page of fp into pp. Files that bypass the cache are handed to
 * afs_linux_bypass_readpage(). Otherwise the page is filled with
 * afs_linux_fillpage(), a prefetch is attempted if that succeeded, and the
 * page is unlocked here whether or not the fill succeeded.
 */
static int
afs_linux_readpage(struct file *fp, struct page *pp)
{
    int code;

    if (afs_linux_bypass_check(FILE_INODE(fp)))
        return afs_linux_bypass_readpage(fp, pp);

    code = afs_linux_fillpage(fp, pp);
    if (code == 0)
        code = afs_linux_prefetch(fp, pp);
    unlock_page(pp);

    return code;
}
2604
2605 /* Readpages reads a number of pages for a particular file. We use
2606  * this to optimise the reading, by limiting the number of times upon which
2607  * we have to lookup, lock and open vcaches and dcaches
2608  */
2609
2610 static int
2611 afs_linux_readpages(struct file *fp, struct address_space *mapping,
2612                     struct list_head *page_list, unsigned int num_pages)
2613 {
2614     struct inode *inode = mapping->host;
2615     struct vcache *avc = VTOAFS(inode);
2616     struct dcache *tdc;
2617     struct file *cacheFp = NULL;
2618     int code;
2619     unsigned int page_idx;
2620     loff_t offset;
2621     struct pagevec lrupv;
2622     struct afs_pagecopy_task *task;
2623
2624     if (afs_linux_bypass_check(inode))
2625         return afs_linux_bypass_readpages(fp, mapping, page_list, num_pages);
2626
2627     if (cacheDiskType == AFS_FCACHE_TYPE_MEM)
2628         return 0;
2629
2630     /* No readpage (ex: tmpfs) , skip */
2631     if (cachefs_noreadpage)
2632         return 0;
2633
2634     AFS_GLOCK();
2635     if ((code = afs_linux_VerifyVCache(avc, NULL))) {
2636         AFS_GUNLOCK();
2637         return code;
2638     }
2639
2640     ObtainWriteLock(&avc->lock, 912);
2641     AFS_GUNLOCK();
2642
2643     task = afs_pagecopy_init_task();
2644
2645     tdc = NULL;
2646 #if defined(PAGEVEC_INIT_COLD_ARG)
2647     pagevec_init(&lrupv, 0);
2648 #else
2649     pagevec_init(&lrupv);
2650 #endif
2651     for (page_idx = 0; page_idx < num_pages; page_idx++) {
2652         struct page *page = list_entry(page_list->prev, struct page, lru);
2653         list_del(&page->lru);
2654         offset = page_offset(page);
2655
2656         if (tdc && tdc->f.chunk != AFS_CHUNK(offset)) {
2657             AFS_GLOCK();
2658             ReleaseReadLock(&tdc->lock);
2659             afs_PutDCache(tdc);
2660             AFS_GUNLOCK();
2661             tdc = NULL;
2662             if (cacheFp)
2663                 filp_close(cacheFp, NULL);
2664         }
2665
2666         if (!tdc) {
2667             AFS_GLOCK();
2668             if ((tdc = afs_FindDCache(avc, offset))) {
2669                 ObtainReadLock(&tdc->lock);
2670                 if (!hsame(avc->f.m.DataVersion, tdc->f.versionNo) ||
2671                     (tdc->dflags & DFFetching)) {
2672                     ReleaseReadLock(&tdc->lock);
2673                     afs_PutDCache(tdc);
2674                     tdc = NULL;
2675                 }
2676             }
2677             AFS_GUNLOCK();
2678             if (tdc) {
2679                 cacheFp = afs_linux_raw_open(&tdc->f.inode);
2680                 if (!cacheFp->f_dentry->d_inode->i_mapping->a_ops->readpage) {
2681                     cachefs_noreadpage = 1;
2682                     goto out;
2683                 }
2684             }
2685         }
2686
2687         if (tdc && !add_to_page_cache(page, mapping, page->index,
2688                                       GFP_KERNEL)) {
2689             get_page(page);
2690             if (!pagevec_add(&lrupv, page))
2691                 __pagevec_lru_add_file(&lrupv);
2692
2693             afs_linux_read_cache(cacheFp, page, tdc->f.chunk, &lrupv, task);
2694         }
2695         put_page(page);
2696     }
2697     if (pagevec_count(&lrupv))
2698        __pagevec_lru_add_file(&lrupv);
2699
2700 out:
2701     if (tdc)
2702         filp_close(cacheFp, NULL);
2703
2704     afs_pagecopy_put_task(task);
2705
2706     AFS_GLOCK();
2707     if (tdc) {
2708         ReleaseReadLock(&tdc->lock);
2709         afs_PutDCache(tdc);
2710     }
2711
2712     ReleaseWriteLock(&avc->lock);
2713     AFS_GUNLOCK();
2714     return 0;
2715 }
2716
2717 /* Prepare an AFS vcache for writeback. Should be called with the vcache
2718  * locked */
2719 static inline int
2720 afs_linux_prepare_writeback(struct vcache *avc) {
2721     pid_t pid;
2722     struct pagewriter *pw;
2723
2724     pid = MyPidxx2Pid(MyPidxx);
2725     /* Prevent recursion into the writeback code */
2726     spin_lock(&avc->pagewriter_lock);
2727     list_for_each_entry(pw, &avc->pagewriters, link) {
2728         if (pw->writer == pid) {
2729             spin_unlock(&avc->pagewriter_lock);
2730             return AOP_WRITEPAGE_ACTIVATE;
2731         }
2732     }
2733     spin_unlock(&avc->pagewriter_lock);
2734
2735     /* Add ourselves to writer list */
2736     pw = osi_Alloc(sizeof(struct pagewriter));
2737     pw->writer = pid;
2738     spin_lock(&avc->pagewriter_lock);
2739     list_add_tail(&pw->link, &avc->pagewriters);
2740     spin_unlock(&avc->pagewriter_lock);
2741
2742     return 0;
2743 }
2744
2745 static inline int
2746 afs_linux_dopartialwrite(struct vcache *avc, cred_t *credp) {
2747     struct vrequest *treq = NULL;
2748     int code = 0;
2749
2750     if (!afs_CreateReq(&treq, credp)) {
2751         code = afs_DoPartialWrite(avc, treq);
2752         afs_DestroyReq(treq);
2753     }
2754
2755     return afs_convert_code(code);
2756 }
2757
2758 static inline void
2759 afs_linux_complete_writeback(struct vcache *avc) {
2760     struct pagewriter *pw, *store;
2761     pid_t pid;
2762     struct list_head tofree;
2763
2764     INIT_LIST_HEAD(&tofree);
2765     pid = MyPidxx2Pid(MyPidxx);
2766     /* Remove ourselves from writer list */
2767     spin_lock(&avc->pagewriter_lock);
2768     list_for_each_entry_safe(pw, store, &avc->pagewriters, link) {
2769         if (pw->writer == pid) {
2770             list_del(&pw->link);
2771             /* osi_Free may sleep so we need to defer it */
2772             list_add_tail(&pw->link, &tofree);
2773         }
2774     }
2775     spin_unlock(&avc->pagewriter_lock);
2776     list_for_each_entry_safe(pw, store, &tofree, link) {
2777         list_del(&pw->link);
2778         osi_Free(pw, sizeof(struct pagewriter));
2779     }
2780 }
2781
2782 /* Writeback a given page syncronously. Called with no AFS locks held */
2783 static int
2784 afs_linux_page_writeback(struct inode *ip, struct page *pp,
2785                          unsigned long offset, unsigned int count,
2786                          cred_t *credp)
2787 {
2788     struct vcache *vcp = VTOAFS(ip);
2789     char *buffer;
2790     afs_offs_t base;
2791     int code = 0;
2792     struct uio tuio;
2793     struct iovec iovec;
2794     int f_flags = 0;
2795
2796     memset(&tuio, 0, sizeof(tuio));
2797     memset(&iovec, 0, sizeof(iovec));
2798
2799     buffer = kmap(pp) + offset;
2800     base = page_offset(pp) + offset;
2801
2802     AFS_GLOCK();
2803     afs_Trace4(afs_iclSetp, CM_TRACE_UPDATEPAGE, ICL_TYPE_POINTER, vcp,
2804                ICL_TYPE_POINTER, pp, ICL_TYPE_INT32, page_count(pp),
2805                ICL_TYPE_INT32, 99999);
2806
2807     setup_uio(&tuio, &iovec, buffer, base, count, UIO_WRITE, AFS_UIOSYS);
2808
2809     code = afs_write(vcp, &tuio, f_flags, credp, 0);
2810
2811     i_size_write(ip, vcp->f.m.Length);
2812     ip->i_blocks = ((vcp->f.m.Length + 1023) >> 10) << 1;
2813
2814     code = code ? afs_convert_code(code) : count - tuio.uio_resid;
2815
2816     afs_Trace4(afs_iclSetp, CM_TRACE_UPDATEPAGE, ICL_TYPE_POINTER, vcp,
2817                ICL_TYPE_POINTER, pp, ICL_TYPE_INT32, page_count(pp),
2818                ICL_TYPE_INT32, code);
2819
2820     AFS_GUNLOCK();
2821     kunmap(pp);
2822
2823     return code;
2824 }
2825
2826 static int
2827 afs_linux_writepage_sync(struct inode *ip, struct page *pp,
2828                          unsigned long offset, unsigned int count)
2829 {
2830     int code;
2831     int code1 = 0;
2832     struct vcache *vcp = VTOAFS(ip);
2833     cred_t *credp;
2834
2835     /* Catch recursive writeback. This occurs if the kernel decides
2836      * writeback is required whilst we are writing to the cache, or
2837      * flushing to the server. When we're running syncronously (as
2838      * opposed to from writepage) we can't actually do anything about
2839      * this case - as we can't return AOP_WRITEPAGE_ACTIVATE to write()
2840      */
2841     AFS_GLOCK();
2842     ObtainWriteLock(&vcp->lock, 532);
2843     afs_linux_prepare_writeback(vcp);
2844     ReleaseWriteLock(&vcp->lock);
2845     AFS_GUNLOCK();
2846
2847     credp = crref();
2848     code = afs_linux_page_writeback(ip, pp, offset, count, credp);
2849
2850     AFS_GLOCK();
2851     ObtainWriteLock(&vcp->lock, 533);
2852     if (code > 0)
2853         code1 = afs_linux_dopartialwrite(vcp, credp);
2854     afs_linux_complete_writeback(vcp);
2855     ReleaseWriteLock(&vcp->lock);
2856     AFS_GUNLOCK();
2857     crfree(credp);
2858
2859     if (code1)
2860         return code1;
2861
2862     return code;
2863 }
2864
2865 static int
2866 #ifdef AOP_WRITEPAGE_TAKES_WRITEBACK_CONTROL
2867 afs_linux_writepage(struct page *pp, struct writeback_control *wbc)
2868 #else
2869 afs_linux_writepage(struct page *pp)
2870 #endif
2871 {
2872     struct address_space *mapping = pp->mapping;
2873     struct inode *inode;
2874     struct vcache *vcp;
2875     cred_t *credp;
2876     unsigned int to = PAGE_SIZE;
2877     loff_t isize;
2878     int code = 0;
2879     int code1 = 0;
2880
2881     get_page(pp);
2882
2883     inode = mapping->host;
2884     vcp = VTOAFS(inode);
2885     isize = i_size_read(inode);
2886
2887     /* Don't defeat an earlier truncate */
2888     if (page_offset(pp) > isize) {
2889         set_page_writeback(pp);
2890         unlock_page(pp);
2891         goto done;
2892     }
2893
2894     AFS_GLOCK();
2895     ObtainWriteLock(&vcp->lock, 537);
2896     code = afs_linux_prepare_writeback(vcp);
2897     if (code == AOP_WRITEPAGE_ACTIVATE) {
2898         /* WRITEPAGE_ACTIVATE is the only return value that permits us
2899          * to return with the page still locked */
2900         ReleaseWriteLock(&vcp->lock);
2901         AFS_GUNLOCK();
2902         return code;
2903     }
2904
2905     /* Grab the creds structure currently held in the vnode, and
2906      * get a reference to it, in case it goes away ... */
2907     credp = vcp->cred;
2908     if (credp)
2909         crhold(credp);
2910     else
2911         credp = crref();
2912     ReleaseWriteLock(&vcp->lock);
2913     AFS_GUNLOCK();
2914
2915     set_page_writeback(pp);
2916
2917     SetPageUptodate(pp);
2918
2919     /* We can unlock the page here, because it's protected by the
2920      * page_writeback flag. This should make us less vulnerable to
2921      * deadlocking in afs_write and afs_DoPartialWrite
2922      */
2923     unlock_page(pp);
2924
2925     /* If this is the final page, then just write the number of bytes that
2926      * are actually in it */
2927     if ((isize - page_offset(pp)) < to )
2928         to = isize - page_offset(pp);
2929
2930     code = afs_linux_page_writeback(inode, pp, 0, to, credp);
2931
2932     AFS_GLOCK();
2933     ObtainWriteLock(&vcp->lock, 538);
2934
2935     /* As much as we might like to ignore a file server error here,
2936      * and just try again when we close(), unfortunately StoreAllSegments
2937      * will invalidate our chunks if the server returns a permanent error,
2938      * so we need to at least try and get that error back to the user
2939      */
2940     if (code == to)
2941         code1 = afs_linux_dopartialwrite(vcp, credp);
2942
2943     afs_linux_complete_writeback(vcp);
2944     ReleaseWriteLock(&vcp->lock);
2945     crfree(credp);
2946     AFS_GUNLOCK();
2947
2948 done:
2949     end_page_writeback(pp);
2950     put_page(pp);
2951
2952     if (code1)
2953         return code1;
2954
2955     if (code == to)
2956         return 0;
2957
2958     return code;
2959 }
2960
2961 /* afs_linux_permission
2962  * Check access rights - returns error if can't check or permission denied.
2963  */
2964 static int
2965 #if defined(IOP_PERMISSION_TAKES_FLAGS)
2966 afs_linux_permission(struct inode *ip, int mode, unsigned int flags)
2967 #elif defined(IOP_PERMISSION_TAKES_NAMEIDATA)
2968 afs_linux_permission(struct inode *ip, int mode, struct nameidata *nd)
2969 #else
2970 afs_linux_permission(struct inode *ip, int mode)
2971 #endif
2972 {
2973     int code;
2974     cred_t *credp;
2975     int tmp = 0;
2976
2977     /* Check for RCU path walking */
2978 #if defined(IOP_PERMISSION_TAKES_FLAGS)
2979     if (flags & IPERM_FLAG_RCU)
2980        return -ECHILD;
2981 #elif defined(MAY_NOT_BLOCK)
2982     if (mode & MAY_NOT_BLOCK)
2983        return -ECHILD;
2984 #endif
2985
2986     credp = crref();
2987     AFS_GLOCK();
2988     if (mode & MAY_EXEC)
2989         tmp |= VEXEC;
2990     if (mode & MAY_READ)
2991         tmp |= VREAD;
2992     if (mode & MAY_WRITE)
2993         tmp |= VWRITE;
2994     code = afs_access(VTOAFS(ip), tmp, credp);
2995
2996     AFS_GUNLOCK();
2997     crfree(credp);
2998     return afs_convert_code(code);
2999 }
3000
3001 static int
3002 afs_linux_commit_write(struct file *file, struct page *page, unsigned offset,
3003                        unsigned to)
3004 {
3005     int code;
3006     struct inode *inode = FILE_INODE(file);
3007     loff_t pagebase = page_offset(page);
3008
3009     if (i_size_read(inode) < (pagebase + offset))
3010         i_size_write(inode, pagebase + offset);
3011
3012     if (PageChecked(page)) {
3013         SetPageUptodate(page);
3014         ClearPageChecked(page);
3015     }
3016
3017     code = afs_linux_writepage_sync(inode, page, offset, to - offset);
3018
3019     return code;
3020 }
3021
3022 static int
3023 afs_linux_prepare_write(struct file *file, struct page *page, unsigned from,
3024                         unsigned to)
3025 {
3026
3027     /* http://kerneltrap.org/node/4941 details the expected behaviour of
3028      * prepare_write. Essentially, if the page exists within the file,
3029      * and is not being fully written, then we should populate it.
3030      */
3031
3032     if (!PageUptodate(page)) {
3033         loff_t pagebase = page_offset(page);
3034         loff_t isize = i_size_read(page->mapping->host);
3035
3036         /* Is the location we are writing to beyond the end of the file? */
3037         if (pagebase >= isize ||
3038             ((from == 0) && (pagebase + to) >= isize)) {
3039             zero_user_segments(page, 0, from, to, PAGE_SIZE);
3040             SetPageChecked(page);
3041         /* Are we we writing a full page */
3042         } else if (from == 0 && to == PAGE_SIZE) {
3043             SetPageChecked(page);
3044         /* Is the page readable, if it's wronly, we don't care, because we're
3045          * not actually going to read from it ... */
3046         } else if ((file->f_flags && O_ACCMODE) != O_WRONLY) {
3047             /* We don't care if fillpage fails, because if it does the page
3048              * won't be marked as up to date
3049              */
3050             afs_linux_fillpage(file, page);
3051         }
3052     }
3053     return 0;
3054 }
3055
3056 #if defined(STRUCT_ADDRESS_SPACE_OPERATIONS_HAS_WRITE_BEGIN)
3057 static int
3058 afs_linux_write_end(struct file *file, struct address_space *mapping,
3059                                 loff_t pos, unsigned len, unsigned copied,
3060                                 struct page *page, void *fsdata)
3061 {
3062     int code;
3063     unsigned int from = pos & (PAGE_SIZE - 1);
3064
3065     code = afs_linux_commit_write(file, page, from, from + copied);
3066
3067     unlock_page(page);
3068     put_page(page);
3069     return code;
3070 }
3071
3072 static int
3073 afs_linux_write_begin(struct file *file, struct address_space *mapping,
3074                                 loff_t pos, unsigned len, unsigned flags,
3075                                 struct page **pagep, void **fsdata)
3076 {
3077     struct page *page;
3078     pgoff_t index = pos >> PAGE_SHIFT;
3079     unsigned int from = pos & (PAGE_SIZE - 1);
3080     int code;
3081
3082     page = grab_cache_page_write_begin(mapping, index, flags);
3083     *pagep = page;
3084
3085     code = afs_linux_prepare_write(file, page, from, from + len);
3086     if (code) {
3087         unlock_page(page);
3088         put_page(page);
3089     }
3090
3091     return code;
3092 }
3093 #endif
3094
3095 #ifndef STRUCT_DENTRY_OPERATIONS_HAS_D_AUTOMOUNT
3096 static void *
3097 afs_linux_dir_follow_link(struct dentry *dentry, struct nameidata *nd)
3098 {
3099     struct dentry **dpp;
3100     struct dentry *target;
3101
3102     if (current->total_link_count > 0) {
3103         /* avoid symlink resolution limits when resolving; we cannot contribute to
3104          * an infinite symlink loop */
3105         /* only do this for follow_link when total_link_count is positive to be
3106          * on the safe side; there is at least one code path in the Linux
3107          * kernel where it seems like it may be possible to get here without
3108          * total_link_count getting incremented. it is not clear on how that
3109          * path is actually reached, but guard against it just to be safe */
3110         current->total_link_count--;
3111     }
3112
3113     target = canonical_dentry(dentry->d_inode);
3114
3115 # ifdef STRUCT_NAMEIDATA_HAS_PATH
3116     dpp = &nd->path.dentry;
3117 # else
3118     dpp = &nd->dentry;
3119 # endif
3120
3121     dput(*dpp);
3122
3123     if (target) {
3124         *dpp = target;
3125     } else {
3126         *dpp = dget(dentry);
3127     }
3128
3129     nd->last_type = LAST_BIND;
3130
3131     return NULL;
3132 }
3133 #endif /* !STRUCT_DENTRY_OPERATIONS_HAS_D_AUTOMOUNT */
3134
3135
3136 static struct inode_operations afs_file_iops = {
3137   .permission =         afs_linux_permission,
3138   .getattr =            afs_linux_getattr,
3139   .setattr =            afs_notify_change,
3140 };
3141
3142 static struct address_space_operations afs_file_aops = {
3143   .readpage =           afs_linux_readpage,
3144   .readpages =          afs_linux_readpages,
3145   .writepage =          afs_linux_writepage,
3146 #if defined (STRUCT_ADDRESS_SPACE_OPERATIONS_HAS_WRITE_BEGIN)
3147   .write_begin =        afs_linux_write_begin,
3148   .write_end =          afs_linux_write_end,
3149 #else
3150   .commit_write =       afs_linux_commit_write,
3151   .prepare_write =      afs_linux_prepare_write,
3152 #endif
3153 };
3154
3155
3156 /* Separate ops vector for directories. Linux 2.2 tests type of inode
3157  * by what sort of operation is allowed.....
3158  */
3159
3160 static struct inode_operations afs_dir_iops = {
3161   .setattr =            afs_notify_change,
3162   .create =             afs_linux_create,
3163   .lookup =             afs_linux_lookup,
3164   .link =               afs_linux_link,
3165   .unlink =             afs_linux_unlink,
3166   .symlink =            afs_linux_symlink,
3167   .mkdir =              afs_linux_mkdir,
3168   .rmdir =              afs_linux_rmdir,
3169   .rename =             afs_linux_rename,
3170   .getattr =            afs_linux_getattr,
3171   .permission =         afs_linux_permission,
3172 #ifndef STRUCT_DENTRY_OPERATIONS_HAS_D_AUTOMOUNT
3173   .follow_link =        afs_linux_dir_follow_link,
3174 #endif
3175 };
3176
3177 /* We really need a separate symlink set of ops, since do_follow_link()
3178  * determines if it _is_ a link by checking if the follow_link op is set.
3179  */
3180 #if defined(USABLE_KERNEL_PAGE_SYMLINK_CACHE)
3181 static int
3182 afs_symlink_filler(struct file *file, struct page *page)
3183 {
3184     struct inode *ip = (struct inode *)page->mapping->host;
3185     char *p = (char *)kmap(page);
3186     int code;
3187
3188     AFS_GLOCK();
3189     code = afs_linux_ireadlink(ip, p, PAGE_SIZE, AFS_UIOSYS);
3190     AFS_GUNLOCK();
3191
3192     if (code < 0)
3193         goto fail;
3194     p[code] = '\0';             /* null terminate? */
3195
3196     SetPageUptodate(page);
3197     kunmap(page);
3198     unlock_page(page);
3199     return 0;
3200
3201   fail:
3202     SetPageError(page);
3203     kunmap(page);
3204     unlock_page(page);
3205     return code;
3206 }
3207
3208 static struct address_space_operations afs_symlink_aops = {
3209   .readpage =   afs_symlink_filler
3210 };
3211 #endif  /* USABLE_KERNEL_PAGE_SYMLINK_CACHE */
3212
3213 static struct inode_operations afs_symlink_iops = {
3214 #if defined(USABLE_KERNEL_PAGE_SYMLINK_CACHE)
3215   .readlink =           page_readlink,
3216 # if defined(HAVE_LINUX_PAGE_GET_LINK)
3217   .get_link =           page_get_link,
3218 # elif defined(HAVE_LINUX_PAGE_FOLLOW_LINK)
3219   .follow_link =        page_follow_link,
3220 # else
3221   .follow_link =        page_follow_link_light,
3222   .put_link =           page_put_link,
3223 # endif
3224 #else /* !defined(USABLE_KERNEL_PAGE_SYMLINK_CACHE) */
3225   .readlink =           afs_linux_readlink,
3226   .follow_link =        afs_linux_follow_link,
3227   .put_link =           afs_linux_put_link,
3228 #endif /* USABLE_KERNEL_PAGE_SYMLINK_CACHE */
3229   .setattr =            afs_notify_change,
3230 };
3231
3232 void
3233 afs_fill_inode(struct inode *ip, struct vattr *vattr)
3234 {
3235     if (vattr)
3236         vattr2inode(ip, vattr);
3237
3238 #ifdef STRUCT_ADDRESS_SPACE_HAS_BACKING_DEV_INFO
3239     ip->i_mapping->backing_dev_info = afs_backing_dev_info;
3240 #endif
3241 /* Reset ops if symlink or directory. */
3242     if (S_ISREG(ip->i_mode)) {
3243         ip->i_op = &afs_file_iops;
3244         ip->i_fop = &afs_file_fops;
3245         ip->i_data.a_ops = &afs_file_aops;
3246
3247     } else if (S_ISDIR(ip->i_mode)) {
3248         ip->i_op = &afs_dir_iops;
3249         ip->i_fop = &afs_dir_fops;
3250
3251     } else if (S_ISLNK(ip->i_mode)) {
3252         ip->i_op = &afs_symlink_iops;
3253 #if defined(HAVE_LINUX_INODE_NOHIGHMEM)
3254         inode_nohighmem(ip);
3255 #endif
3256 #if defined(USABLE_KERNEL_PAGE_SYMLINK_CACHE)
3257         ip->i_data.a_ops = &afs_symlink_aops;
3258         ip->i_mapping = &ip->i_data;
3259 #endif
3260     }
3261
3262 }