2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 /* This is a placeholder for routines unique to the port of AFS to hp-ux*/
12 #include "../afs/param.h"
13 #include "../afs/sysincludes.h" /* Standard vendor system headers */
14 #include "../afs/afsincludes.h" /* Afs-based standard headers */
15 #include "../afs/afs_stats.h" /* statistics stuff */
19 #include <sys/mount.h>
20 #include <sys/vnode.h>
21 #include <sys/pathname.h>
/*
 * Forward declarations for the AFS vnode/VFS entry points referenced by
 * the operation tables later in this file.  These are old-style (K&R)
 * declarations with no prototypes, matching the rest of this port.
 */
23 extern struct vfsops Afs_vfsops;
24 extern int afs_hp_strategy();
25 extern int afs_bmap(), afs_badop(), afs_noop(), afs_lockf();
26 extern int afs_pagein();
27 extern int afs_pageout();
28 extern int afs_ioctl();
29 extern int afs_prealloc();
30 extern int afs_mapdbd();
31 extern int afs_mmap();
32 extern int afs_cachelimit();
33 extern int afs_vm_checkpage();
34 extern int afs_vm_fscontiguous();
35 extern int afs_vm_stopio();
36 extern int afs_read_ahead();
37 extern int afs_unmap();
38 extern int afs_release();
39 extern int afs_swapfs_len();
40 extern int afs_readdir2();
41 extern int afs_readdir();
42 extern int afs_readdir3();
43 extern int afs_pathconf();
44 extern int afs_close();
/* Filesystem block size for a vnode, taken from its mounted vfs. */
46 #define vtoblksz(vp) ((vp)->v_vfsp->vfs_bsize)
50 * Copy an mbuf to the contiguous area pointed to by cp.
51 * Skip <off> bytes and copy <len> bytes.
52 * Returns the number of bytes not transferred.
53 * The mbuf is NOT changed.
56 m_cpytoc(m, off, len, cp)
57 register struct mbuf *m;
58 register int off, len;
/* Bad arguments are a kernel programming error: panic rather than return. */
63 if (m == NULL || off < 0 || len < 0 || cp == NULL)
64 osi_Panic("m_cpytoc");
/* First skip any leading mbufs that lie entirely within the <off> prefix. */
66 if (m->m_len <= off) {
/* Copy the tail of the first mbuf that overlaps the requested range... */
75 ml = MIN(len, m->m_len - off);
76 bcopy(mtod(m, caddr_t)+off, cp, (u_int)ml);
/* ...then copy from the start of each subsequent mbuf in the chain. */
83 bcopy(mtod(m, caddr_t), cp, (u_int)ml);
93 * Note that the standard Sun vnode interface doesn't have a vop_lockf(), so this code is
94 * totally new. This came about because HP-UX has lockf() implemented as
95 * a system call while Sun has it implemented as a library (apparently).
96 * To handle this, we have to translate the lockf() request into an
97 * fcntl() looking request, and then translate the results back if necessary.
98 * we call afs_lockctl() directly .
100 afs_lockf( vp, flag, len, cred, fp, LB, UB )
103 struct AFS_UCRED *cred;
107 /*for now, just pretend it works*/
108 struct k_flock flock;
112 * Create a flock structure and translate the lockf request
113 * into an appropriate looking fcntl() type request for afs_lockctl()
/* Lock region starts at the file pointer's current offset (lockf semantics). */
117 flock.l_start = fp->f_offset;
118 /* convert negative lengths to positive */
119 if (flock.l_len < 0 ) {
120 flock.l_start += flock.l_len;
121 flock.l_len = -(flock.l_len);
124 * Adjust values to look like fcntl() requests.
125 * All locks are write locks, only F_LOCK requests
126 * are blocking. F_TEST has to be translated into
127 * a get lock and then back again.
129 flock.l_type = F_WRLCK;
133 flock.l_type = F_UNLCK;
/* Hand the translated request to the MP-safe lockctl wrapper. */
142 u.u_error = mp_afs_lockctl(vp, &flock, cmd, fp->f_cred);
144 return(u.u_error); /* some other error code */
147 * if request is F_TEST, and GETLK changed
148 * the lock type to ULOCK, then return 0, else
149 * set errno to EACCES and return.
151 if (flag == F_TEST && flock.l_type != F_UNLCK) {
159 #include "../machine/vmparam.h" /* For KERNELSPACE */
160 #include "../h/debug.h"
161 #include "../h/types.h"
162 #include "../h/param.h"
163 #include "../h/vmmac.h"
164 #include "../h/time.h"
165 #include "../ufs/inode.h"
166 #include "../ufs/fs.h"
167 #include "../h/dbd.h"
168 #include "../h/vfd.h"
169 #include "../h/region.h"
170 #include "../h/pregion.h"
171 #include "../h/vmmeter.h"
172 #include "../h/user.h"
173 #include "../h/sysinfo.h"
174 #include "../h/pfdat.h"
175 #include "../h/tuneable.h"
176 #include "../h/buf.h"
177 #include "../netinet/in.h"
178 #include "../rpc/types.h"
179 #include "../rpc/auth.h"
180 #include "../rpc/clnt.h"
181 #include "../rpc/xdr.h"
183 /* a freelist of one */
/* Single cached fake buf, reused across afs_bread() calls to avoid
 * allocating a header+data block on every read. */
184 struct buf *afs_bread_freebp = 0;
187 * Only rfs_read calls this, and it only looks at bp->b_un.b_addr.
188 * Thus we can use fake bufs (ie not from the real buffer pool).
190 afs_bread(vp, lbn, bpp)
195 int offset, fsbsize, error;
200 AFS_STATCNT(afs_bread);
/* Convert the logical block number into a byte offset. */
201 fsbsize = vp->v_vfsp->vfs_bsize;
202 offset = lbn * fsbsize;
/* Reuse the one-entry freelist if possible, else allocate a fake buf. */
203 if (afs_bread_freebp) {
204 bp = afs_bread_freebp;
205 afs_bread_freebp = 0;
207 bp = (struct buf *) AFS_KALLOC(sizeof(*bp));
208 bp->b_un.b_addr = (caddr_t) AFS_KALLOC(fsbsize);
/* Build a kernel-space uio describing one block starting at offset. */
211 iov.iov_base = bp->b_un.b_addr;
212 iov.iov_len = fsbsize;
213 uio.afsio_iov = &iov;
214 uio.afsio_iovcnt = 1;
215 uio.afsio_seg = AFS_UIOSYS;
216 uio.afsio_offset = offset;
217 uio.afsio_resid = fsbsize;
/* Fill the buffer through the normal AFS read path. */
221 error = afs_read((struct vcache *)vp, &uio, p_cred(u.u_procp),
/* On failure, stash the buf back on the freelist instead of leaking it. */
224 afs_bread_freebp = bp;
228 afs_bread_freebp = bp;
/* Store the buf's own address in b_vp so afs_brelse() can recognize it
 * as one of our fake (non-buffer-pool) bufs. */
230 *(struct buf **)&bp->b_vp = bp; /* mark as fake */
/* Body of afs_brelse(): release a buf obtained from afs_bread().
 * (Function header is outside this sampled view.)  Real buffer-pool bufs
 * go back via ufs_brelse(); fake bufs either refill the one-entry
 * freelist or are freed outright when the freelist is already occupied. */
240 AFS_STATCNT(afs_brelse);
/* b_vp == bp is the "fake buf" marker set by afs_bread(). */
242 if ((struct buf *)bp->b_vp != bp) { /* not fake */
243 ufs_brelse(bp->b_vp, bp);
244 } else if (afs_bread_freebp) {
/* Freelist already holds a buf: free this one's data and header. */
245 AFS_KFREE(bp->b_un.b_addr, vp->v_vfsp->vfs_bsize);
246 AFS_KFREE(bp, sizeof(*bp));
/* Freelist is empty: cache this buf for the next afs_bread(). */
248 afs_bread_freebp = bp;
/* Map a logical block in an AFS vcache to a device block number.
 * AFS has no underlying device, so the mapping is a simple scaling. */
253 afs_bmap(avc, abn, anvp, anbn)
254 register struct vcache *avc;
255 afs_int32 abn, *anbn;
256 struct vcache **anvp; {
257 AFS_STATCNT(afs_bmap);
/* Scale by 8K-block-to-DEV_BSIZE ratio. */
261 *anbn = abn * (8192 / DEV_BSIZE); /* in 512 byte units */
/* Called when a vnode's reference count drops; hands the vcache to
 * afs_InactiveVCache() only on the true last reference. */
265 afs_inactive(avc, acred)
266 register struct vcache *avc;
267 struct AFS_UCRED *acred;
269 struct vnode *vp = (struct vnode *)avc;
/* During shutdown there is nothing useful to do. */
272 if (afs_shuttingdown) return ;
275 * In Solaris and HPUX s800 and HP-UX10.0 they actually call us with
276 * v_count 1 on last reference!
/* Take the per-vnode spinlock before examining/decrementing v_count. */
278 MP_H_SPINLOCK_USAV(vn_h_sl_pool,vp,&sv_lock,&context);
279 if (avc->vrefCount < 1) osi_Panic("afs_inactive : v_count < 1\n");
282 * If more than 1 don't unmap the vnode but do decrement the ref count
/* Still referenced: just drop the lock and return. */
285 if (vp->v_count > 0) {
286 MP_SPINUNLOCK_USAV(sv_lock,context);
289 MP_SPINUNLOCK_USAV(sv_lock,context);
/* Last reference: let the cache manager reclaim the vcache. */
290 afs_InactiveVCache(avc, acred);
/*
 * MP-safe glue wrappers for the vnode operations table.  Each wrapper
 * forwards to the corresponding afs_* routine; the surrounding (elided)
 * lines take and release the AFS global lock around the call.
 */
296 mp_afs_open(register struct vnode **avcp, int aflags, struct AFS_UCRED *acred)
301 code = afs_open(avcp, aflags, acred);
307 mp_afs_close(register struct vnode *avcp, int aflags, struct AFS_UCRED *acred)
312 code = afs_close(avcp, aflags, acred);
318 mp_afs_rdwr(register struct vnode *avcp, struct uio *uio, enum uio_rw arw, int aio, struct AFS_UCRED *acred)
/* Remember resid so a partial write that hits ENOSPC can be reported
 * the way HP-UX expects (see comment below). */
324 save_resid = uio->uio_resid;
325 code = afs_rdwr(avcp, uio, arw, aio, acred);
326 if (arw == UIO_WRITE && code == ENOSPC) {
327 /* HP clears code if any data written. */
328 uio->uio_resid = save_resid;
335 mp_afs_getattr(register struct vnode *avcp, struct vattr *attrs, struct AFS_UCRED *acred, enum vsync unused1)
340 code = afs_getattr(avcp, attrs, acred);
346 mp_afs_setattr(register struct vnode *avcp, register struct vattr *attrs, struct AFS_UCRED *acred, int unused1)
351 code = afs_setattr(avcp, attrs, acred);
357 mp_afs_access(register struct vnode *avcp, int mode, struct AFS_UCRED *acred)
362 code = afs_access(avcp, mode, acred);
368 mp_afs_lookup(register struct vnode *adp, char *aname, register struct vnode **avcp, struct AFS_UCRED *acred, struct vnode *unused1)
373 code = afs_lookup(adp, aname, avcp, acred);
379 mp_afs_create(register struct vnode *adp, char *aname, struct vattr *attrs, enum vcexcl aexcl, int amode, struct vnode **avcp, struct AFS_UCRED *acred)
384 code = afs_create(adp, aname, attrs, aexcl, amode, avcp, acred);
391 mp_afs_remove(register struct vnode *adp, char *aname, struct AFS_UCRED *acred)
396 code = afs_remove(adp, aname, acred);
402 mp_afs_link(register struct vnode *avc, register struct vnode *adp, char *aname, struct AFS_UCRED *acred)
407 code = afs_link(avc, adp, aname, acred);
413 mp_afs_rename(register struct vnode *aodp, char *aname1, register struct vnode *andp, char *aname2, struct AFS_UCRED *acred)
418 code = afs_rename(aodp, aname1, andp, aname2, acred);
424 mp_afs_mkdir(register struct vnode *adp, char *aname, struct vattr *attrs, register struct vnode **avcp, struct AFS_UCRED *acred)
429 code = afs_mkdir(adp, aname, attrs, avcp, acred);
436 mp_afs_rmdir(register struct vnode *adp, char *aname, struct AFS_UCRED *acred)
441 code = afs_rmdir(adp, aname, acred);
448 mp_afs_readdir(register struct vnode *avc, struct uio *auio, struct AFS_UCRED *acred)
453 code = afs_readdir(avc, auio, acred);
459 mp_afs_symlink(register struct vnode *adp, char *aname, struct vattr *attrs, char *atargetName, struct AFS_UCRED *acred)
464 code = afs_symlink(adp, aname, attrs, atargetName, acred);
471 mp_afs_readlink(register struct vnode *avc, struct uio *auio, struct AFS_UCRED *acred)
476 code = afs_readlink(avc, auio, acred);
482 mp_afs_fsync(register struct vnode *avc, struct AFS_UCRED *acred, int unused1)
487 code = afs_fsync(avc, acred);
493 mp_afs_bread(register struct vnode *avc, daddr_t lbn, struct buf **bpp, struct vattr *unused1, struct ucred *unused2)
498 code = afs_bread(avc, lbn, bpp);
504 mp_afs_brelse(register struct vnode *avc, struct buf *bp)
509 code = afs_brelse(avc, bp);
516 mp_afs_inactive(register struct vnode *avc, struct AFS_UCRED *acred)
521 code = afs_inactive(avc, acred);
527 mp_afs_lockctl(struct vnode *avc, struct flock *af, int cmd, struct AFS_UCRED *acred, struct file *unused1, off_t unused2, off_t unused3)
532 code = afs_lockctl(avc, af, cmd, acred);
538 mp_afs_fid(struct vnode *avc, struct fid **fidpp)
543 code = afs_fid(avc, fidpp);
548 mp_afs_readdir2(register struct vnode *avc, struct uio *auio, struct AFS_UCRED *acred)
553 code = afs_readdir2(avc, auio, acred);
/* The vnode operations table wired into HP-UX for AFS files.  Entries
 * are positional; afs_badop/afs_noop fill slots AFS does not support. */
559 struct vnodeops Afs_vnodeops = {
582 #if !defined(AFS_NONFSTRANS)
583 /* on HPUX102 the nfs translator calls afs_bread but does
584 * not call afs_brelse. Hence we see a memory leak. If the
585 * VOP_BREAD() call fails, then nfs does VOP_RDWR() to get
586 * the same data : this is the path we follow now. */
593 afs_badop, /* pathsend */
594 afs_noop, /* setacl */
595 afs_noop, /* getacl */
599 afs_lockf, /* lockf */
622 struct vnodeops *afs_ops = &Afs_vnodeops;
624 /* vnode file operations, and our own */
626 extern int vno_ioctl();
627 extern int vno_select();
628 extern int afs_closex();
629 extern int vno_close();
630 struct fileops afs_fileops = {
/* NOTE(review): identical redefinition of the vtoblksz macro defined
 * earlier in this file; harmless under C rules, but one copy could go. */
637 #define vtoblksz(vp) ((vp)->v_vfsp->vfs_bsize)
640 ********************************************************************
642 **** afspgin_setup_io_ranges ()
643 **** similar to: nfspgin_setup_io_ranges ()
644 ********************************************************************
/* Decide which page range(s) to bring in for a fault: expand the fault
 * up (and, for single-I/O, down) to block boundaries, bounded by file
 * size, pregion limits and the VM system's maxpagein budget.
 * Returns a page count, or 0 to tell the caller to retry. */
647 afspgin_setup_io_ranges(
653 pgcnt_t file_offset = VM_FILE_OFFSET(vm_info);
654 pgcnt_t minpage; /* first page to bring in */
655 pgcnt_t maxpage; /* one past last page to bring in */
657 pgcnt_t multio_maxpage;
660 expnd_flags_t up_reason, down_reason;
667 VM_GET_IO_INFO(vm_info, maxpagein, max_num_io);
670 * We do not go past the end of the current pregion nor past the end
671 * of the current file.
/* Round the fault up to the end of its filesystem block, then clamp. */
674 maxpage = startindex + (bpages - (startindex+file_offset) % bpages);
675 maxpage = vm_reset_maxpage(vm_info, maxpage);
676 maxpage = MIN(maxpage, (pgcnt_t)btorp(isize) - file_offset);
677 maxpage = MIN(maxpage, startindex + maxpagein);
678 multio_maxpage = maxpage = vm_maxpage(vm_info, maxpage);
683 VASSERT(maxpage >= startindex);
686 * Expanding the fault will create calls to FINDENTRY() for new
687 * pages, which will obsolete "dbd", so copy what it points to
688 * and clear it to prevent using stale data.
691 prp = VM_PRP(vm_info);
692 dbdtype = DBD_TYPE(vm_info);
693 start_blk = DBD_DATA(vm_info);
696 VASSERT(dbdtype != DBD_NONE);
/* Single-I/O case: one contiguous range, expanded up then down. */
698 if (max_num_io == 1) {
700 * We need to set up one I/O: First we attempt to expand the
701 * I/O forward. Then we expand the I/O backwards.
703 count = expand_faultin_up(vm_info, dbdtype, (int)bpages,
704 maxpage, count, startindex,
705 start_blk, &up_reason);
706 maxpage = startindex + count;
707 VASSERT(maxpage <= startindex + maxpagein);
/* Lower bound: start of the faulting block, limited by the budget. */
708 minpage = startindex - (startindex+file_offset) % bpages;
709 minpage = MAX(minpage, maxpage - maxpagein);
710 VASSERT(startindex >= VM_BASE_OFFSET(vm_info));
711 minpage = vm_minpage(vm_info, minpage);
712 VASSERT(minpage <= startindex);
713 count = expand_faultin_down(vm_info, dbdtype, (int)bpages,
714 minpage, count, &startindex,
715 &start_blk, &down_reason);
716 VM_SET_IO_STARTINDX(vm_info, 0, startindex);
717 VM_SET_IO_STARTBLK(vm_info, 0, start_blk);
718 VM_SET_IO_COUNT(vm_info, 0, count);
719 VM_SET_NUM_IO(vm_info, 1);
722 if (max_num_io > 1) {
724 * We need to set up multiple I/O information; beginning
725 * with the startindex, we will expand upwards. The expansion
726 * could stop for one of 2 reasons; we take the appropriate
727 * action in each of these cases:
728 * o VM reasons: abort setting up the multiple I/O
729 * information and return to our caller indicating
730 * that "retry" is required.
731 * o pagelimit: set up the next I/O info [we may have
732 * reached multio_maxpage at this point].
733 * Note that expansion involves no more than a block at a time;
734 * hence it could never stop due to "discontiguous block"
737 startindex = minpage = vm_minpage(vm_info, 0);
739 (indx < max_num_io) && (startindex < multio_maxpage);
740 indx++, startindex +=count) {
741 dbd = FINDDBD(prp->p_reg, startindex);
742 start_blk = dbd->dbd_data;
743 maxpage = startindex +
744 (bpages - (startindex+file_offset) % bpages);
/* NOTE(review): lowercase min() here vs MIN() above — confirm both
 * macros exist in this environment and agree. */
745 maxpage = min(maxpage, multio_maxpage);
746 count = expand_faultin_up(vm_info, dbdtype,
747 bpages, maxpage, 1 /* count */,
748 startindex, start_blk, &up_reason);
749 VM_SET_IO_STARTINDX(vm_info, indx, startindex);
750 VM_SET_IO_STARTBLK(vm_info, indx, start_blk);
751 VM_SET_IO_COUNT(vm_info, indx, count);
752 if (up_reason & VM_REASONS)
754 VASSERT(!(up_reason&NONCONTIGUOUS_BLOCK));
755 VASSERT(up_reason & PAGELIMIT);
/* Expansion stopped early for VM reasons: undo and ask for a retry. */
757 if (startindex < multio_maxpage) {
758 VM_MULT_IO_FAILURE(vm_info);
759 VM_REINIT_FAULT_DBDVFD(vm_info);
760 return (0); /* retry */
763 VM_SET_NUM_IO(vm_info, indx);
767 * Tell VM where the I/O intends to start. This may be different
768 * from the faulting point.
771 VM_SET_STARTINDX(vm_info, VM_GET_IO_STARTINDX(vm_info, 0));
778 ********************************************************************
780 **** afspgin_blkflsh ()
781 **** similar to: nfspgin_blkflsh ()
782 ********************************************************************
/* Flush any buffer-cache blocks that overlap the planned pagein I/Os
 * so the pagein reads see current data.  (Function header is outside
 * this sampled view.) */
791 pgcnt_t count = *num_4k;
794 int num_io = VM_GET_NUM_IO(vm_info);
797 * On this blkflush() we don't want to purge the buffer cache and we do
798 * want to wait, so the flags are '0'.
801 for (indx = 0; indx < num_io; indx++) {
802 flush_reslt = blkflush(devvp,
803 (daddr_t)VM_GET_IO_STARTBLK(vm_info, indx),
804 ptob(VM_GET_IO_COUNT(vm_info, indx)),
805 0, VM_REGION(vm_info));
/* If the page became valid while we were flushing, report that back
 * so the caller can skip the I/O entirely. */
808 if (vm_page_now_valid(vm_info, &page_count)) {
809 vm_release_memory(vm_info);
810 vm_release_structs(vm_info);
811 *num_4k = page_count;
812 return(VM_PAGE_PRESENT);
821 ********************************************************************
824 **** similar to: nfspgin_io ()
825 ********************************************************************
/* Perform the pagein I/O(s) set up by afspgin_setup_io_ranges(), using
 * synchronous page I/O against the device vnode. */
837 caddr_t vaddr = VM_ADDR(vm_info);
838 caddr_t virt_addr = VM_MAPPED_ADDR(vm_info);
839 pagein_info_t *io = VM_PAGEIN_INFO(vm_info);
840 preg_t *prp = VM_PRP(vm_info);
841 int wrt = VM_WRT(vm_info);
842 space_t space = VM_SPACE(vm_info);
843 int num_io = VM_GET_NUM_IO(vm_info);
/* The entire block below is compiled out (#ifdef notdef): it documents
 * the NFS-style read-ahead path that AFS does not use. */
845 #ifdef notdef /* Not used in AFS */
847 * With VM_READ_AHEAD_ALLOWED() macro, check if read-ahead should
848 * be used in this case.
850 * Unlike UFS, NFS does not start the faulting page I/O
851 * asynchronously. Why? Asynchronous requests are handled by the
852 * biod's. It doesn't make sense to queue up the faulting request
853 * behind other asynchronous requests. This is not true for UFS
854 * where the asynchronous request is immediately handled.
857 if ((VM_READ_AHEAD_ALLOWED(vm_info)) &&
858 (nfs_read_ahead_on) &&
859 (NFS_DO_READ_AHEAD) &&
860 (should_do_read_ahead(prp, vaddr))) {
862 pgcnt_t max_rhead_io;
864 pgcnt_t total_rheads_allowed;
867 * Determine the maximum amount of read-ahead I/O.
869 total_rheads_allowed = maxpagein - count ;
872 * If the count is less than a block, raise it to one.
874 if (total_rheads_allowed < bpages)
875 total_rheads_allowed = bpages;
877 max_rhead_io = total_rheads_allowed;
878 rhead_vaddr = VM_MAPPED_ADDR(vm_info) + (count*NBPG);
879 error = nfs_read_ahead(vm_info->vp, prp, wrt, space,
880 rhead_vaddr, &max_rhead_io);
883 * Set the next fault location. If read_ahead launches any
884 * I/O it will adjust it accordingly.
886 vm_info->prp->p_nextfault = vm_info->startindex + count;
889 * Now perform the faulting I/O synchronously.
/* Single-I/O fast path. */
893 error = syncpageio((swblk_t)VM_GET_IO_STARTBLK(vm_info, 0),
894 VM_MAPPED_SPACE(vm_info),
895 VM_MAPPED_ADDR(vm_info),
896 (int)ptob(count), B_READ, devvp,
897 B_vfs_pagein|B_pagebf, VM_REGION(vm_info));
901 virt_addr = VM_MAPPED_ADDR(vm_info);
/* Multiple-I/O path: issue each range in turn, accumulating errors. */
903 for (i = 0; i < num_io; i++) {
905 * REVISIT -- investigate doing asyncpageio().
907 error |= (io[i].error =
909 (swblk_t)VM_GET_IO_STARTBLK(vm_info, i),
910 VM_MAPPED_SPACE(vm_info),
912 (int)ptob(VM_GET_IO_COUNT(vm_info, i)),
914 B_vfs_pagein|B_pagebf,
915 VM_REGION(vm_info)));
916 virt_addr += ptob(VM_GET_IO_COUNT(vm_info, i));
919 * Set the next fault location. If read_ahead launches any
920 * I/O it will adjust it accordingly.
922 vm_info->prp->p_nextfault = vm_info->startindex + count;
929 ********************************************************************
931 **** afspgin_update_dbd ()
932 **** similar to: nfspgin_update_dbd ()
933 ********************************************************************
/* After a pagein over a writable shared mapping, mark the dbds for the
 * pages of each I/O range (DBD_FSTORE), aligning each range back to a
 * filesystem-block boundary first. */
941 pgcnt_t count = bsize / NBPG;
946 int num_io = VM_GET_NUM_IO(vm_info);
949 for (i = 0; i < num_io; i++) {
951 pgindx = VM_GET_IO_STARTINDX(vm_info, i);
952 off = vnodindx(VM_REGION(vm_info), pgindx);
954 blkno = VM_GET_IO_STARTBLK(vm_info, i);
956 VASSERT(bsize % NBPG == 0);
957 VASSERT(rem % NBPG == 0);
/* Back pgindx/blkno up to the start of the containing block. */
959 pgindx -= (pgcnt_t)btop(rem);
960 blkno -= (daddr_t)btodb(rem);
963 * This region could start in mid-block. If so, pgindx
964 * could be less than 0, so we adjust pgindx and blkno back
965 * up so that pgindx is 0.
973 blkno += btodb(ptob(prem));
976 for (m = 0; m < count && pgindx < VM_REGION_SIZE(vm_info);
977 m++, pgindx++, blkno += btodb(NBPG)) {
979 * Note: since this only changes one block, it
980 * assumes only one block was faulted in. Currently
981 * this is always true for remote files, and we only
982 * get here for remote files, so everything is ok.
984 vm_mark_dbd(vm_info, pgindx, blkno);
/*
 * afs_pagein: the HP-UX VM pagein entry point for AFS-backed regions.
 * Reserves memory, computes the I/O ranges, flushes overlapping buffer
 * cache blocks, performs the read(s), and for writable shared mappings
 * forces remote allocation of file holes.  Returns (via vmemp_returnx)
 * the number of pages brought in, or -SIGBUS for a fault past EOF.
 */
989 int afs_pagein(vp, prp, wrt, space, vaddr, ret_startindex)
995 pgcnt_t *ret_startindex;
998 pgcnt_t pgindx = *ret_startindex;
1000 struct vnode *devvp;
1002 daddr_t start_blk=0;
1006 int shared; /* writable memory mapped file */
1007 retval_t retval = 0;
1008 pgcnt_t ok_dbd_limit = 0; /* last dbd that we can trust */
1009 pgcnt_t bpages; /* number of pages per block */
1011 vfspage_t* vm_info=NULL;
1018 int change_to_fstore = 0; /* need to change dbds to DBD_FSTORE */
1019 int flush_start_blk = 0;
1020 int flush_end_blk = 0;
1024 AFS_STATCNT(afs_pagein);
1025 vmemp_lockx(); /* lock down VM empire */
1027 /* Initialize the VM info structure */
1028 done = vm_pagein_init(&vm_info, prp, pgindx, space, vaddr, wrt, 0,
1031 /* Check to see if we slept and the page was faulted in. */
1033 vm_release_structs(vm_info);
1037 vp = VM_GET_PAGEIN_VNODE(vm_info);
1038 VASSERT(vp != NULL);
1039 shared = VM_SHARED_OBJECT(vm_info);
1040 VASSERT(DBD_TYPE(vm_info) != DBD_NONE);
1043 * Get the devvp and block size for this vnode type
1046 bsize = vp->v_vfsp->vfs_bsize;
1047 if (bsize <= 0 || (bsize & (DEV_BSIZE - 1)))
1048 osi_Panic("afs_pagein: bsize is zero or not a multiple of DEV_BSIZE");
1050 bpages = (pgcnt_t)btop(bsize);
1051 VASSERT(bpages > 0);
1052 VM_SET_FS_MAX_PAGES(vm_info, bpages);
1054 /* this trace cannot be here because the afs_global lock might not be
1055 held at this point. We hold the vm global lock throughout
1056 this procedure ( and not the AFS global lock )
1057 afs_Trace4(afs_iclSetp, CM_TRACE_HPPAGEIN, ICL_TYPE_POINTER, (afs_int32) vp,
1058 ICL_TYPE_LONG, DBD_TYPE(vm_info), ICL_TYPE_LONG, bpages,
1059 ICL_TYPE_LONG, shared);
1061 /* Come here if we have to release the region lock before
1062 * locking pages. This can happen in memreserve() and
1067 * For remote files like ours, we want to check to see if the file has shrunk.
1068 * If so, we should invalidate any pages past the end. In the name
1069 * of efficiency, we only do this if the page we want to fault is
1070 * past the end of the file.
/* A failed getattr means the file/volume is gone: zombie the object. */
1073 if (VOP_GETATTR(vp, &va, kt_cred(u.u_kthreadp), VIFSYNC) != 0) {
1074 VM_ZOMBIE_OBJECT(vm_info);
1075 vm_release_memory(vm_info);
1076 vm_release_structs(vm_info);
1080 if (vnodindx(VM_REGION(vm_info), pgindx) >= isize) {
1082 * The file has shrunk and someone is trying to access a
1083 * page past the end of the object. Shrink the object back
1084 * to its current size, send a SIGBUS to the faulting
1085 * process and return.
1087 * We must release the region lock before calling mtrunc(),
1088 * since mtrunc() locks all the regions that are using this
1091 vm_release_memory(vm_info);
1092 vm_truncate_region(vm_info, isize);
1093 vm_release_structs(vm_info);
1094 vmemp_returnx(-SIGBUS);
/* Reserve an initial memory budget; re-check validity after any sleep. */
1098 maxpagein = vm_pick_maxpagein(vm_info);
1099 if (vm_wait_for_memory(vm_info, maxpagein, 1)) {
1100 /* Check to see if we should continue faulting. */
1101 if (vm_page_now_valid(vm_info, &page_count)) {
1102 vm_release_memory(vm_info);
1103 vm_release_structs(vm_info);
1104 vmemp_returnx(page_count);
/* (assignment-in-condition is intentional here) */
1107 if (count = vm_no_io_required(vm_info)) {
1108 /* Release any excess memory. */
1109 vm_release_memory(vm_info);
1110 vm_release_structs(vm_info);
1111 vmemp_returnx(count);
1116 * We should never have DBD_HOLE pages in a non-MMF region.
1119 VASSERT(dbd->dbd_type != DBD_HOLE);
1121 VASSERT( DBD_TYPE(vm_info) != DBD_NONE);
1123 startindex = *ret_startindex;
1126 * If the page we want is in memory already, take it
1128 if (VM_MEMORY_RESERVED(vm_info) < maxpagein)
1130 /* pick up the rest of memory now. */
1131 if (vm_wait_for_memory(vm_info, maxpagein, 0)) {
1132 if (vm_page_now_valid(vm_info, &page_count)) {
1133 vm_release_memory(vm_info);
1134 vm_release_structs(vm_info);
1135 vmemp_returnx(page_count);
/* Compute the I/O range(s); 0 means retry the whole fault. */
1141 if (!(count = afspgin_setup_io_ranges(vm_info, bpages, isize,
1146 startindex = VM_GET_STARTINDX(vm_info);
1148 VASSERT(maxpagein >= count);
1151 * Release the memory we won't need.
1153 if (count < maxpagein) {
1154 vm_release_excess_memory(vm_info,
1155 (VM_MEMORY_RESERVED(vm_info) - count));
/* Flush buffer-cache blocks that overlap the planned reads. */
1158 retval = afspgin_blkflsh(vm_info, devvp, &count);
1160 if (retval == VM_RETRY) {
1164 if (retval == VM_PAGE_PRESENT)
1169 * The definition of krusage_cntr_t is in h/kmetric.h, which
1170 * is not shipped. Since it's just statistics, we punt and do
1171 * not update it. If it's a problem we'll need to get HP to export
1172 * an interface that we can use to increment the counter.
1175 /* It's a real fault, not a reclaim */
1177 krusage_cntr_t *temp;
1178 temp = kt_cntrp(u.u_kthreadp);
1184 * Tell VM where the I/O intends to start. This may be different
1185 * from the faulting point.
1189 * vm_prepare_io will fill the region with pages and release the
1192 vm_prepare_io(vm_info, &count);
1195 * Count may have been adjusted, check to make sure it's non-zero.
1198 if (vm_retry(vm_info)) {
1203 * Release resources and retry the fault. Release any excess
1207 vm_release_memory(vm_info);
1208 vm_release_structs(vm_info);
/* Do the actual read(s). */
1212 error = afspgin_io(vm_info, devvp, bpages, maxpagein, count);
1214 if ((VM_IS_ZOMBIE(vm_info)) || (error)) {
1216 VM_ZOMBIE_OBJECT(vm_info);
1220 * For a writable memory mapped file that is remote we must
1221 * detect potential holes in the file and force allocation of
1222 * disk space on the remote system. Unfortunately, there is
1223 * no easy way to do this, so this gets a little ugly.
1225 if (shared && wrt) {
1227 * See if The user wants to write to this page. Write some
1228 * minimal amount of data back to the remote file to
1229 * force allocation of file space. We only need to
1230 * write a small amount, since holes are always at
1231 * least one filesystem block in size.
1233 error = vm_alloc_hole(vm_info);
1236 * If some sort of I/O error occurred we generate a
1237 * SIGBUS for the process that caused the write,
1238 * undo our page locks, etc and return.
1240 if ((VM_IS_ZOMBIE(vm_info)) || (error)) {
1241 VM_ZOMBIE_OBJECT(vm_info);
1247 * Change these dbds to DBD_FSTORE. We cannot do it here,
1248 * since the region must be locked, and it is not locked
1249 * at the moment. We cannot lock the region yet, as we
1250 * first have to release the page locks.
1252 change_to_fstore = 1;
1255 vm_finish_io(vm_info, count);
1258 * Acquire the lock before we play around with changing the vfd's.
1262 if (change_to_fstore)
1263 afspgin_update_dbd(vm_info, bsize);
/* Success path: account the pagein and return the new start index. */
1265 mpproc_info[getprocindex()].cnt.v_exfod += count;
1266 vmemp_unlockx(); /* free up VM empire */
1267 *ret_startindex = startindex;
1270 * In case we have any excess memory...
1272 if (VM_MEMORY_RESERVED(vm_info))
1273 vm_release_memory(vm_info);
1274 vm_release_structs(vm_info);
/* Error path: unwind the I/O, validation, and reservations. */
1280 vm_finish_io_failed(vm_info, count);
1284 vm_undo_validation(vm_info, count);
1287 * In case we have any excess memory...
1289 if (VM_MEMORY_RESERVED(vm_info))
1290 vm_release_memory(vm_info);
1291 vm_release_structs(vm_info);
1293 vmemp_unlockx(); /* free up VM empire */
1298 afs_pageout(vp,prp, start, end, flags)
1299 struct vnode *vp; /* not used */
1305 struct vnode *filevp;
1306 struct vnode *devvp;
1311 int *piocnt; /* wakeup counter used if PAGEOUT_WAIT */
1312 struct ucred *old_cred;
1316 int inode_changed = 0;
1320 AFS_STATCNT(afs_pageout);
1322 steal = (flags & PAGEOUT_FREE);
1323 vhand = (flags & PAGEOUT_VHAND);
1324 hard = (flags & PAGEOUT_HARD);
1328 /* Initialize the VM info structure. */
1329 vm_pageout_init(&vm_info, prp, start, end, 0, 0, 0, flags);
1332 * If the region is marked "don't swap", then don't steal any pages
1333 * from it. We can, however, write dirty pages out to disk (only if
1334 * PAGEOUT_FREE is not set).
1336 if (vm_no_pageout(&vm_info)) {
1342 * If caller wants to wait until the I/O is complete.
1344 vm_setup_wait_for_io(&vm_info);
1346 filevp = VM_GET_PAGEOUT_VNODE(&vm_info); /* always page out to back store */
1347 VASSERT(filevp != NULL);
1349 bzero((caddr_t)&args, sizeof(fsdata_t));
1350 args.remote_down = 0; /* assume remote file servers are up */
1351 args.remote = 1; /* we are remote */
1352 args.bsize = 0; /* filled up later by afs_vm_checkpage() */
1354 if (filevp->v_fstype == VUFS) {
1356 devvp = ip->i_devvp;
1364 * If we are vhand(), and this is an NFS file, we need to
1365 * see if the NFS server is "down". If so, we decide
1366 * if we will try to talk to it again, or defer pageouts
1367 * of dirty NFS pages until a future time.
1370 if (vhand && filevp->v_fstype == VNFS &&
1371 vtomi(filevp)->mi_down && vtomi(filevp)->mi_hard) {
1372 extern afs_int32 vhand_nfs_retry;
1374 * If there is still time left on our timer, we will
1375 * not talk to this server right now.
1377 if (vhand_nfs_retry > 0)
1378 args.remote_down = 1;
1384 * Initialize args. We set bsize to 0 to tell vfs_vfdcheck() that
1385 * it must get the file size and other attributes if it comes across
1388 vm_info.fs_data = (caddr_t)&args;
1390 /* this trace cannot be here because the afs_global lock might not be
1391 held at this point. We hold the vm global lock throughout
1392 this procedure ( and not the AFS global lock )
1393 afs_Trace4(afs_iclSetp, CM_TRACE_HPPAGEOUT, ICL_TYPE_POINTER, (afs_int32) filevp,
1394 ICL_TYPE_LONG, start, ICL_TYPE_LONG, end, ICL_TYPE_LONG, flags);
1406 extern int pageiodone();
1411 * Ask the VM system to find the next run of pages.
1413 vm_find_next_range(&vm_info, i, end);
1416 * It's possible that the remote file shrunk in size. Check the flags
1417 * to see if the request was beyond the end of the file. If it was,
1418 * truncate the region to the file size and continue. We could be on a
1419 * run so after trunction continue, there may be some I/O to write
1422 if (VM_FS_FLAGS(&vm_info) & PAGEOUT_TRUNCATE) {
1423 pgcnt_t pglen = (pgcnt_t)btorp(args.isize);
1426 * This page is past the end of the file. Unlock this page
1427 * (region_trunc will throw it away) and then call
1428 * region_trunc() to invalidate all pages past the new end of
1431 region_trunc(VM_REGION(&vm_info), pglen, pglen + 1);
1434 * remove the truncation flag.
1436 VM_UNSETFS_FLAGS(&vm_info, PAGEOUT_TRUNCATE);
1439 if (VM_NO_PAGEOUT_RUN(&vm_info))
1443 * We have a run of dirty pages [args.start...args.end].
1445 VASSERT(filevp->v_fstype != VCDFS);
1446 VASSERT((filevp->v_vfsp->vfs_flag & VFS_RDONLY) == 0);
1447 VASSERT(VM_GET_NUM_IO(&vm_info) == 1);
1450 * We will be doing an I/O on the region, let the VM system know.
1452 (void)vm_up_physio_count(&vm_info);
1455 * Okay, get set to perform the I/O.
1458 npages = (VM_END_PAGEOUT_INDX(&vm_info) + 1) -
1459 VM_START_PAGEOUT_INDX(&vm_info);
1462 * Allocate and initialize an I/O buffer.
1465 vm_init_bp(&vm_info, bp); /* Let the VM system initialize */
1467 /* Identify this buffer for KI */
1468 bp->b_bptype = B_vfs_pageout|B_pagebf;
1471 bp->b_flags = B_CALL|B_BUSY|B_PAGEOUT; /* steal pages */
1473 bp->b_flags = B_CALL|B_BUSY; /* keep pages */
1476 * If we are vhand paging over NFS, we will wait for the I/O
1479 if (vhand && filevp->v_fstype == VNFS) {
1480 bp->b_flags &= ~B_CALL;
1482 bp->b_iodone = (int (*)())pageiodone;
1486 * Make sure we do not write past the end of the file.
1488 nbytes = ptob(npages);
1489 start = vnodindx(VM_REGION(&vm_info), vm_info.start);
1490 if (start + nbytes > args.isize) {
1493 * The amount we are off better not be bigger than a
1496 if (start + nbytes - args.isize >= args.bsize) {
1497 osi_Panic("afs_pageout: remainder too large");
1501 * Reset the size of the I/O as necessary. For remote
1502 * files, we set the size to the exact number of bytes to
1503 * the end of the file. For local files, we round this up
1504 * to the nearest DEV_BSIZE chunk since disk I/O must always
1505 * be in multiples of DEV_BSIZE. In this case, we do not
1506 * bother to zero out the data past the "real" end of the
1507 * file, this is done when the data is read (either through
1508 * mmap() or by normal file system access).
1511 nbytes = args.isize - start;
1513 nbytes = roundup(args.isize - start, DEV_BSIZE);
1517 * Now get ready to perform the I/O
1519 if (!vm_protect_pageout(&vm_info, npages))
1522 vm_undo_invalidation(&vm_info, vm_info.start, vm_info.end);
1523 vm_finish_io_failed(&vm_info, npages);
1528 * If this is an NFS write by vhand(), we will not be calling
1529 * pageiodone(). asyncpageio() increments parolemem for us
1530 * if bp->b_iodone is pageiodone, so we must do it manually
1531 * if pageiodone() will not be called automatically.
1533 if (!(bp->b_flags & B_CALL) && steal) {
1534 register ulong_t context;
1536 SPINLOCK_USAV(pfdat_lock, context);
1537 parolemem += btorp(nbytes);
1538 SPINUNLOCK_USAV(pfdat_lock, context);
1540 blkflush(devvp, VM_START_PAGEOUT_BLK(&vm_info), (long)nbytes,
1541 (BX_NOBUFWAIT|BX_PURGE), VM_REGION(&vm_info));
1544 * If vhand is the one paging things out, and this is an NFS
1545 * file, we need to temporarily become a different user so
1546 * that we are not trying to page over NFS as root. We use
1547 * the user credentials associated with the writable file
1548 * pointer that is in the psuedo-vas for this MMF.
1550 * NOTE: we are currently using "va_rss" to store the ucred
1551 * value in the vas (this should be fixed in 10.0).
1553 old_cred = kt_cred(u.u_kthreadp);
1555 set_kt_cred(u.u_kthreadp, filevp->v_vas->va_cred);
1558 * If root was the one who opened the mmf for write,
1559 * va_cred will be NULL. So reset kt_cred(u.u_kthreadp) to what it
1560 * was. We will page out as root, but that is the
1561 * correct thing to do in this case anyway.
1563 if (kt_cred(u.u_kthreadp) == NULL)
1564 set_kt_cred(u.u_kthreadp, old_cred);
1568 * Really do the I/O.
1570 error = asyncpageio(bp, VM_START_PAGEOUT_BLK(&vm_info),
1571 VM_MAPPED_SPACE(&vm_info), VM_MAPPED_ADDR(&vm_info),
1572 (int)nbytes, B_WRITE, devvp);
1574 VASSERT(error == 0);
1578 * If we are vhand paging over NFS we want to wait for the
1579 * I/O to complete and take the appropriate actions if an
1580 * error is encountered.
1583 if (waitforpageio(bp) && nfs_mi_harddown(filevp)) {
1585 * The server is down, ignore this failure, and
1586 * try again later. (rfscall() has set our retry
1589 fsdata.remote_down = 1;
1590 pageiocleanup(bp, 0);
1593 * vm_vfdcheck() has cleared the valid bit on the
1594 * vfds for these pages. We must go back and set the
1595 * valid bit, as the pages are really not gone.
1597 * NOTE: we can do this because we still hold (and have
1598 * not released) the region lock.
1601 vm_undo_invalidation(&vm_info, vm_info.start, vm_info.end);
1605 * The I/O succeeded, or we had an error that we do
1606 * not want to defer until later. Call pageidone()
1615 * And restore our credentials to what they were.
1617 set_kt_cred(u.u_kthreadp, old_cred);
1620 * If we reserved memory in vfs_vfdcheck(), (only for NFS) we
1621 * can now unreserve it.
1623 if (vm_info.vm_flags & PAGEOUT_RESERVED) {
1624 vm_info.vm_flags &= ~PAGEOUT_RESERVED;
1625 vm_release_malloc_memory();
1632 if (flags & PF_DEACT) {
1633 mpproc_info[getprocindex()].cnt.v_pswpout += npages;
1634 /* sar_bswapout += ptod(npages);*/
1637 mpproc_info[getprocindex()].cnt.v_pgout++;
1638 mpproc_info[getprocindex()].cnt.v_pgpgout += npages;
1643 * If time and patience have delivered enough
1644 * pages, then quit now while we are ahead.
1646 if (VM_STOP_PAGING(&vm_info))
1649 i = VM_END_PAGEOUT_INDX(&vm_info) - VM_BASE_OFFSET(&vm_info) + 1;
1652 vm_finish_pageout(&vm_info); /* update vhand's stealscan */
1657 * If we wanted to wait for the I/O to complete, sleep on piocnt.
1658 * We must decrement it by one first, and then make sure that it
1659 * is non-zero before going to sleep.
1661 vm_wait_for_io(&vm_info);
1663 if (inode_changed && !file_is_remote) {
1664 imark(ip, IUPD|ICHG);
1671 afs_mapdbd(filevp, offset, bn, flags, hole, startidx, endidx)
1672 struct vnode *filevp;
1674 daddr_t *bn; /* Block number. */
1675 int flags; /* B_READ or B_WRITE */
1676 int *hole; /* To be used for read-ahead. */
1677 pgcnt_t *startidx; /* To be used for read-ahead. */
1678 pgcnt_t *endidx; /* To be used for read-ahead. */
1680 daddr_t lbn, local_bn;
1683 long bsize = vtoblksz(filevp) & ~(DEV_BSIZE - 1);
1686 *startidx = (pgcnt_t)(offset/NBPG);
1688 *endidx = (pgcnt_t)(offset/NBPG);
1690 *hole = 0; /* Can't have holes. */
1692 osi_Panic("afs_mapdbd: zero size");
1694 lbn = (daddr_t)(offset / bsize);
1695 on = offset % bsize;
1697 err = VOP_BMAP(filevp, lbn, NULL, &local_bn, flags);
1701 * We can never get a bn less than zero on remote files.
1703 VASSERT(local_bn >= 0);
1705 local_bn = local_bn + btodb(on);
1713 * 1: The blocks are contiguous.
1714 * 0: The blocks are not contiguous.
1717 afs_vm_fscontiguous(vp, args, cur_data)
1722 if (cur_data == (VM_END_PAGEOUT_BLK(args) + btodb(NBPG))) {
1731 * 1: Stop, this page is the last in the block.
1733 * Terminate requests at filesystem block boundaries
1735 afs_vm_stopio(vp, args)
1739 fsdata_t *fsdata = (fsdata_t *)args->fs_data;
1741 if ((dbtob(VM_END_PAGEOUT_BLK(args)) + NBPG) % (fsdata->bsize) == 0) {
1749 * afs_vm_checkpage is called by the VM while collecting a run of
1750 * pages on a pageout. afs_vm_checkpage() is called for each page
1751 * VM wants to write to disk.
1753 afs_vm_checkpage(vp, args, pgindx, cur_data)
1759 fsdata_t *fsdata = (fsdata_t *)args->fs_data;
1761 if (fsdata->remote_down) { /* never happens for AFS */
1763 * The remote system is down.
1765 VASSERT(args->run == 0);
1769 * A dirty page. If we have not yet determined the file size and
1770 * other attributes that we need to write out pages (the block
1771 * size and ok_dbd_limit), get that information now.
1773 if (fsdata->bsize == 0) {
1777 struct vnode *filevp;
1779 * Get the various attributes about the file. Store them
1780 * in args for the next time around.
1784 bsize = vtoblksz(filevp);
1785 args->maxpgs = (pgcnt_t)btop(bsize);
1787 if (VOP_GETATTR(filevp, &va, kt_cred(u.u_kthreadp), VIFSYNC) != 0) {
1789 * The VOP_GETATTR() failed.
1790 * we are vhand, and this is a hard mount, we will
1791 * skip dirty pages for a while and try again later.
1793 if (args->vm_flags & PAGEOUT_VHAND)
1795 VASSERT(args->run == 0);
1799 * This is a "soft" mount, or some other error was
1800 * returned from the server. Mark this region
1801 * as a zombie, and free this dirty page.
1803 VM_ZOMBIE_OBJECT(args);
1806 * The caller will see r_zomb and remove the page
1812 fsdata->isize = isize;
1813 fsdata->bsize = bsize;
1817 * See if the file has shrunk (this could have happened
1818 * asynchronously because of NFS or DUX). If so, invalidate
1819 * all of the pages past the end of the file. This is only
1820 * needed for remote files, as local files are truncated
1824 if (vnodindx(VM_REGION(args), pgindx) > fsdata->isize) {
1826 * This page is past the end of the file. Unlock this page
1827 * (region_trunc will throw it away) and then call region_trunc()
1828 * to invalidate all pages past the new end of the file.
1830 VM_SETFS_FLAGS(args, PAGEOUT_TRUNCATE);
1834 if ((args->vm_flags & PAGEOUT_VHAND) &&
1835 (!(args->vm_flags & PAGEOUT_RESERVED)) &&
1836 (!(VM_IS_ZOMBIE(args)))) {
1837 VASSERT(args->run == 0);
1838 if (vm_reserve_malloc_memory(NFS_PAGEOUT_MEM)) {
1840 * Got enough memory to pageout. Mark the fact that we did
1841 * a sysprocmemreserve(), so that we can sysprocmemunreserve() it
1842 * later (in remote_pageout()).
1844 args->vm_flags |= PAGEOUT_RESERVED;
1847 * We do not have enough memory to do this pageout. By
1848 * definition, we do not yet have a run, so we just unlock
1849 * this page and tell foreach_valid() to continue scanning.
1850 * If we come across another dirty page, we will try to
1851 * reserve memory again. That is okay, in fact some memory
1852 * may have freed up (as earlier pageouts complete under
1869 fs_bsize = vtoblksz(bp->b_vp);
1871 * Check to see if we are starting mid block. If so, then
1872 * we must return the remainder of the block or less depending
1875 bnrem = bp->b_offset % fs_bsize;
1877 max_size = fs_bsize - bnrem;
1879 max_size = fs_bsize;
1882 if (bp->b_bcount > max_size) {
1885 return(bp->b_bcount);
1889 afs_mmap(vp, off, size_bytes, access)
1895 long bsize = vtoblksz(vp);
1897 if (bsize % NBPG != 0) {
1904 afs_cachelimit(vp, len, location)
1910 * Disk addresses are logical, not physical, so fragments are
1913 *location = btorp(len) + 1;
1923 afs_unmap(vp,off, size_bytes,access)
1933 afs_read_ahead(vp, prp, wrt, space, vaddr, rhead_cnt)
1941 printf("afs_read_ahead returning 0 \n");
1946 afs_prealloc(vp, size, ignore_minfree, reserved)
1952 printf("afs_prealloc returning ENOSPC\n");
1957 afs_ioctl(vp, com, data, flag, cred)
1965 struct afs_ioctl afsioctl, *ai;
1967 AFS_STATCNT(afs_ioctl);
1969 /* The call must be a VICEIOCTL call */
1970 if (((com >> 8) & 0xff) == 'V') {
1972 /* AFS_COPYIN returns error 14. Copy data in instead */
1973 AFS_COPYIN(data, (caddr_t) &afsioctl, sizeof(afsioctl), error);
1974 if (error) return(error);
1976 ai = (struct afs_ioctl *) data;
1977 afsioctl.in = ai->in;
1978 afsioctl.out = ai->out;
1979 afsioctl.in_size = ai->in_size;
1980 afsioctl.out_size = ai->out_size;
1981 error = HandleIoctl((struct vcache *)vp, com, &afsioctl);
1987 #define roundtoint(x) (((x) + (sizeof(int) - 1)) & ~(sizeof(int) - 1))
1988 #define reclen(dp) roundtoint(((dp)->d_namlen + 1 + (sizeof(u_long)) +\
1989 2 * sizeof(u_short)))
1992 afs_readdir(vp, uiop, cred)
1999 caddr_t ibuf, obuf, ibufend, obufend;
2000 struct __dirent32 *idp;
2002 int count, outcount;
2004 uint64_t tmp_offset;
2006 count = uiop->uio_resid;
2007 /* Allocate temporary space for format conversion */
2008 ibuf = kmem_alloc(2*count); /* overkill - fix later */
2009 obuf = kmem_alloc(count + sizeof (struct dirent));
2010 aiov.iov_base = ibuf;
2011 aiov.iov_len = count;
2012 auio.uio_iov = &aiov;
2013 auio.uio_iovcnt = 1;
2014 offset = auio.uio_offset = uiop->uio_offset;
2015 auio.uio_seg = UIOSEG_KERNEL;
2016 auio.uio_resid = count;
2017 auio.uio_fpflags = 0;
2019 u.u_error = mp_afs_readdir2(vp, &auio, cred);
2023 /* Convert entries from __dirent32 to dirent format */
2025 for (idp = (struct __dirent32 *) ibuf, odp = (struct dirent *) obuf,
2026 ibufend = ibuf + (count - auio.uio_resid),
2027 obufend = obuf + count;
2028 (caddr_t)idp < ibufend;
2029 idp = (struct __dirent32 *) ((caddr_t) idp + idp->__d_reclen),
2030 odp = (struct dirent *) ((caddr_t) odp + odp->d_reclen)) {
2031 odp->d_ino = idp->__d_ino;
2032 odp->d_namlen = idp->__d_namlen;
2033 (void) strcpy(odp->d_name, idp->__d_name);
2034 odp->d_reclen = reclen(odp);
2035 if ((caddr_t) odp + odp->d_reclen > obufend)
2037 /* record offset *after* we're sure to use this entry */
2038 bcopy((char *)&idp->__d_off, (char *)&tmp_offset, sizeof tmp_offset);
2039 offset = tmp_offset;
2042 outcount = (caddr_t) odp - obuf;
2043 AFS_UIOMOVE(obuf, outcount, UIO_READ, uiop, u.u_error);
2046 uiop->uio_offset = offset;
2048 kmem_free(ibuf, count);
2049 kmem_free(obuf, count + sizeof (struct dirent));
2054 #define roundtolong(x) (((x) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
2055 #define reclen_dirent64(dp) roundtolong(((dp)->__d_namlen + 1 + (2*sizeof(u_long)) +\
2056 2 * sizeof(u_short)))
2059 afs_readdir3(vp, uiop, cred)
2066 caddr_t ibuf, obuf, ibufend, obufend;
2067 struct __dirent32 *idp;
2068 struct __dirent64 *odp;
2069 int count, outcount;
2072 count = uiop->uio_resid;
2073 /* Allocate temporary space for format conversion */
2074 ibuf = kmem_alloc(2*count); /* overkill - fix later */
2075 obuf = kmem_alloc(count + sizeof (struct __dirent64));
2076 aiov.iov_base = ibuf;
2077 aiov.iov_len = count;
2078 auio.uio_iov = &aiov;
2079 auio.uio_iovcnt = 1;
2080 offset = auio.uio_offset = uiop->uio_offset;
2081 auio.uio_seg = UIOSEG_KERNEL;
2082 auio.uio_resid = count;
2083 auio.uio_fpflags = 0;
2085 u.u_error = mp_afs_readdir2(vp, &auio, cred);
2089 /* Convert entries from __dirent32 to __dirent64 format */
2091 for (idp = (struct __dirent32 *) ibuf, odp = (struct __dirent64 *) obuf,
2092 ibufend = ibuf + (count - auio.uio_resid),
2093 obufend = obuf + count;
2094 (caddr_t)idp < ibufend;
2095 idp = (struct __dirent32 *) ((caddr_t) idp + idp->__d_reclen),
2096 odp = (struct __dirent64 *) ((caddr_t) odp + odp->__d_reclen)) {
2097 bcopy((char *)&idp->__d_off, (char *)&odp->__d_off, sizeof odp->__d_off);
2098 odp->__d_ino = idp->__d_ino;
2099 odp->__d_namlen = idp->__d_namlen;
2100 (void) strcpy(odp->__d_name, idp->__d_name);
2101 odp->__d_reclen = reclen_dirent64(odp);
2102 if ((caddr_t) odp + odp->__d_reclen > obufend)
2104 /* record offset *after* we're sure to use this entry */
2105 offset = odp->__d_off;
2108 outcount = (caddr_t) odp - obuf;
2109 AFS_UIOMOVE(obuf, outcount, UIO_READ, uiop, u.u_error);
2112 uiop->uio_offset = offset;
2114 kmem_free(ibuf, count);
2115 kmem_free(obuf, count + sizeof (struct __dirent64));
2119 #define AFS_SV_SEMA_HASH 1
2120 #define AFS_SV_SEMA_HASH_DEBUG 0
2122 #if AFS_SV_SEMA_HASH
2123 /* This portion of the code was originally used to implement
2124 * thread specific storage for the semaphore save area. However,
2125 * there were some spare fields in the proc structure, this is
2126 * now being used for saving the semaphores. Hence, this portion of
2127 * the code is no longer used.
2130 /* This portion of the code implements thread specific information.
2131 * The thread id is passed in as the key. The semaphore saved area
2132 * is hashed on this key.
2135 /* why is this hash table required ?
2136 * The AFS code is written in such a way that a GLOCK() is done in
2137 * one function and the GUNLOCK() is done in another function further
2138 * down the call chain. The GLOCK() call has to save the current
2139 * semaphore status before acquiring afs_global_sema. The GUNLOCK
2140 * has to release afs_global_sema and reacquire the semaphore status
2141 * that existed before the corresponding GLOCK. If GLOCK() and
2142 * GUNLOCK() were called in the same function, the GLOCK call could
2143 * have stored the saved semaphore status in a local variable and the
2144 * corresponding GUNLOCK() call could have restored the original
2145 * status from this local variable. But this is not the case with
2146 * AFS code. Hence, we have to implement a thread specific semaphore
2147 * save area. This is implemented as a hash table. The key is the
2151 /* In order for multithreaded processes to work, the sv_sema structures
2152 * must be saved on a per-thread basis, not a per-process basis. There
2153 * is no per-thread storage available to hijack in the OS per-thread
2154 * data structures (e.g. struct user) so we revive this code.
2155 * I removed the upper limit on the memory consumption since we don't
2156 * know how many threads there will be. Now the code first checks the
2157 * freeList. If that fails it then tries garbage collecting. If that
2158 * doesn't free up anything then it allocs what it needs.
2161 #define ELEMENT sv_sema_t
2163 #define Hash(xx) ( (xx) % sizeOfHashTable )
2164 #define hashLockInit(xx) initsema(&xx,1, FILESYS_SEMA_PRI, FILESYS_SEMA_ORDER)
2165 #define hashLock(xx) MP_PSEMA(&xx)
2166 #define hashUnlock(xx) MP_VSEMA(&xx)
2176 typedef struct bucket
2182 static int sizeOfHashTable;
2183 static Bucket* hashTable;
2185 static int currentSize=0;
2186 static Element* freeList; /* free list */
2189 static sema_t afsHashLock = { 0 }; /* global lock for hash table */
2191 static void afsHashGarbageCollect();
2194 ** The global lock protects the global data structures,
2195 ** e.g. freeList and currentSize.
2196 ** The bucket lock protects the link list hanging off that bucket.
2197 ** The lock hierarchy : one can obtain the bucket lock while holding
2198 ** the global lock, but not vice versa.
2203 afsHash(int nbuckets) /* allocate the hash table */
2207 #if AFS_SV_SEMA_HASH_DEBUG
2208 printf("afsHash: enter\n");
2211 sizeOfHashTable = nbuckets;
2212 currentSize = nbuckets * sizeof(Bucket);
2215 osi_Panic("afs: SEMA Hashtable already created\n");
2217 hashTable = (Bucket *)AFS_KALLOC(sizeOfHashTable * sizeof(Bucket));
2219 osi_Panic("afs: cannot create SEMA Hashtable\n");
2221 /* initialize the hash table and associated locks */
2222 bzero((char *)hashTable, sizeOfHashTable * sizeof(Bucket ));
2223 for ( i=0;i < sizeOfHashTable; i ++)
2224 hashLockInit( hashTable[i].lock);
2225 hashLockInit(afsHashLock);
2227 #if AFS_SV_SEMA_HASH_DEBUG
2228 printf("afsHash: exit\n");
2233 afsHashInsertFind(KEY key)
2238 #if AFS_SV_SEMA_HASH_DEBUG
2239 printf("afsHashInsertFind: %d\n", key);
2242 osi_Panic("afs: afsHashInsertFind: no hashTable\n");
2244 index = Hash(key); /* get bucket number */
2245 hashLock(hashTable[index].lock); /* lock this bucket */
2246 ptr = hashTable[index].element;
2248 /* if it is already there */
2250 if ( ptr->key == key ) {
2251 ptr->refCnt++; /* hold it */
2252 hashUnlock(hashTable[index].lock);
2253 #if AFS_SV_SEMA_HASH_DEBUG
2254 printf("afsHashInsertFind: %d FOUND\n", key);
2256 return &(ptr->element);
2262 hashUnlock(hashTable[index].lock);
2264 /* if something exists in the freeList, take it from there */
2266 hashLock(afsHashLock);
2269 ptr = freeList; /* reuse entry */
2270 freeList = freeList->next;
2272 afsHashGarbageCollect(); /* afsHashLock locked */
2274 ptr = freeList; /* reuse entry */
2275 freeList = freeList->next;
2277 ptr = (Element *)AFS_KALLOC(sizeof(Element));
2281 currentSize += sizeof(Element); /* update memory used */
2282 hashUnlock(afsHashLock);
2285 osi_Panic("afs: SEMA Hashtable cannot create new entry\n");
2286 /* create new entry */
2288 bzero((char *)&ptr->element, sizeof(ptr->element));
2289 ptr->refCnt = 1; /* this guy */
2291 /* insert new entry in bucket */
2292 hashLock(hashTable[index].lock); /* lock this bucket */
2293 ptr->next = hashTable[index].element;
2294 hashTable[index].element = ptr;
2295 hashUnlock(hashTable[index].lock);
2297 #if AFS_SV_SEMA_HASH_DEBUG
2298 printf("afsHashInsertFind: %d MADE\n", key);
2301 return &(ptr->element);
2305 afsHashFind(KEY key)
2310 #if AFS_SV_SEMA_HASH_DEBUG
2311 printf("afsHashFind: %d\n", key);
2314 osi_Panic("afs: afsHashFind: no hashTable\n");
2316 index = Hash(key); /* get bucket number */
2317 hashLock(hashTable[index].lock); /* lock this bucket */
2318 ptr = hashTable[index].element;
2320 /* it should be in the hash table */
2322 if ( ptr->key == key )
2324 if(ptr->refCnt <= 0 )
2325 osi_Panic("afs: SEMA HashTable entry already released\n");
2326 hashUnlock(hashTable[index].lock);
2327 #if AFS_SV_SEMA_HASH_DEBUG
2328 printf("afsHashFind: %d FOUND\n", key);
2330 return &(ptr->element);
2336 hashUnlock(hashTable[index].lock);
2337 /* it better be in the hash table */
2338 osi_Panic("afs: SEMA HashTable wants non-existent entry \n");
2343 afsHashRelease(KEY key)
2348 #if AFS_SV_SEMA_HASH_DEBUG
2349 printf("afsHashRelease: %d\n", key);
2352 osi_Panic("afs: afsHashRelease: no hashTable\n");
2354 index = Hash(key); /* get bucket number */
2355 hashLock(hashTable[index].lock); /* lock this bucket */
2356 ptr = hashTable[index].element;
2358 /* it should be in the hash table */
2360 if ( ptr->key == key ) {
2361 if(ptr->refCnt <= 0 )
2362 osi_Panic("afs: SEMA HashTable entry already released\n");
2363 ptr->refCnt--; /* release this guy */
2364 hashUnlock(hashTable[index].lock);
2365 #if AFS_SV_SEMA_HASH_DEBUG
2366 printf("afsHashRelease: %d FOUND\n", key);
2374 hashUnlock(hashTable[index].lock);
2375 /* it better be in the hash table */
2376 osi_Panic("afs: SEMA HashTable deleting non-existent entry \n");
2379 /* this should be called with afsHashLock WRITE locked */
2381 afsHashGarbageCollect()
2388 osi_Panic("afs: afsHashGarbageCollect: no hashTable\n");
2390 for ( index = 0; index < sizeOfHashTable; index++) {
2391 hashLock(hashTable[index].lock);
2392 ptr = hashTable[index].element; /* pick up bucket */
2394 while ( ptr && !ptr->refCnt ) {
2395 /* insert this element into free list */
2398 ptr->next = freeList;
2401 foundFlag = 1; /* found at least one */
2402 currentSize -= sizeof(Element);
2405 hashTable[index].element = ptr;
2407 /* scan thru the remaining list */
2409 while ( ptr->next ) {
2410 if ( ptr->next->refCnt == 0 ) {
2411 /* collect this element */
2414 ptr->next = ptr->next->next;
2415 temp->next = freeList;
2418 currentSize -= sizeof(Element);
2424 hashUnlock(hashTable[index].lock);
2428 osi_Panic("afs: SEMA HashTable full\n");
2432 #endif /* AFS_SV_SEMA_HASH */
2436 register struct buf *bp;
2438 register afs_int32 code;
2440 struct iovec tiovec[1];
2441 extern caddr_t hdl_kmap_bp();
2442 register struct kthread *t = u.u_kthreadp;
2444 AFS_STATCNT(afs_hp_strategy);
2446 * hdl_kmap_bp() saves "b_bcount" and restores it in hdl_remap_bp() after
2447 * the I/O. We must save and restore the count because pageiodone()
2448 * uses b_bcount to determine how many pages to unlock.
2450 * Remap the entire range.
2455 afs_Trace4(afs_iclSetp, CM_TRACE_HPSTRAT, ICL_TYPE_POINTER,
2456 bp->b_vp, ICL_TYPE_LONG,
2457 (int)bp->b_blkno*DEV_BSIZE, ICL_TYPE_LONG, bp->b_bcount,
2460 /* Set up the uio structure */
2461 tuio.afsio_iov = tiovec;
2462 tuio.afsio_iovcnt = 1;
2463 tuio.afsio_offset = DEV_BSIZE * bp->b_blkno;
2464 tuio.afsio_seg = AFS_UIOSYS;
2465 tuio.afsio_resid = bp->b_bcount;
2466 tuio.uio_fpflags = 0;
2467 tiovec[0].iov_base = bp->b_un.b_addr;
2468 tiovec[0].iov_len = bp->b_bcount;
2471 if ((bp->b_flags & B_READ) == B_READ)
2473 /* read b_bcount bytes into kernel address b_un.b_addr
2474 starting at byte DEV_BSIZE * b_blkno. Bzero anything
2475 we can't read, and finally call iodone(bp). File is
2476 in bp->b_vp. Credentials are from u area??
2478 code = afs_rdwr((struct vcache *)bp->b_vp,&tuio,UIO_READ,0,kt_cred(t));
2480 if (tuio.afsio_resid > 0)
2482 privlbzero(bvtospace(bp, bp->b_un.b_addr),
2483 bp->b_un.b_addr + bp->b_bcount - tuio.afsio_resid,
2484 (size_t) tuio.afsio_resid);
2488 code = afs_rdwr((struct vcache *)bp->b_vp,&tuio,UIO_WRITE,0,kt_cred(t));
2490 /* Remap back to the user's space */
2499 afs_pathconf(vp, name, resultp, cred)
2503 struct ucred *cred; /* unused */
2507 case _PC_LINK_MAX: /* Maximum number of links to a file */
2508 *resultp = 255; /* an unsigned short on the fileserver*/
2509 break; /* a unsigned char in the client.... */
2511 case _PC_NAME_MAX: /* Max length of file name */
2515 case _PC_PATH_MAX: /* Maximum length of Path Name */
2519 case _PC_PIPE_BUF: /* Max atomic write to pipe. See fifo_vnops */
2520 case _PC_CHOWN_RESTRICTED: /* Anybody can chown? */
2521 case _PC_NO_TRUNC: /* No file name truncation on overflow? */
2522 u.u_error = EOPNOTSUPP;
2526 case _PC_MAX_CANON: /* TTY buffer size for canonical input */
2527 /* need more work here for pty, ite buffer size, if differ */
2528 if (vp->v_type != VCHR) {
2532 *resultp = CANBSIZ; /*for tty*/
2536 /* need more work here for pty, ite buffer size, if differ */
2537 if (vp->v_type != VCHR) { /* TTY buffer size */
2541 *resultp = TTYHOG; /*for tty*/
2545 /* Terminal special characters can be disabled? */
2546 if (vp->v_type != VCHR) {
2554 if ((vp->v_type != VREG) && (vp->v_type != VBLK)) {
2558 *resultp = 1; /* Synchronized IO supported for this file */
2561 case _PC_FILESIZEBITS:
2562 if (vp->v_type != VDIR)
2564 *resultp = MAX_SMALL_FILE_BITS;