2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 /* This is a placeholder for routines unique to the port of AFS to hp-ux*/
12 #include <afsconfig.h>
13 #include "afs/param.h"
18 #include "afs/sysincludes.h" /* Standard vendor system headers */
19 #include "afsincludes.h" /* Afs-based standard headers */
20 #include "afs/afs_stats.h" /* statistics stuff */
24 #include <sys/mount.h>
25 #include <sys/vnode.h>
26 #include <sys/pathname.h>
/*
 * Forward declarations for the AFS entry points and HP-UX VM helper
 * routines that are wired into the vnode/file operation tables later
 * in this file.
 */
28 extern struct vfsops Afs_vfsops;
29 extern int afs_hp_strategy();
30 extern int afs_bmap(), afs_badop(), afs_noop(), afs_lockf();
31 extern int afs_pagein();
32 extern int afs_pageout();
33 extern int afs_ioctl();
34 extern int afs_prealloc();
35 extern int afs_mapdbd();
36 extern int afs_mmap();
37 extern int afs_cachelimit();
38 extern int afs_vm_checkpage();
39 extern int afs_vm_fscontiguous();
40 extern int afs_vm_stopio();
41 extern int afs_read_ahead();
42 extern int afs_unmap();
43 extern int afs_release();
44 extern int afs_swapfs_len();
45 extern int afs_readdir2();
46 extern int afs_readdir();
47 extern int afs_readdir3();
48 extern int afs_pathconf();
49 extern int afs_close();
/* Block size, in bytes, of the filesystem backing this vnode. */
51 #define vtoblksz(vp) ((vp)->v_vfsp->vfs_bsize)
53 #if defined(AFS_HPUX110_ENV)
54 /* We no longer need to lock on the VM Empire,
55 * or at least that is what is claimed,
56 * so we make the vmemp_ lock/unlock routines no-ops here.
57 * This needs to be looked at closer.
61 #define vmemp_returnx(a) return(a)
62 #define vmemp_unlockx()
65 #if !defined(AFS_HPUX110_ENV)
67 * Copy an mbuf to the contiguous area pointed to by cp.
68 * Skip <off> bytes and copy <len> bytes.
69 * Returns the number of bytes not transferred.
70 * The mbuf is NOT changed.
73 m_cpytoc(m, off, len, cp)
74 register struct mbuf *m;
75 register int off, len;
/* A malformed request is a fatal kernel error, not a soft failure. */
80 if (m == NULL || off < 0 || len < 0 || cp == NULL)
81 osi_Panic("m_cpytoc");
/* NOTE(review): presumably advances down the mbuf chain while <off>
 * lies beyond the current mbuf — confirm against the elided lines. */
83 if (m->m_len <= off) {
/* Copy the tail of the first mbuf that overlaps the requested range... */
92 ml = MIN(len, m->m_len - off);
93 memcpy(cp, mtod(m, caddr_t) + off, (u_int) ml);
/* ...then whole chunks from subsequent mbufs in the chain. */
100 memcpy(cp, mtod(m, caddr_t), (u_int) ml);
111 * Note that the standard Sun vnode interface doesn't have a vop_lockf(), so this code is
112 * totally new. This came about because HP-UX has lockf() implemented as
113 * a system call while Sun has it implemented as a library (apparently).
114 * To handle this, we have to translate the lockf() request into an
115 * fcntl() looking request, and then translate the results back if necessary.
116 * we call afs_lockctl() directly .
118 afs_lockf(vp, flag, len, cred, fp, LB, UB)
121 struct AFS_UCRED *cred;
125 /*for now, just pretend it works */
126 struct k_flock flock;
130 * Create a flock structure and translate the lockf request
131 * into an appropriate looking fcntl() type request for afs_lockctl()
135 flock.l_start = fp->f_offset;
136 /* convert negative lengths to positive */
137 if (flock.l_len < 0) {
138 flock.l_start += flock.l_len;
139 flock.l_len = -(flock.l_len);
142 * Adjust values to look like fcntl() requests.
143 * All locks are write locks, only F_LOCK requests
144 * are blocking. F_TEST has to be translated into
145 * a get lock and then back again.
147 flock.l_type = F_WRLCK;
151 flock.l_type = F_UNLCK;
/* Hand the translated request to the regular lock-control path; the
 * result is reported through the per-process u.u_error. */
160 u.u_error = mp_afs_lockctl(vp, &flock, cmd, fp->f_cred);
162 return (u.u_error); /* some other error code */
165 * if request is F_TEST, and GETLK changed
166 * the lock type to ULOCK, then return 0, else
167 * set errno to EACCES and return.
169 if (flag == F_TEST && flock.l_type != F_UNLCK) {
177 #if defined(AFS_HPUX1122_ENV)
178 #include "machine/vm/vmparam.h"
180 #include "../machine/vmparam.h" /* For KERNELSPACE */
184 #if !defined(AFS_HPUX1123_ENV)
185 /* 11.23 is using 64 bit in many cases */
186 #define kern_daddr_t daddr_t
191 #include "ufs/inode.h"
194 #if defined(AFS_HPUX1123_ENV)
196 #endif /* AFS_HPUX1123_ENV */
198 #include "h/region.h"
199 #include "h/pregion.h"
200 #include "h/vmmeter.h"
202 #include "h/sysinfo.h"
204 #if !defined(AFS_HPUX1123_ENV)
205 #include "h/tuneable.h"
208 #include "netinet/in.h"
210 /* a freelist of one */
211 struct buf *afs_bread_freebp = 0;
214 * Only rfs_read calls this, and it only looks at bp->b_un.b_addr.
215 * Thus we can use fake bufs (ie not from the real buffer pool).
217 afs_bread(vp, lbn, bpp)
222 int offset, fsbsize, error;
227 AFS_STATCNT(afs_bread);
/* Convert the logical block number into a byte offset in the file. */
228 fsbsize = vp->v_vfsp->vfs_bsize;
229 offset = lbn * fsbsize;
/* Reuse the one-deep freelist if possible; otherwise fabricate a
 * "fake" buf (not from the kernel buffer pool) plus a data area. */
230 if (afs_bread_freebp) {
231 bp = afs_bread_freebp;
232 afs_bread_freebp = 0;
234 bp = (struct buf *)AFS_KALLOC(sizeof(*bp));
235 bp->b_un.b_addr = (caddr_t) AFS_KALLOC(fsbsize);
/* Build a uio describing one full filesystem block at <offset>
 * targeting the buf's data area (kernel address space). */
238 iov.iov_base = bp->b_un.b_addr;
239 iov.iov_len = fsbsize;
240 uio.afsio_iov = &iov;
241 uio.afsio_iovcnt = 1;
242 uio.afsio_seg = AFS_UIOSYS;
243 uio.afsio_offset = offset;
244 uio.afsio_resid = fsbsize;
/* Read through the normal AFS cache path. */
248 error = afs_read(VTOAFS(vp), &uio, p_cred(u.u_procp), lbn, bpp, 0);
250 afs_bread_freebp = bp;
254 afs_bread_freebp = bp;
/* Store the buf's own address in b_vp so afs_brelse() can tell a
 * fake buf from one owned by the real buffer cache. */
256 *(struct buf **)&bp->b_vp = bp; /* mark as fake */
266 AFS_STATCNT(afs_brelse);
/* Real buffer-cache bufs go back through ufs_brelse(); fake bufs
 * (marked by b_vp pointing at the buf itself, see afs_bread) are
 * either freed or parked on the one-deep freelist. */
268 if ((struct buf *)bp->b_vp != bp) { /* not fake */
269 ufs_brelse(bp->b_vp, bp);
270 } else if (afs_bread_freebp) {
/* Freelist already occupied: release this buf's storage outright. */
271 AFS_KFREE(bp->b_un.b_addr, vp->v_vfsp->vfs_bsize);
272 AFS_KFREE(bp, sizeof(*bp));
274 afs_bread_freebp = bp;
279 afs_bmap(avc, abn, anvp, anbn)
280 register struct vcache *avc;
281 kern_daddr_t abn, *anbn;
282 struct vcache **anvp;
284 AFS_STATCNT(afs_bmap);
/* AFS uses 8K logical blocks: scale the logical block number into
 * DEV_BSIZE (512-byte) device units for the caller. */
288 *anbn = abn * (8192 / DEV_BSIZE); /* in 512 byte units */
292 afs_inactive(avc, acred)
293 register struct vcache *avc;
294 struct AFS_UCRED *acred;
296 struct vnode *vp = AFSTOV(avc);
/* Nothing to do once the client is shutting down. */
299 if (afs_shuttingdown)
303 * In Solaris and HPUX s800 and HP-UX10.0 they actually call us with
304 * v_count 1 on last reference!
/* Sanity-check the reference count under the vnode hash spinlock. */
306 MP_H_SPINLOCK_USAV(vn_h_sl_pool, vp, &sv_lock, &context);
307 if (avc->vrefCount < 1)
308 osi_Panic("afs_inactive : v_count < 1\n");
311 * If more than 1 don't unmap the vnode but do decrement the ref count
314 if (vp->v_count > 0) {
315 MP_SPINUNLOCK_USAV(sv_lock, context);
318 MP_SPINUNLOCK_USAV(sv_lock, context);
/* Truly the last reference: let the common code retire the vcache. */
319 afs_InactiveVCache(avc, acred);
/*
 * mp_afs_* wrappers: glue between the HP-UX vnode operation signatures
 * and the common AFS implementations.  Each wrapper forwards its
 * arguments to the corresponding afs_* routine and returns its code
 * (the surrounding lock/return plumbing is identical across wrappers).
 */
325 mp_afs_open(register struct vnode **avcp, int aflags, struct AFS_UCRED *acred)
330 code = afs_open(avcp, aflags, acred);
336 mp_afs_close(register struct vnode *avcp, int aflags, struct AFS_UCRED *acred)
341 code = afs_close(avcp, aflags, acred);
347 mp_afs_rdwr(register struct vnode *avcp, struct uio *uio, enum uio_rw arw,
348 int aio, struct AFS_UCRED *acred)
354 save_resid = uio->uio_resid;
355 code = afs_rdwr(avcp, uio, arw, aio, acred);
356 if (arw == UIO_WRITE && code == ENOSPC) {
357 /* HP clears code if any data written. */
358 uio->uio_resid = save_resid;
365 mp_afs_getattr(register struct vnode *avcp, struct vattr *attrs,
366 struct AFS_UCRED *acred, enum vsync unused1)
371 code = afs_getattr(avcp, attrs, acred);
377 mp_afs_setattr(register struct vnode *avcp, register struct vattr *attrs,
378 struct AFS_UCRED *acred, int unused1)
383 code = afs_setattr(avcp, attrs, acred);
389 mp_afs_access(register struct vnode *avcp, int mode, struct AFS_UCRED *acred)
394 code = afs_access(avcp, mode, acred);
400 mp_afs_lookup(register struct vnode *adp, char *aname,
401 register struct vnode **avcp, struct AFS_UCRED *acred,
402 struct vnode *unused1)
407 code = afs_lookup(adp, aname, avcp, acred);
413 mp_afs_create(register struct vnode *adp, char *aname, struct vattr *attrs,
414 enum vcexcl aexcl, int amode, struct vnode **avcp,
415 struct AFS_UCRED *acred)
420 code = afs_create(adp, aname, attrs, aexcl, amode, avcp, acred);
427 mp_afs_remove(register struct vnode *adp, char *aname,
428 struct AFS_UCRED *acred)
433 code = afs_remove(adp, aname, acred);
439 mp_afs_link(register struct vnode *avc, register struct vnode *adp,
440 char *aname, struct AFS_UCRED *acred)
445 code = afs_link(avc, adp, aname, acred);
451 mp_afs_rename(register struct vnode *aodp, char *aname1,
452 register struct vnode *andp, char *aname2,
453 struct AFS_UCRED *acred)
458 code = afs_rename(aodp, aname1, andp, aname2, acred);
464 mp_afs_mkdir(register struct vnode *adp, char *aname, struct vattr *attrs,
465 register struct vnode **avcp, struct AFS_UCRED *acred)
470 code = afs_mkdir(adp, aname, attrs, avcp, acred);
477 mp_afs_rmdir(register struct vnode *adp, char *aname, struct AFS_UCRED *acred)
482 code = afs_rmdir(adp, aname, acred);
489 mp_afs_readdir(register struct vnode *avc, struct uio *auio,
490 struct AFS_UCRED *acred)
495 code = afs_readdir(avc, auio, acred);
501 mp_afs_symlink(register struct vnode *adp, char *aname, struct vattr *attrs,
502 char *atargetName, struct AFS_UCRED *acred)
507 code = afs_symlink(adp, aname, attrs, atargetName, acred);
514 mp_afs_readlink(register struct vnode *avc, struct uio *auio,
515 struct AFS_UCRED *acred)
520 code = afs_readlink(avc, auio, acred);
526 mp_afs_fsync(register struct vnode *avc, struct AFS_UCRED *acred, int unused1)
531 code = afs_fsync(avc, acred);
537 mp_afs_bread(register struct vnode *avc, kern_daddr_t lbn, struct buf **bpp,
538 struct vattr *unused1, struct ucred *unused2)
543 code = afs_bread(avc, lbn, bpp);
549 mp_afs_brelse(register struct vnode *avc, struct buf *bp)
554 code = afs_brelse(avc, bp);
561 mp_afs_inactive(register struct vnode *avc, struct AFS_UCRED *acred)
566 code = afs_inactive(avc, acred);
572 mp_afs_lockctl(struct vnode *avc, struct flock *af, int cmd,
573 struct AFS_UCRED *acred, struct file *unused1, off_t unused2,
579 code = afs_lockctl(avc, af, cmd, acred);
585 mp_afs_fid(struct vnode *avc, struct fid **fidpp)
590 code = afs_fid(avc, fidpp);
596 mp_afs_readdir2(register struct vnode *avc, struct uio *auio,
597 struct AFS_UCRED *acred)
602 code = afs_readdir2(avc, auio, acred);
/* The HP-UX vnode operations table for AFS files; mostly the mp_afs_*
 * wrappers above, with afs_badop/afs_noop filling unsupported slots. */
608 struct vnodeops Afs_vnodeops = {
631 #if !defined(AFS_NONFSTRANS)
632 /* on HPUX102 the nfs translator calls afs_bread but does
633 * not call afs_brelse. Hence we see a memory leak. If the
634 * VOP_BREAD() call fails, then nfs does VOP_RDWR() to get
635 * the same data : this is the path we follow now. */
642 afs_badop, /* pathsend */
643 afs_noop, /* setacl */
644 afs_noop, /* getacl */
648 afs_lockf, /* lockf */
673 /* vnode file operations, and our own */
675 extern int vno_ioctl();
676 extern int vno_select();
677 extern int afs_closex();
678 extern int vno_close();
679 struct fileops afs_fileops = {
686 #define vtoblksz(vp) ((vp)->v_vfsp->vfs_bsize)
689 ********************************************************************
691 **** afspgin_setup_io_ranges ()
692 **** similar to: nfspgin_setup_io_ranges ()
693 ********************************************************************
/* Decide which page range(s) to fault in around <startindex>, staying
 * within the pregion, the file size (<isize>), and the VM system's
 * pagein budget.  Fills the vm_info I/O slots; returns 0 to request a
 * retry when multi-I/O setup fails. */
696 afspgin_setup_io_ranges(vfspage_t * vm_info, pgcnt_t bpages, k_off_t isize,
699 pgcnt_t file_offset = VM_FILE_OFFSET(vm_info);
700 pgcnt_t minpage; /* first page to bring in */
701 pgcnt_t maxpage; /* one past last page to bring in */
703 pgcnt_t multio_maxpage;
704 kern_daddr_t start_blk;
706 expnd_flags_t up_reason, down_reason;
713 VM_GET_IO_INFO(vm_info, maxpagein, max_num_io);
716 * We do not go past the end of the current pregion nor past the end
717 * of the current file.
/* Clamp maxpage successively: block boundary, VM limits, EOF, budget. */
720 maxpage = startindex + (bpages - (startindex + file_offset) % bpages);
721 maxpage = vm_reset_maxpage(vm_info, maxpage);
722 maxpage = MIN(maxpage, (pgcnt_t) btorp(isize) - file_offset);
723 maxpage = MIN(maxpage, startindex + maxpagein);
724 multio_maxpage = maxpage = vm_maxpage(vm_info, maxpage);
729 VASSERT(maxpage >= startindex);
732 * Expanding the fault will create calls to FINDENTRY() for new
733 * pages, which will obsolete "dbd", so copy what it points to
734 * and clear it to prevent using stale data.
737 prp = VM_PRP(vm_info);
738 dbdtype = DBD_TYPE(vm_info);
739 start_blk = DBD_DATA(vm_info);
742 VASSERT(dbdtype != DBD_NONE);
744 if (max_num_io == 1) {
746 * We need to set up one I/O: First we attempt to expand the
747 * I/O forward. Then we expand the I/O backwards.
750 expand_faultin_up(vm_info, dbdtype, (int)bpages, maxpage, count,
751 startindex, start_blk, &up_reason);
752 maxpage = startindex + count;
753 VASSERT(maxpage <= startindex + maxpagein);
754 minpage = startindex - (startindex + file_offset) % bpages;
755 minpage = MAX(minpage, maxpage - maxpagein);
756 VASSERT(startindex >= VM_BASE_OFFSET(vm_info));
757 minpage = vm_minpage(vm_info, minpage);
758 VASSERT(minpage <= startindex);
760 expand_faultin_down(vm_info, dbdtype, (int)bpages, minpage, count,
761 &startindex, &start_blk, &down_reason);
/* Record the single combined range in I/O slot 0. */
762 VM_SET_IO_STARTINDX(vm_info, 0, startindex);
763 VM_SET_IO_STARTBLK(vm_info, 0, start_blk);
764 VM_SET_IO_COUNT(vm_info, 0, count);
765 VM_SET_NUM_IO(vm_info, 1);
768 if (max_num_io > 1) {
770 * We need to set up multiple I/O information; beginning
771 * with the startindex, we will expand upwards. The expansion
772 * could stop for one of 2 reasons; we take the appropriate
773 * action in each of these cases:
774 * o VM reasons: abort setting up the multiple I/O
775 * information and return to our caller indicating
776 * that "retry" is required.
777 * o pagelimit: set up the next I/O info [we may have
778 * reached multio_maxpage at this point].
779 * Note that expansion involves no more than a block at a time;
780 * hence it could never stop due to "discontiguous block"
783 startindex = minpage = vm_minpage(vm_info, 0);
784 for (indx = 0; (indx < max_num_io) && (startindex < multio_maxpage);
785 indx++, startindex += count) {
786 dbd = FINDDBD(prp->p_reg, startindex);
787 start_blk = dbd->dbd_data;
789 startindex + (bpages - (startindex + file_offset) % bpages);
790 maxpage = min(maxpage, multio_maxpage);
792 expand_faultin_up(vm_info, dbdtype, bpages, maxpage,
794 startindex, start_blk, &up_reason);
795 VM_SET_IO_STARTINDX(vm_info, indx, startindex);
796 VM_SET_IO_STARTBLK(vm_info, indx, start_blk);
797 VM_SET_IO_COUNT(vm_info, indx, count);
798 if (up_reason & VM_REASONS)
800 VASSERT(!(up_reason & NONCONTIGUOUS_BLOCK));
801 VASSERT(up_reason & PAGELIMIT);
/* VM stopped us before we covered the target range: tear down and
 * ask the caller to retry the whole setup. */
803 if (startindex < multio_maxpage) {
804 VM_MULT_IO_FAILURE(vm_info);
805 VM_REINIT_FAULT_DBDVFD(vm_info);
806 return (0); /* retry */
809 VM_SET_NUM_IO(vm_info, indx);
813 * Tell VM where the I/O intends to start. This may be different
814 * from the faulting point.
817 VM_SET_STARTINDX(vm_info, VM_GET_IO_STARTINDX(vm_info, 0));
824 ********************************************************************
826 **** afspgin_blkflsh ()
827 **** similar to: nfspgin_blkflsh ()
828 ********************************************************************
831 afspgin_blkflsh(vfspage_t * vm_info, struct vnode * devvp, pgcnt_t * num_4k)
834 pgcnt_t count = *num_4k;
837 int num_io = VM_GET_NUM_IO(vm_info);
840 * On this blkflush() we don't want to purge the buffer cache and we do
841 * want to wait, so the flags are '0'.
844 for (indx = 0; indx < num_io; indx++) {
846 blkflush(devvp, (kern_daddr_t) VM_GET_IO_STARTBLK(vm_info, indx),
847 ptob(VM_GET_IO_COUNT(vm_info, indx)), 0,
851 if (vm_page_now_valid(vm_info, &page_count)) {
852 vm_release_memory(vm_info);
853 vm_release_structs(vm_info);
854 *num_4k = page_count;
855 return (VM_PAGE_PRESENT);
864 ********************************************************************
867 **** similar to: nfspgin_io ()
868 ********************************************************************
871 afspgin_io(vfspage_t * vm_info, struct vnode *devvp, pgcnt_t bpages,
872 pgcnt_t maxpagein, pgcnt_t count)
876 caddr_t vaddr = VM_ADDR(vm_info);
877 caddr_t virt_addr = VM_MAPPED_ADDR(vm_info);
878 pagein_info_t *io = VM_PAGEIN_INFO(vm_info);
879 preg_t *prp = VM_PRP(vm_info);
880 int wrt = VM_WRT(vm_info);
881 space_t space = VM_SPACE(vm_info);
882 int num_io = VM_GET_NUM_IO(vm_info);
884 #ifdef notdef /* Not used in AFS */
886 * With VM_READ_AHEAD_ALLOWED() macro, check if read-ahead should
887 * be used in this case.
889 * Unlike UFS, NFS does not start the faulting page I/O
890 * asynchronously. Why? Asynchronous requests are handled by the
891 * biod's. It doesn't make sense to queue up the faulting request
892 * behind other asynchrnous requests. This is not true for UFS
893 * where the asynchrnous request is immediately handled.
896 if ((VM_READ_AHEAD_ALLOWED(vm_info)) && (nfs_read_ahead_on)
897 && (NFS_DO_READ_AHEAD) && (should_do_read_ahead(prp, vaddr))) {
899 pgcnt_t max_rhead_io;
901 pgcnt_t total_rheads_allowed;
904 * Determine the maximum amount of read-ahead I/O.
906 total_rheads_allowed = maxpagein - count;
909 * If the count is less than a block, raise it to one.
911 if (total_rheads_allowed < bpages)
912 total_rheads_allowed = bpages;
914 max_rhead_io = total_rheads_allowed;
915 rhead_vaddr = VM_MAPPED_ADDR(vm_info) + (count * NBPG);
917 nfs_read_ahead(vm_info->vp, prp, wrt, space, rhead_vaddr,
921 * Set the next fault location. If read_ahead launches any
922 * I/O it will adjust it accordingly.
924 vm_info->prp->p_nextfault = vm_info->startindex + count;
927 * Now perform the faulting I/O synchronously.
932 syncpageio((swblk_t) VM_GET_IO_STARTBLK(vm_info, 0),
933 VM_MAPPED_SPACE(vm_info), VM_MAPPED_ADDR(vm_info),
934 (int)ptob(count), B_READ, devvp,
935 B_vfs_pagein | B_pagebf, VM_REGION(vm_info));
939 virt_addr = VM_MAPPED_ADDR(vm_info);
941 for (i = 0; i < num_io; i++) {
943 * REVISIT -- investigate doing asyncpageio().
945 error |= (io[i].error =
946 syncpageio((swblk_t) VM_GET_IO_STARTBLK(vm_info, i),
947 VM_MAPPED_SPACE(vm_info), virt_addr,
948 (int)ptob(VM_GET_IO_COUNT(vm_info, i)),
949 B_READ, devvp, B_vfs_pagein | B_pagebf,
950 VM_REGION(vm_info)));
951 virt_addr += ptob(VM_GET_IO_COUNT(vm_info, i));
954 * Set the next fault location. If read_ahead launches any
955 * I/O it will adjust it accordingly.
957 vm_info->prp->p_nextfault = vm_info->startindex + count;
964 ********************************************************************
966 **** afspgin_update_dbd ()
967 **** similar to: nfspgin_update_dbd ()
968 ********************************************************************
971 afspgin_update_dbd(vfspage_t * vm_info, int bsize)
974 pgcnt_t count = bsize / NBPG;
979 int num_io = VM_GET_NUM_IO(vm_info);
982 for (i = 0; i < num_io; i++) {
984 pgindx = VM_GET_IO_STARTINDX(vm_info, i);
985 off = vnodindx(VM_REGION(vm_info), pgindx);
987 blkno = VM_GET_IO_STARTBLK(vm_info, i);
989 VASSERT(bsize % NBPG == 0);
990 VASSERT(rem % NBPG == 0);
992 pgindx -= (pgcnt_t) btop(rem);
993 blkno -= (kern_daddr_t) btodb(rem);
996 * This region could start in mid-block. If so, pgindx
997 * could be less than 0, so we adjust pgindx and blkno back
998 * up so that pgindx is 0.
1006 blkno += btodb(ptob(prem));
1009 for (m = 0; m < count && pgindx < VM_REGION_SIZE(vm_info);
1010 m++, pgindx++, blkno += btodb(NBPG)) {
1012 * Note: since this only changes one block, it
1013 * assumes only one block was faulted in. Currently
1014 * this is always true for remote files, and we only
1015 * get here for remote files, so everything is ok.
1017 vm_mark_dbd(vm_info, pgindx, blkno);
/*
 * Page-fault handler for AFS-backed mappings on HP-UX.  Reserves
 * memory, computes the I/O ranges (afspgin_setup_io_ranges), flushes
 * overlapping buffer-cache blocks (afspgin_blkflsh), performs the
 * pagein (afspgin_io), and handles shrunken files, zombied objects
 * and retry conditions.  Runs under the VM "empire" lock, NOT the
 * AFS global lock.
 */
1023 afs_pagein(vp, prp, wrt, space, vaddr, ret_startindex)
1029 pgcnt_t *ret_startindex;
1032 pgcnt_t pgindx = *ret_startindex;
1034 struct vnode *devvp;
1036 kern_daddr_t start_blk = 0;
1040 int shared; /* writable memory mapped file */
1041 retval_t retval = 0;
1042 pgcnt_t ok_dbd_limit = 0; /* last dbd that we can trust */
1043 pgcnt_t bpages; /* number of pages per block */
1045 vfspage_t *vm_info = NULL;
1052 int change_to_fstore = 0; /* need to change dbds to DBD_FSTORE */
1053 int flush_start_blk = 0;
1054 int flush_end_blk = 0;
1058 AFS_STATCNT(afs_pagein);
1059 vmemp_lockx(); /* lock down VM empire */
1061 /* Initialize the VM info structure */
1063 vm_pagein_init(&vm_info, prp, pgindx, space, vaddr, wrt, 0,
1066 /* Check to see if we slept and the page was faulted in. */
1068 vm_release_structs(vm_info);
1072 vp = VM_GET_PAGEIN_VNODE(vm_info);
1073 VASSERT(vp != NULL);
1074 shared = VM_SHARED_OBJECT(vm_info);
1075 VASSERT(DBD_TYPE(vm_info) != DBD_NONE);
1078 * Get the devvp and block size for this vnode type
1081 bsize = vp->v_vfsp->vfs_bsize;
1082 if (bsize <= 0 || (bsize & (DEV_BSIZE - 1)))
1083 osi_Panic("afs_pagein: bsize is zero or not a multiple of DEV_BSIZE");
1085 bpages = (pgcnt_t) btop(bsize);
1086 VASSERT(bpages > 0);
1087 VM_SET_FS_MAX_PAGES(vm_info, bpages);
1089 /* this trace cannot be here because the afs_global lock might not be
1090 * held at this point. We hold the vm global lock throughout
1091 * this procedure ( and not the AFS global lock )
1092 * afs_Trace4(afs_iclSetp, CM_TRACE_HPPAGEIN, ICL_TYPE_POINTER, (afs_int32) vp,
1093 * ICL_TYPE_LONG, DBD_TYPE(vm_info), ICL_TYPE_LONG, bpages,
1094 * ICL_TYPE_LONG, shared);
1096 /* Come here if we have to release the region lock before
1097 * locking pages. This can happen in memreserve() and
1102 * For remote files like ours, we want to check to see if the file has shrunk.
1103 * If so, we should invalidate any pages past the end. In the name
1104 * of efficiency, we only do this if the page we want to fault is
1105 * past the end of the file.
1108 if (VOP_GETATTR(vp, &va, kt_cred(u.u_kthreadp), VIFSYNC) != 0) {
/* Attribute fetch failed: the object is dead to us. */
1109 VM_ZOMBIE_OBJECT(vm_info);
1110 vm_release_memory(vm_info);
1111 vm_release_structs(vm_info);
1115 if (vnodindx(VM_REGION(vm_info), pgindx) >= isize) {
1117 * The file has shrunk and someone is trying to access a
1118 * page past the end of the object. Shrink the object back
1119 * to its current size, send a SIGBUS to the faulting
1120 * process and return.
1122 * We must release the region lock before calling mtrunc(),
1123 * since mtrunc() locks all the regions that are using this
1126 vm_release_memory(vm_info);
1127 vm_truncate_region(vm_info, isize);
1128 vm_release_structs(vm_info);
1129 vmemp_returnx(-SIGBUS);
1133 maxpagein = vm_pick_maxpagein(vm_info);
1134 if (vm_wait_for_memory(vm_info, maxpagein, 1)) {
1135 /* Check to see if we should continue faulting. */
1136 if (vm_page_now_valid(vm_info, &page_count)) {
1137 vm_release_memory(vm_info);
1138 vm_release_structs(vm_info);
1139 vmemp_returnx(page_count);
/* NOTE: intentional assignment-in-condition — a nonzero count means
 * the pages are already valid and no I/O is required. */
1142 if (count = vm_no_io_required(vm_info)) {
1143 /* Release any excess memory. */
1144 vm_release_memory(vm_info);
1145 vm_release_structs(vm_info);
1146 vmemp_returnx(count);
1150 * We should never have DBD_HOLE pages in a non-MMF region.
1153 VASSERT(dbd->dbd_type != DBD_HOLE);
1155 VASSERT(DBD_TYPE(vm_info) != DBD_NONE);
1157 startindex = *ret_startindex;
1160 * If the page we want is in memory already, take it
1162 if (VM_MEMORY_RESERVED(vm_info) < maxpagein) {
1163 /* pick up the rest of memory now. */
1164 if (vm_wait_for_memory(vm_info, maxpagein, 0)) {
1165 if (vm_page_now_valid(vm_info, &page_count)) {
1166 vm_release_memory(vm_info);
1167 vm_release_structs(vm_info);
1168 vmemp_returnx(page_count);
1176 afspgin_setup_io_ranges(vm_info, bpages, isize, startindex))) {
1180 startindex = VM_GET_STARTINDX(vm_info);
1182 VASSERT(maxpagein >= count);
1185 * Release the memory we won't need.
1187 if (count < maxpagein) {
1188 vm_release_excess_memory(vm_info,
1189 (VM_MEMORY_RESERVED(vm_info) - count));
/* Push any dirty buffer-cache blocks for the ranges before reading. */
1192 retval = afspgin_blkflsh(vm_info, devvp, &count);
1194 if (retval == VM_RETRY) {
1198 if (retval == VM_PAGE_PRESENT)
1203 * The definition of krusage_cntr_t is in h/kmetric.h, which
1204 * is not shipped. Since it's just statistics, we punt and do
1205 * not update it. If it's a problem we'll need to get HP to export
1206 * an interface that we can use to increment the counter.
1209 /* It's a real fault, not a reclaim */
1211 krusage_cntr_t *temp;
1212 temp = kt_cntrp(u.u_kthreadp);
1218 * Tell VM where the I/O intends to start. This may be different
1219 * from the faulting point.
1223 * vm_prepare_io will fill the region with pages and release the
1226 vm_prepare_io(vm_info, &count);
1229 * Count may have been adjusted, check to make sure it's non-zero.
1232 if (vm_retry(vm_info)) {
1237 * Release resources and retry the fault. Release any excess
1241 vm_release_memory(vm_info);
1242 vm_release_structs(vm_info);
/* Do the actual pagein for all ranges set up above. */
1246 error = afspgin_io(vm_info, devvp, bpages, maxpagein, count);
1248 if ((VM_IS_ZOMBIE(vm_info)) || (error)) {
1250 VM_ZOMBIE_OBJECT(vm_info);
1254 * For a writable memory mapped file that is remote we must
1255 * detect potential holes in the file and force allocation of
1256 * disk space on the remote system. Unfortunately, there is
1257 * no easy way to do this, so this gets a little ugly.
1259 if (shared && wrt) {
1261 * See if The user wants to write to this page. Write some
1262 * minimal amount of data back to the remote file to
1263 * force allocation of file space. We only need to
1264 * write a small amount, since holes are always at
1265 * least one filesystem block in size.
1267 error = vm_alloc_hole(vm_info);
1270 * If some sort of I/O error occurred we generate a
1271 * SIGBUS for the process that caused the write,
1272 * undo our page locks, etc and return.
1274 if ((VM_IS_ZOMBIE(vm_info)) || (error)) {
1275 VM_ZOMBIE_OBJECT(vm_info);
1281 * Change these dbds to DBD_FSTORE. We cannot do it here,
1282 * since the region must be locked, and it is not locked
1283 * at the moment. We cannot lock the region yet, as we
1284 * first have to release the page locks.
1286 change_to_fstore = 1;
1289 vm_finish_io(vm_info, count);
1292 * Acquire the lock before we play around with changing the vfd's.
1296 if (change_to_fstore)
1297 afspgin_update_dbd(vm_info, bsize);
/* Account the exfod (executed-from-demand) pagein statistics. */
1299 #if defined(AFS_HPUX110_ENV)
1300 getppdp()->cnt.v_exfod += count;
1302 mpproc_info[getprocindex()].cnt.v_exfod += count;
1304 vmemp_unlockx(); /* free up VM empire */
1305 *ret_startindex = startindex;
1308 * In case we have any excess memory...
1310 if (VM_MEMORY_RESERVED(vm_info))
1311 vm_release_memory(vm_info);
1312 vm_release_structs(vm_info);
/* Error path: undo the I/O bookkeeping and page validations. */
1318 vm_finish_io_failed(vm_info, count);
1322 vm_undo_validation(vm_info, count);
1325 * In case we have any excess memory...
1327 if (VM_MEMORY_RESERVED(vm_info))
1328 vm_release_memory(vm_info);
1329 vm_release_structs(vm_info);
1331 vmemp_unlockx(); /* free up VM empire */
1336 afs_pageout(vp, prp, start, end, flags)
1337 struct vnode *vp; /* not used */
1343 struct vnode *filevp;
1344 struct vnode *devvp;
1349 int *piocnt; /* wakeup counter used if PAGEOUT_WAIT */
1350 struct ucred *old_cred;
1354 int inode_changed = 0;
1358 AFS_STATCNT(afs_pageout);
1360 steal = (flags & PAGEOUT_FREE);
1361 vhand = (flags & PAGEOUT_VHAND);
1362 hard = (flags & PAGEOUT_HARD);
1366 /* Initialize the VM info structure. */
1367 vm_pageout_init(&vm_info, prp, start, end, 0, 0, 0, flags);
1370 * If the region is marked "don't swap", then don't steal any pages
1371 * from it. We can, however, write dirty pages out to disk (only if
1372 * PAGEOUT_FREE is not set).
1374 if (vm_no_pageout(&vm_info)) {
1380 * If caller wants to wait until the I/O is complete.
1382 vm_setup_wait_for_io(&vm_info);
1384 filevp = VM_GET_PAGEOUT_VNODE(&vm_info); /* always page out to back store */
1385 VASSERT(filevp != NULL);
1387 memset((caddr_t) & args, 0, sizeof(fsdata_t));
1388 args.remote_down = 0; /* assume remote file servers are up */
1389 args.remote = 1; /* we are remote */
1390 args.bsize = 0; /* filled up later by afs_vm_checkpage() */
1392 if (filevp->v_fstype == VUFS) {
1394 devvp = ip->i_devvp;
1401 * If we are vhand(), and this is an NFS file, we need to
1402 * see if the NFS server is "down". If so, we decide
1403 * if we will try to talk to it again, or defer pageouts
1404 * of dirty NFS pages until a future time.
1407 if (vhand && filevp->v_fstype == VNFS && vtomi(filevp)->mi_down
1408 && vtomi(filevp)->mi_hard) {
1409 extern afs_int32 vhand_nfs_retry;
1411 * If there is still time left on our timer, we will
1412 * not talk to this server right now.
1414 if (vhand_nfs_retry > 0)
1415 args.remote_down = 1;
1421 * Initialize args. We set bsize to 0 to tell vfs_vfdcheck() that
1422 * it must get the file size and other attributes if it comes across
1425 vm_info.fs_data = (caddr_t) & args;
1427 /* this trace cannot be here because the afs_global lock might not be
1428 * held at this point. We hold the vm global lock throughout
1429 * this procedure ( and not the AFS global lock )
1430 * afs_Trace4(afs_iclSetp, CM_TRACE_HPPAGEOUT, ICL_TYPE_POINTER, (afs_int32) filevp,
1431 * ICL_TYPE_LONG, start, ICL_TYPE_LONG, end, ICL_TYPE_LONG, flags);
1443 extern int pageiodone();
1448 * Ask the VM system to find the next run of pages.
1450 vm_find_next_range(&vm_info, i, end);
1453 * It's possible that the remote file shrunk in size. Check the flags
1454 * to see if the request was beyond the end of the file. If it was,
1455 * truncate the region to the file size and continue. We could be on a
1456 * run so after trunction continue, there may be some I/O to write
1459 if (VM_FS_FLAGS(&vm_info) & PAGEOUT_TRUNCATE) {
1460 pgcnt_t pglen = (pgcnt_t) btorp(args.isize);
1463 * This page is past the end of the file. Unlock this page
1464 * (region_trunc will throw it away) and then call
1465 * region_trunc() to invalidate all pages past the new end of
1468 region_trunc(VM_REGION(&vm_info), pglen, pglen + 1);
1471 * remove the truncation flag.
1473 VM_UNSETFS_FLAGS(&vm_info, PAGEOUT_TRUNCATE);
1476 if (VM_NO_PAGEOUT_RUN(&vm_info))
1480 * We have a run of dirty pages [args.start...args.end].
1482 VASSERT(filevp->v_fstype != VCDFS);
1483 VASSERT((filevp->v_vfsp->vfs_flag & VFS_RDONLY) == 0);
1484 VASSERT(VM_GET_NUM_IO(&vm_info) == 1);
1487 * We will be doing an I/O on the region, let the VM system know.
1489 (void)vm_up_physio_count(&vm_info);
1492 * Okay, get set to perform the I/O.
1496 (VM_END_PAGEOUT_INDX(&vm_info) + 1) -
1497 VM_START_PAGEOUT_INDX(&vm_info);
1500 * Allocate and initialize an I/O buffer.
1503 vm_init_bp(&vm_info, bp); /* Let the VM system initialize */
1505 /* Identify this buffer for KI */
1506 bp->b_bptype = B_vfs_pageout | B_pagebf;
1509 bp->b_flags = B_CALL | B_BUSY | B_PAGEOUT; /* steal pages */
1511 bp->b_flags = B_CALL | B_BUSY; /* keep pages */
1514 * If we are vhand paging over NFS, we will wait for the I/O
1517 if (vhand && filevp->v_fstype == VNFS) {
1518 bp->b_flags &= ~B_CALL;
1520 bp->b_iodone = (int (*)())pageiodone;
1524 * Make sure we do not write past the end of the file.
1526 nbytes = ptob(npages);
1527 start = vnodindx(VM_REGION(&vm_info), vm_info.start);
1528 if (start + nbytes > args.isize) {
1531 * The amount we are off better not be bigger than a
1534 if (start + nbytes - args.isize >= args.bsize) {
1535 osi_Panic("afs_pageout: remainder too large");
1539 * Reset the size of the I/O as necessary. For remote
1540 * files, we set the size to the exact number of bytes to
1541 * the end of the file. For local files, we round this up
1542 * to the nearest DEV_BSIZE chunk since disk I/O must always
1543 * be in multiples of DEV_BSIZE. In this case, we do not
1544 * bother to zero out the data past the "real" end of the
1545 * file, this is done when the data is read (either through
1546 * mmap() or by normal file system access).
1549 nbytes = args.isize - start;
1551 nbytes = roundup(args.isize - start, DEV_BSIZE);
1555 * Now get ready to perform the I/O
1557 if (!vm_protect_pageout(&vm_info, npages)) {
1559 vm_undo_invalidation(&vm_info, vm_info.start, vm_info.end);
1560 vm_finish_io_failed(&vm_info, npages);
1565 * If this is an NFS write by vhand(), we will not be calling
1566 * pageiodone(). asyncpageio() increments parolemem for us
1567 * if bp->b_iodone is pageiodone, so we must do it manually
1568 * if pageiodone() will not be called automatically.
1570 if (!(bp->b_flags & B_CALL) && steal) {
1571 register ulong_t context;
1573 SPINLOCK_USAV(pfdat_lock, context);
1574 parolemem += btorp(nbytes);
1575 SPINUNLOCK_USAV(pfdat_lock, context);
1577 blkflush(devvp, VM_START_PAGEOUT_BLK(&vm_info), (long)nbytes,
1578 (BX_NOBUFWAIT | BX_PURGE), VM_REGION(&vm_info));
1581 * If vhand is the one paging things out, and this is an NFS
1582 * file, we need to temporarily become a different user so
1583 * that we are not trying to page over NFS as root. We use
1584 * the user credentials associated with the writable file
1585 * pointer that is in the pseudo-vas for this MMF.
1587 * NOTE: we are currently using "va_rss" to store the ucred
1588 * value in the vas (this should be fixed in 10.0).
1590 old_cred = kt_cred(u.u_kthreadp);
1592 #if defined(AFS_HPUX1123_ENV)
1594 * DEE - 1123 does not have the vas.h, and it looks like
1595 * we should never be called with an NFS type file anyway.
1596 * so where did this come from? Was it copied from NFS?
1597 * I assume it was, so we will add an assert for now
1598 * and see if the code runs at all.
1600 VASSERT(filevp->v_fstype != VNFS);
1602 set_kt_cred(u.u_kthreadp, filevp->v_vas->va_cred);
1605 * If root was the one who opened the mmf for write,
1606 * va_cred will be NULL. So reset kt_cred(u.u_kthreadp) to what it
1607 * was. We will page out as root, but that is the
1608 * correct thing to do in this case anyway.
1610 if (kt_cred(u.u_kthreadp) == NULL)
1611 set_kt_cred(u.u_kthreadp, old_cred);
1616 * Really do the I/O.
1619 asyncpageio(bp, VM_START_PAGEOUT_BLK(&vm_info),
1620 VM_MAPPED_SPACE(&vm_info), VM_MAPPED_ADDR(&vm_info),
1621 (int)nbytes, B_WRITE, devvp);
1623 VASSERT(error == 0);
1627 * If we are vhand paging over NFS we want to wait for the
1628 * I/O to complete and take the appropriate actions if an
1629 * error is encountered.
1632 if (waitforpageio(bp) && nfs_mi_harddown(filevp)) {
1634 * The server is down, ignore this failure, and
1635 * try again later. (rfscall() has set our retry
1638 fsdata.remote_down = 1;
1639 pageiocleanup(bp, 0);
1642 * vm_vfdcheck() has cleared the valid bit on the
1643 * vfds for these pages. We must go back and set the
1644 * valid bit, as the pages are really not gone.
1646 * NOTE: we can do this because we still hold (and have
1647 * not released) the region lock.
1650 vm_undo_invalidation(&vm_info, vm_info.start,
1654 * The I/O succeeded, or we had an error that we do
1655 * not want to defer until later. Call pageidone()
1664 * And restore our credentials to what they were.
1666 set_kt_cred(u.u_kthreadp, old_cred);
1669 * If we reserved memory in vfs_vfdcheck(), (only for NFS) we
1670 * can now unreserve it.
1672 if (vm_info.vm_flags & PAGEOUT_RESERVED) {
1673 vm_info.vm_flags &= ~PAGEOUT_RESERVED;
1674 vm_release_malloc_memory();
1681 if (flags & PF_DEACT) {
1682 #if defined(AFS_HPUX110_ENV)
1683 getppdp()->cnt.v_pswpout += npages;
1685 mpproc_info[getprocindex()].cnt.v_pswpout += npages;
1687 /* sar_bswapout += ptod(npages);*/
1689 #if defined(AFS_HPUX110_ENV)
1690 getppdp()->cnt.v_pgout++;
1691 getppdp()->cnt.v_pgpgout += npages;
1693 mpproc_info[getprocindex()].cnt.v_pgout++;
1694 mpproc_info[getprocindex()].cnt.v_pgpgout += npages;
1700 * If time and patience have delivered enough
1701 * pages, then quit now while we are ahead.
1703 if (VM_STOP_PAGING(&vm_info))
1706 i = VM_END_PAGEOUT_INDX(&vm_info) - VM_BASE_OFFSET(&vm_info) + 1;
1709 vm_finish_pageout(&vm_info); /* update vhand's stealscan */
1714 * If we wanted to wait for the I/O to complete, sleep on piocnt.
1715 * We must decrement it by one first, and then make sure that it
1716 * is non-zero before going to sleep.
1718 vm_wait_for_io(&vm_info);
1720 if (inode_changed && !file_is_remote) {
1721 imark(ip, IUPD | ICHG);
/*
 * afs_mapdbd - translate a logical byte offset within filevp into a device
 * block number (*bn) for paging I/O, using VOP_BMAP().
 *
 * NOTE(review): this listing is elided (embedded original line numbers are
 * non-contiguous), so statements between the visible lines are missing from
 * this view; comments describe only what is visible.
 */
1728 afs_mapdbd(filevp, offset, bn, flags, hole, startidx, endidx)
1729 struct vnode *filevp;
1731 kern_daddr_t *bn; /* Block number. */
1732 int flags; /* B_READ or B_WRITE */
1733 int *hole; /* To be used for read-ahead. */
1734 pgcnt_t *startidx; /* To be used for read-ahead. */
1735 pgcnt_t *endidx; /* To be used for read-ahead. */
1737 kern_daddr_t lbn, local_bn;
/* Round the vnode's block size down to a multiple of DEV_BSIZE. */
1740 long bsize = vtoblksz(filevp) & ~(DEV_BSIZE - 1);
/* Read-ahead window collapses to the single page containing offset. */
1743 *startidx = (pgcnt_t) (offset / NBPG);
1745 *endidx = (pgcnt_t) (offset / NBPG);
1747 *hole = 0; /* Can't have holes. */
/* Presumably guarded by a (bsize == 0) check -- guard elided, TODO confirm. */
1749 osi_Panic("afs_mapdbd: zero size");
/* Split the byte offset into logical block number + offset within block. */
1751 lbn = (kern_daddr_t) (offset / bsize);
1752 on = offset % bsize;
1754 err = VOP_BMAP(filevp, lbn, NULL, &local_bn, flags);
1758 * We can never get a bn less than zero on remote files.
1760 VASSERT(local_bn >= 0);
/* Advance to the DEV_BSIZE sub-block that holds the requested offset. */
1762 local_bn = local_bn + btodb(on);
/*
 * afs_vm_fscontiguous - VM pageout callback: report whether the candidate
 * disk block cur_data immediately follows the current pageout run.
 * Per the visible header comment: returns 1 if contiguous, 0 if not.
 * NOTE(review): listing is elided; surrounding declarations/returns are
 * not visible here.
 */
1770 * 1: The blocks are contiguous.
1771 * 0: The blocks are not contiguous.
1774 afs_vm_fscontiguous(vp, args, cur_data)
/* Contiguous iff cur_data begins exactly one page's worth of disk
 * blocks past the end of the run collected so far. */
1779 if (cur_data == (VM_END_PAGEOUT_BLK(args) + btodb(NBPG))) {
/*
 * afs_vm_stopio - VM pageout callback: decide whether to terminate the
 * current I/O run.  Returns 1 (visible header comment) when the page just
 * added ends exactly on a filesystem block boundary, so a single request
 * never straddles fs blocks.
 * NOTE(review): listing is elided; the #else for the pre-11.23 variant and
 * the function's declarations are not visible here.
 */
1788 * 1: Stop, this page is the last in the block.
1790 * Terminate requests at filesystem block boundaries
1792 afs_vm_stopio(vp, args)
1796 fsdata_t *fsdata = (fsdata_t *) args->fs_data;
/* On 11.23 the run-end block number is copied to a temporary first;
 * both variants test the same (end-of-run + one page) % bsize == 0. */
1798 #if defined(AFS_HPUX1123_ENV)
1800 tmpdb = VM_END_PAGEOUT_BLK(args);
1802 if ((dbtob(tmpdb) + NBPG) % (fsdata->bsize) == 0)
1804 if ((dbtob(VM_END_PAGEOUT_BLK(args)) + NBPG) % (fsdata->bsize) == 0)
1805 #endif /* AFS_HPUX1123_ENV */
/*
 * afs_vm_checkpage - VM pageout callback, invoked once per dirty page while
 * the VM collects a run to write out.  Visible responsibilities:
 *  - bail out if the remote server is marked down in fs_data,
 *  - lazily fetch file size / block size via VOP_GETATTR() on first page,
 *  - zombie the region or skip pages when attributes cannot be fetched,
 *  - flag truncation for pages past the (possibly shrunk) end of file,
 *  - for vhand-initiated pageouts, reserve malloc memory before starting.
 * NOTE(review): listing is elided (non-contiguous embedded line numbers);
 * returns, braces and several statements are missing from this view.
 */
1814 * afs_vm_checkpage is called by the VM while collecting a run of
1815 * pages on a pageout. afs_vm_checkpage() is called for each page
1816 * VM wants to write to disk.
1818 afs_vm_checkpage(vp, args, pgindx, cur_data)
1824 fsdata_t *fsdata = (fsdata_t *) args->fs_data;
1826 if (fsdata->remote_down) { /* never happens for AFS */
1828 * The remote system is down.
1830 VASSERT(args->run == 0);
1834 * A dirty page. If we have not yet determined the file size and
1835 * other attributes that we need to write out pages (the block
1836 * size and ok_dbd_limit), get that information now.
1838 if (fsdata->bsize == 0) {
1842 struct vnode *filevp;
1844 * Get the various attributes about the file. Store them
1845 * in args for the next time around.
1849 bsize = vtoblksz(filevp);
1850 args->maxpgs = (pgcnt_t) btop(bsize);
1852 if (VOP_GETATTR(filevp, &va, kt_cred(u.u_kthreadp), VIFSYNC) != 0) {
1854 * The VOP_GETATTR() failed.
1855 * we are vhand, and this is a hard mount, we will
1856 * skip dirty pages for a while and try again later.
1858 if (args->vm_flags & PAGEOUT_VHAND) {
1859 VASSERT(args->run == 0);
1863 * This is a "soft" mount, or some other error was
1864 * returned from the server. Mark this region
1865 * as a zombie, and free this dirty page.
1867 VM_ZOMBIE_OBJECT(args);
1870 * The caller will see r_zomb and remove the page
/* Cache the attributes for subsequent pages of this run. */
1876 fsdata->isize = isize;
1877 fsdata->bsize = bsize;
1881 * See if the file has shrunk (this could have happened
1882 * asynchronously because of NFS or DUX). If so, invalidate
1883 * all of the pages past the end of the file. This is only
1884 * needed for remote files, as local files are truncated
1888 if (vnodindx(VM_REGION(args), pgindx) > fsdata->isize) {
1890 * This page is past the end of the file. Unlock this page
1891 * (region_trunc will throw it away) and then call region_trunc()
1892 * to invalidate all pages past the new end of the file.
1894 VM_SETFS_FLAGS(args, PAGEOUT_TRUNCATE);
/* vhand pageout: reserve memory once per pageout, unless the region
 * is already a zombie or a reservation was already made. */
1898 if ((args->vm_flags & PAGEOUT_VHAND)
1899 && (!(args->vm_flags & PAGEOUT_RESERVED))
1900 && (!(VM_IS_ZOMBIE(args)))) {
1901 VASSERT(args->run == 0);
1902 if (vm_reserve_malloc_memory(NFS_PAGEOUT_MEM)) {
1904 * Got enough memory to pageout. Mark the fact that we did
1905 * a sysprocmemreserve(), so that we can sysprocmemunreserve() it
1906 * later (in remote_pageout()).
1908 args->vm_flags |= PAGEOUT_RESERVED;
1911 * We do not have enough memory to do this pageout. By
1912 * definition, we do not yet have a run, so we just unlock
1913 * this page and tell foreach_valid() to continue scanning.
1914 * If we come across another dirty page, we will try to
1915 * reserve memory again. That is okay, in fact some memory
1916 * may have freed up (as earlier pageouts complete under
1933 fs_bsize = vtoblksz(bp->b_vp);
1935 * Check to see if we are starting mid block. If so, then
1936 * we must return the remainder of the block or less depending
1939 bnrem = bp->b_offset % fs_bsize;
1941 max_size = fs_bsize - bnrem;
1943 max_size = fs_bsize;
1946 if (bp->b_bcount > max_size) {
1949 return (bp->b_bcount);
/*
 * afs_mmap - mmap entry point for AFS vnodes.  The visible fragment only
 * shows a sanity check that the vnode's block size is a multiple of the
 * page size; the rest of the body is elided from this listing.
 */
1953 afs_mmap(vp, off, size_bytes, access)
1956 #if defined(AFS_HPUX1111_ENV)
1963 long bsize = vtoblksz(vp);
/* Block size must be page-aligned for mapping -- error path elided. */
1965 if (bsize % NBPG != 0) {
/*
 * afs_cachelimit - report a page-count location/limit for a byte length.
 * Visible behavior: *location is the length converted to pages (btorp)
 * plus one; the surrounding statements are elided from this listing.
 */
1972 afs_cachelimit(vp, len, location)
1978 * Disk addresses are logical, not physical, so fragments are
1981 *location = btorp(len) + 1;
/*
 * afs_unmap - unmap entry point (counterpart of afs_mmap).  Only the
 * signature and an HPUX-11.11 conditional are visible in this listing;
 * the body is elided.
 */
1991 afs_unmap(vp, off, size_bytes, access)
1994 #if defined(AFS_HPUX1111_ENV)
/*
 * afs_read_ahead - read-ahead hook.  The visible fragment only logs and
 * (per the message) returns 0, i.e. no read-ahead is performed here.
 * Body largely elided from this listing.
 */
2005 afs_read_ahead(vp, prp, wrt, space, vaddr, rhead_cnt)
2013 printf("afs_read_ahead returning 0 \n");
/*
 * afs_prealloc - preallocation hook.  The visible fragment logs and (per
 * the message) fails with ENOSPC, i.e. preallocation is not supported.
 * Body largely elided from this listing.
 */
2018 afs_prealloc(vp, size, ignore_minfree, reserved)
2020 /* DEE on 11.22 following is off_t */
2025 printf("afs_prealloc returning ENOSPC\n");
/*
 * afs_ioctl - vnode ioctl entry point.  For VICEIOCTL commands (command
 * group byte == 'V') it copies the user's struct afs_ioctl field-by-field
 * (AFS_COPYIN reportedly fails here -- see the original comment) and hands
 * it to HandleIoctl().  Non-'V' commands and the error paths are elided
 * from this listing.
 */
2030 afs_ioctl(vp, com, data, flag, cred)
2038 struct afs_ioctl afsioctl, *ai;
2040 AFS_STATCNT(afs_ioctl);
2042 /* The call must be a VICEIOCTL call */
2043 if (((com >> 8) & 0xff) == 'V') {
2045 /* AFS_COPYIN returns error 14. Copy data in instead */
2046 AFS_COPYIN(data, (caddr_t) & afsioctl, sizeof(afsioctl), error);
/* Field-wise copy from the caller's buffer instead of AFS_COPYIN. */
2050 ai = (struct afs_ioctl *)data;
2051 afsioctl.in = ai->in;
2052 afsioctl.out = ai->out;
2053 afsioctl.in_size = ai->in_size;
2054 afsioctl.out_size = ai->out_size;
2055 error = HandleIoctl(VTOAFS(vp), com, &afsioctl);
/*
 * reclen()/roundtoint(): compute the padded record length of a struct
 * dirent entry; the rounding granularity differs between HP-UX 11.11
 * (sizeof(long)) and earlier (sizeof(int)).
 */
2061 #if defined(AFS_HPUX1111_ENV)
2062 /* looks like even if appl is 32 bit, we need to round to 8 bytes */
2063 /* This had no effect, it must not be being used */
2065 #define roundtoint(x) (((x) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
2066 #define reclen(dp) roundtoint(((dp)->d_namlen + 1 + (sizeof(u_long)) +\
2067 sizeof(u_int) + 2 * sizeof(u_short)))
2070 #define roundtoint(x) (((x) + (sizeof(int) - 1)) & ~(sizeof(int) - 1))
2071 #define reclen(dp) roundtoint(((dp)->d_namlen + 1 + (sizeof(u_long)) +\
2072 2 * sizeof(u_short)))
/*
 * afs_readdir - read directory entries and convert them from the internal
 * __dirent32 layout (produced by mp_afs_readdir2 into a kernel buffer)
 * into struct dirent records copied out to the caller's uio.
 * NOTE(review): listing is elided; declarations, error checks and the
 * loop's closing statements are missing from this view.
 */
2076 afs_readdir(vp, uiop, cred)
2083 caddr_t ibuf, obuf, ibufend, obufend;
2084 struct __dirent32 *idp;
2086 int count, outcount;
2088 uint64_t tmp_offset;
2090 count = uiop->uio_resid;
2091 /* Allocate temporary space for format conversion */
2092 ibuf = kmem_alloc(2 * count); /* overkill - fix later */
2093 obuf = kmem_alloc(count + sizeof(struct dirent));
/* Build a kernel-space uio so mp_afs_readdir2 fills ibuf. */
2094 aiov.iov_base = ibuf;
2095 aiov.iov_len = count;
2096 auio.uio_iov = &aiov;
2097 auio.uio_iovcnt = 1;
2098 offset = auio.uio_offset = uiop->uio_offset;
2099 auio.uio_seg = UIOSEG_KERNEL;
2100 auio.uio_resid = count;
2101 auio.uio_fpflags = 0;
2103 u.u_error = mp_afs_readdir2(vp, &auio, cred);
2107 /* Convert entries from __dirent32 to dirent format */
2109 for (idp = (struct __dirent32 *)ibuf, odp =
2110 (struct dirent *)obuf, ibufend =
2111 ibuf + (count - auio.uio_resid), obufend = obuf + count;
2112 (caddr_t) idp < ibufend;
2113 idp = (struct __dirent32 *)((caddr_t) idp + idp->__d_reclen), odp =
2114 (struct dirent *)((caddr_t) odp + odp->d_reclen)) {
2115 odp->d_ino = idp->__d_ino;
2116 odp->d_namlen = idp->__d_namlen;
2117 (void)strcpy(odp->d_name, idp->__d_name);
2118 odp->d_reclen = reclen(odp);
/* Stop before overflowing the output buffer -- break elided here. */
2119 if ((caddr_t) odp + odp->d_reclen > obufend)
2121 /* record offset *after* we're sure to use this entry */
2122 memcpy((char *)&tmp_offset, (char *)&idp->__d_off, sizeof tmp_offset);
2123 offset = tmp_offset;
2126 outcount = (caddr_t) odp - obuf;
2127 AFS_UIOMOVE(obuf, outcount, UIO_READ, uiop, u.u_error);
/* Resume point for the next readdir call. */
2130 uiop->uio_offset = offset;
2132 kmem_free(ibuf, count);
2133 kmem_free(obuf, count + sizeof(struct dirent));
/* reclen_dirent64(): padded record length of a struct __dirent64 entry. */
2138 #define roundtolong(x) (((x) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
2139 #define reclen_dirent64(dp) roundtolong(((dp)->__d_namlen + 1 + (2*sizeof(u_long)) +\
2140 2 * sizeof(u_short)))
/*
 * afs_readdir3 - like afs_readdir, but converts the internal __dirent32
 * records into __dirent64 records (64-bit offsets copied directly from
 * the 32-bit entry's __d_off field).
 * NOTE(review): listing is elided; declarations, error checks and the
 * loop's closing statements are missing from this view.
 */
2143 afs_readdir3(vp, uiop, cred)
2150 caddr_t ibuf, obuf, ibufend, obufend;
2151 struct __dirent32 *idp;
2152 struct __dirent64 *odp;
2153 int count, outcount;
2156 count = uiop->uio_resid;
2157 /* Allocate temporary space for format conversion */
2158 ibuf = kmem_alloc(2 * count); /* overkill - fix later */
2159 obuf = kmem_alloc(count + sizeof(struct __dirent64));
/* Build a kernel-space uio so mp_afs_readdir2 fills ibuf. */
2160 aiov.iov_base = ibuf;
2161 aiov.iov_len = count;
2162 auio.uio_iov = &aiov;
2163 auio.uio_iovcnt = 1;
2164 offset = auio.uio_offset = uiop->uio_offset;
2165 auio.uio_seg = UIOSEG_KERNEL;
2166 auio.uio_resid = count;
2167 auio.uio_fpflags = 0;
2169 u.u_error = mp_afs_readdir2(vp, &auio, cred);
2173 /* Convert entries from __dirent32 to __dirent64 format */
2175 for (idp = (struct __dirent32 *)ibuf, odp =
2176 (struct __dirent64 *)obuf, ibufend =
2177 ibuf + (count - auio.uio_resid), obufend = obuf + count;
2178 (caddr_t) idp < ibufend;
2179 idp = (struct __dirent32 *)((caddr_t) idp + idp->__d_reclen), odp =
2180 (struct __dirent64 *)((caddr_t) odp + odp->__d_reclen)) {
2181 memcpy((char *)&odp->__d_off, (char *)&idp->__d_off,
2182 sizeof odp->__d_off);
2183 odp->__d_ino = idp->__d_ino;
2184 odp->__d_namlen = idp->__d_namlen;
2185 (void)strcpy(odp->__d_name, idp->__d_name);
2186 odp->__d_reclen = reclen_dirent64(odp);
/* Stop before overflowing the output buffer -- break elided here. */
2187 if ((caddr_t) odp + odp->__d_reclen > obufend)
2189 /* record offset *after* we're sure to use this entry */
2190 offset = odp->__d_off;
2193 outcount = (caddr_t) odp - obuf;
2194 AFS_UIOMOVE(obuf, outcount, UIO_READ, uiop, u.u_error);
/* Resume point for the next readdir call. */
2197 uiop->uio_offset = offset;
2199 kmem_free(ibuf, count);
2200 kmem_free(obuf, count + sizeof(struct __dirent64));
2204 #define AFS_SV_SEMA_HASH 1
2205 #define AFS_SV_SEMA_HASH_DEBUG 0
2207 #if AFS_SV_SEMA_HASH
2208 /* This portion of the code was originally used to implement
2209 * thread specific storage for the semaphore save area. However,
2210 * there were some spare fields in the proc structure, this is
2211 now being used for saving the semaphores. Hence, this portion of
2212 * the code is no longer used.
2215 /* This portion of the code implements thread specific information.
2216 * The thread id is passed in as the key. The semaphore saved area
2217 * is hashed on this key.
2220 /* why is this hash table required ?
2221 * The AFS code is written in such a way that a GLOCK() is done in
2222 * one function and the GUNLOCK() is done in another function further
2223 * down the call chain. The GLOCK() call has to save the current
2224 * semaphore status before acquiring afs_global_sema. The GUNLOCK
2225 has to release afs_global_sema and reacquire the semaphore status
2226 * that existed before the corresponding GLOCK. If GLOCK() and
2227 * GUNLOCK() were called in the same function, the GLOCK call could
2228 have stored the saved semaphore status in a local variable and the
2229 * corresponding GUNLOCK() call could have restored the original
2230 * status from this local variable. But this is not the case with
2231 * AFS code. Hence, we have to implement a thread specific semaphore
2232 * save area. This is implemented as a hash table. The key is the
2236 /* In order for multithreaded processes to work, the sv_sema structures
2237 * must be saved on a per-thread basis, not a per-process basis. There
2238 * is no per-thread storage available to hijack in the OS per-thread
2239 * data structures (e.g. struct user) so we revive this code.
2240 * I removed the upper limit on the memory consumption since we don't
2241 * know how many threads there will be. Now the code first checks the
2242 * freeList. If that fails it then tries garbage collecting. If that
2243 * doesn't free up anything then it allocs what it needs.
2246 #define ELEMENT sv_sema_t
2248 #define Hash(xx) ( (xx) % sizeOfHashTable )
2249 #define hashLockInit(xx) initsema(&xx,1, FILESYS_SEMA_PRI, FILESYS_SEMA_ORDER)
2250 #define hashLock(xx) MP_PSEMA(&xx)
2251 #define hashUnlock(xx) MP_VSEMA(&xx)
2253 typedef struct elem {
2260 typedef struct bucket {
2265 static int sizeOfHashTable;
2266 static Bucket *hashTable;
2268 static int currentSize = 0;
2269 static Element *freeList; /* free list */
2272 static sema_t afsHashLock = { 0 }; /* global lock for hash table */
2274 static void afsHashGarbageCollect();
2277 ** The global lock protects the global data structures,
2278 ** e.g. freeList and currentSize.
2279 ** The bucket lock protects the link list hanging off that bucket.
2280 ** The lock hierarchy : one can obtain the bucket lock while holding
2281 ** the global lock, but not vice versa.
/*
 * afsHash - one-time allocation and initialization of the semaphore-save
 * hash table: nbuckets zeroed Buckets, one lock per bucket, plus the
 * global afsHashLock.  Panics if called twice or if allocation fails.
 * NOTE(review): listing is elided; some guards/braces are not visible.
 */
2286 afsHash(int nbuckets)
2287 { /* allocate the hash table */
2290 #if AFS_SV_SEMA_HASH_DEBUG
2291 printf("afsHash: enter\n");
2294 sizeOfHashTable = nbuckets;
2295 currentSize = nbuckets * sizeof(Bucket);
2298 osi_Panic("afs: SEMA Hashtable already created\n");
2300 hashTable = (Bucket *) AFS_KALLOC(sizeOfHashTable * sizeof(Bucket));
2302 osi_Panic("afs: cannot create SEMA Hashtable\n");
2304 /* initialize the hash table and associated locks */
2305 memset((char *)hashTable, 0, sizeOfHashTable * sizeof(Bucket));
2306 for (i = 0; i < sizeOfHashTable; i++)
2307 hashLockInit(hashTable[i].lock);
2308 hashLockInit(afsHashLock);
2310 #if AFS_SV_SEMA_HASH_DEBUG
2311 printf("afsHash: exit\n");
/*
 * afsHashInsertFind - look up key in the semaphore-save hash table,
 * bumping its refCnt if found; otherwise allocate an Element (freeList
 * first, then garbage-collect, then AFS_KALLOC), initialize it with
 * refCnt = 1 and insert it at the head of the bucket chain.
 * Returns a pointer to the entry's embedded sv_sema_t save area.
 * Locking: per-bucket lock for chain walks/insertion; afsHashLock for
 * freeList/currentSize.  NOTE(review): listing is elided; loop headers
 * and some branches are not visible.
 */
2316 afsHashInsertFind(KEY key)
2321 #if AFS_SV_SEMA_HASH_DEBUG
2322 printf("afsHashInsertFind: %d\n", key);
2325 osi_Panic("afs: afsHashInsertFind: no hashTable\n");
2327 index = Hash(key); /* get bucket number */
2328 hashLock(hashTable[index].lock); /* lock this bucket */
2329 ptr = hashTable[index].element;
2331 /* if it is already there */
2333 if (ptr->key == key) {
2334 ptr->refCnt++; /* hold it */
2335 hashUnlock(hashTable[index].lock);
2336 #if AFS_SV_SEMA_HASH_DEBUG
2337 printf("afsHashInsertFind: %d FOUND\n", key);
2339 return &(ptr->element);
2345 hashUnlock(hashTable[index].lock);
2347 /* if something exists in the freeList, take it from there */
2349 hashLock(afsHashLock);
2352 ptr = freeList; /* reuse entry */
2353 freeList = freeList->next;
/* freeList empty: try reclaiming zero-ref entries, then retry freeList. */
2355 afsHashGarbageCollect(); /* afsHashLock locked */
2357 ptr = freeList; /* reuse entry */
2358 freeList = freeList->next;
/* Still nothing reusable: allocate a brand-new Element. */
2360 ptr = (Element *) AFS_KALLOC(sizeof(Element));
2364 currentSize += sizeof(Element); /* update memory used */
2365 hashUnlock(afsHashLock);
2368 osi_Panic("afs: SEMA Hashtable cannot create new entry\n");
2369 /* create new entry */
2371 memset((char *)&ptr->element, 0, sizeof(ptr->element));
2372 ptr->refCnt = 1; /* this guy */
2374 /* insert new entry in bucket */
2375 hashLock(hashTable[index].lock); /* lock this bucket */
2376 ptr->next = hashTable[index].element;
2377 hashTable[index].element = ptr;
2378 hashUnlock(hashTable[index].lock);
2380 #if AFS_SV_SEMA_HASH_DEBUG
2381 printf("afsHashInsertFind: %d MADE\n", key);
2384 return &(ptr->element);
/*
 * afsHashFind - look up an existing, held (refCnt > 0) entry for key and
 * return its sv_sema_t save area.  Unlike afsHashInsertFind, the entry
 * MUST already exist: a missing key or a released entry is a panic.
 * NOTE(review): listing is elided; the chain-walk loop header is not
 * visible.
 */
2388 afsHashFind(KEY key)
2393 #if AFS_SV_SEMA_HASH_DEBUG
2394 printf("afsHashFind: %d\n", key);
2397 osi_Panic("afs: afsHashFind: no hashTable\n");
2399 index = Hash(key); /* get bucket number */
2400 hashLock(hashTable[index].lock); /* lock this bucket */
2401 ptr = hashTable[index].element;
2403 /* it should be in the hash table */
2405 if (ptr->key == key) {
2406 if (ptr->refCnt <= 0)
2407 osi_Panic("afs: SEMA HashTable entry already released\n");
2408 hashUnlock(hashTable[index].lock);
2409 #if AFS_SV_SEMA_HASH_DEBUG
2410 printf("afsHashFind: %d FOUND\n", key);
2412 return &(ptr->element);
2418 hashUnlock(hashTable[index].lock);
2419 /* it better be in the hash table */
2420 osi_Panic("afs: SEMA HashTable wants non-existent entry \n");
/*
 * afsHashRelease - drop one reference on the entry for key (refCnt--).
 * The entry is not unlinked here; zero-ref entries are reclaimed later
 * by afsHashGarbageCollect().  Missing key or an already-released entry
 * is a panic.  NOTE(review): listing is elided; the chain-walk loop
 * header is not visible.
 */
2425 afsHashRelease(KEY key)
2430 #if AFS_SV_SEMA_HASH_DEBUG
2431 printf("afsHashRelease: %d\n", key);
2434 osi_Panic("afs: afsHashRelease: no hashTable\n");
2436 index = Hash(key); /* get bucket number */
2437 hashLock(hashTable[index].lock); /* lock this bucket */
2438 ptr = hashTable[index].element;
2440 /* it should be in the hash table */
2442 if (ptr->key == key) {
2443 if (ptr->refCnt <= 0)
2444 osi_Panic("afs: SEMA HashTable entry already released\n");
2445 ptr->refCnt--; /* release this guy */
2446 hashUnlock(hashTable[index].lock);
2447 #if AFS_SV_SEMA_HASH_DEBUG
2448 printf("afsHashRelease: %d FOUND\n", key);
2456 hashUnlock(hashTable[index].lock);
2457 /* it better be in the hash table */
2458 osi_Panic("afs: SEMA HashTable deleting non-existent entry \n");
/*
 * afsHashGarbageCollect - sweep every bucket, moving all refCnt == 0
 * entries onto the global freeList and decrementing currentSize for each
 * (caller must hold afsHashLock -- see the original comment).  Handles
 * the head-of-chain run first, then collects zero-ref entries from the
 * middle of the chain.  Panics if no entry was reclaimable ("full").
 * NOTE(review): listing is elided; several statements (freeList pushes,
 * loop headers) are not visible.
 */
2461 /* this should be called with afsHashLock WRITE locked */
2463 afsHashGarbageCollect()
2470 osi_Panic("afs: afsHashGarbageCollect: no hashTable\n");
2472 for (index = 0; index < sizeOfHashTable; index++) {
2473 hashLock(hashTable[index].lock);
2474 ptr = hashTable[index].element; /* pick up bucket */
/* Phase 1: peel zero-ref entries off the head of the chain. */
2476 while (ptr && !ptr->refCnt) {
2477 /* insert this element into free list */
2480 ptr->next = freeList;
2483 foundFlag = 1; /* found at least one */
2484 currentSize -= sizeof(Element);
2487 hashTable[index].element = ptr;
2489 /* scan thru the remaining list */
/* Phase 2: unlink zero-ref entries from the middle of the chain. */
2492 if (ptr->next->refCnt == 0) {
2493 /* collect this element */
2496 ptr->next = ptr->next->next;
2497 temp->next = freeList;
2500 currentSize -= sizeof(Element);
2506 hashUnlock(hashTable[index].lock);
2510 osi_Panic("afs: SEMA HashTable full\n");
2514 #endif /* AFS_SV_SEMA_HASH */
/*
 * Body of afs_hp_strategy (function header line elided from this listing;
 * identity inferred from AFS_STATCNT(afs_hp_strategy) below -- confirm).
 * Services a buf-based paging request against an AFS vnode: builds a uio
 * over bp's kernel buffer at byte offset b_blkno * DEV_BSIZE and calls
 * afs_rdwr() for the read or write; on a short read the unread tail of
 * the buffer is zeroed.
 * NOTE(review): listing is elided; hdl_kmap_bp()/hdl_remap_bp() calls and
 * completion handling are described by comments but not visible here.
 */
2518 register struct buf *bp;
2520 register afs_int32 code;
2522 struct iovec tiovec[1];
2523 extern caddr_t hdl_kmap_bp();
2524 register struct kthread *t = u.u_kthreadp;
2526 AFS_STATCNT(afs_hp_strategy);
2528 * hdl_kmap_bp() saves "b_bcount" and restores it in hdl_remap_bp() after
2529 * the I/O. We must save and restore the count because pageiodone()
2530 * uses b_bcount to determine how many pages to unlock.
2532 * Remap the entire range.
2537 afs_Trace4(afs_iclSetp, CM_TRACE_HPSTRAT, ICL_TYPE_POINTER, bp->b_vp,
2538 ICL_TYPE_LONG, (int)bp->b_blkno * DEV_BSIZE, ICL_TYPE_LONG,
2539 bp->b_bcount, ICL_TYPE_LONG, 0);
2541 /* Set up the uio structure */
2542 tuio.afsio_iov = tiovec;
2543 tuio.afsio_iovcnt = 1;
2544 tuio.afsio_offset = DEV_BSIZE * bp->b_blkno;
2545 tuio.afsio_seg = AFS_UIOSYS;
2546 tuio.afsio_resid = bp->b_bcount;
2547 tuio.uio_fpflags = 0;
2548 tiovec[0].iov_base = bp->b_un.b_addr;
2549 tiovec[0].iov_len = bp->b_bcount;
2552 if ((bp->b_flags & B_READ) == B_READ) {
2553 /* read b_bcount bytes into kernel address b_un.b_addr
2554 * starting at byte DEV_BSIZE * b_blkno. Bzero anything
2555 * we can't read, and finally call iodone(bp). File is
2556 * in bp->b_vp. Credentials are from u area??
2558 code = afs_rdwr(VTOAFS(bp->b_vp), &tuio, UIO_READ, 0, kt_cred(t));
/* Short read: zero the portion of the buffer that was not filled. */
2560 if (tuio.afsio_resid > 0) {
2561 privlbzero(bvtospace(bp, bp->b_un.b_addr),
2562 bp->b_un.b_addr + bp->b_bcount - tuio.afsio_resid,
2563 (size_t) tuio.afsio_resid);
2567 code = afs_rdwr(VTOAFS(bp->b_vp), &tuio, UIO_WRITE, 0, kt_cred(t));
2569 /* Remap back to the user's space */
2578 afs_pathconf(vp, name, resultp, cred)
2582 struct ucred *cred; /* unused */
2585 case _PC_LINK_MAX: /* Maximum number of links to a file */
2586 *resultp = 255; /* an unsigned short on the fileserver */
2587 break; /* a unsigned char in the client.... */
2589 case _PC_NAME_MAX: /* Max length of file name */
2593 case _PC_PATH_MAX: /* Maximum length of Path Name */
2597 case _PC_PIPE_BUF: /* Max atomic write to pipe. See fifo_vnops */
2598 case _PC_CHOWN_RESTRICTED: /* Anybody can chown? */
2599 case _PC_NO_TRUNC: /* No file name truncation on overflow? */
2600 u.u_error = EOPNOTSUPP;
2601 return (EOPNOTSUPP);
2604 case _PC_MAX_CANON: /* TTY buffer size for canonical input */
2605 /* need more work here for pty, ite buffer size, if differ */
2606 if (vp->v_type != VCHR) {
2610 *resultp = CANBSIZ; /*for tty */
2614 /* need more work here for pty, ite buffer size, if differ */
2615 if (vp->v_type != VCHR) { /* TTY buffer size */
2619 *resultp = TTYHOG; /*for tty */
2623 /* Terminal special characters can be disabled? */
2624 if (vp->v_type != VCHR) {
2632 if ((vp->v_type != VREG) && (vp->v_type != VBLK)) {
2636 *resultp = 1; /* Synchronized IO supported for this file */
2639 case _PC_FILESIZEBITS:
2640 if (vp->v_type != VDIR)
2642 *resultp = MAX_SMALL_FILE_BITS;