2 * Copyright (c) 2011 Your File System Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR `AS IS'' AND ANY EXPRESS OR
14 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
15 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
16 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
17 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
18 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
19 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
20 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
22 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 /* A reimplementation of the rx_event handler using red/black trees
27 * The first rx_event implementation used a simple sorted queue of all
28 * events, which lead to O(n^2) performance, where n is the number of
29 * outstanding events. This was found to scale poorly, so was replaced.
31 * The second implementation used a set of per-second buckets to store
32 * events. Each bucket (referred to as an epoch in the code) stored all
33 * of the events which expired in that second. However, on modern networks
34 * where RTT times are in the millisecond, most connections will have events
35 * expiring within the next second, so the problem reoccurs.
37 * This new implementation uses Red-Black trees to store a sorted list of
38 * events. Red Black trees are guaranteed to have no worse than O(log N)
39 * insertion, and are commonly used in timer applications
42 #include <afsconfig.h>
43 #include <afs/param.h>
46 # include "afs/sysincludes.h"
47 # include "afsincludes.h"
53 #include <opr/queue.h>
54 #include <opr/rbtree.h>
57 #include "rx_atomic.h"
61 struct opr_rbtree_node node;
62 struct clock eventTime;
66 void (*func)(struct rxevent *, void *, void *, int);
75 struct malloclist *next;
80 struct opr_queue list;
81 struct malloclist *mallocs;
86 struct opr_rbtree head;
87 struct rxevent *first;
98 static int allocUnit = 10;
100 static struct rxevent *
101 rxevent_alloc(void) {
102 struct rxevent *evlist;
104 struct malloclist *mrec;
107 MUTEX_ENTER(&freeEvents.lock);
108 if (opr_queue_IsEmpty(&freeEvents.list)) {
109 MUTEX_EXIT(&freeEvents.lock);
111 #if defined(AFS_AIX32_ENV) && defined(KERNEL)
112 ev = rxi_Alloc(sizeof(struct rxevent));
114 evlist = osi_Alloc(sizeof(struct rxevent) * allocUnit);
115 mrec = osi_Alloc(sizeof(struct malloclist));
118 mrec->size = sizeof(struct rxevent) * allocUnit;
120 MUTEX_ENTER(&freeEvents.lock);
121 for (i = 1; i < allocUnit; i++) {
122 opr_queue_Append(&freeEvents.list, &evlist[i].q);
124 mrec->next = freeEvents.mallocs;
125 freeEvents.mallocs = mrec;
126 MUTEX_EXIT(&freeEvents.lock);
130 ev = opr_queue_First(&freeEvents.list, struct rxevent, q);
131 opr_queue_Remove(&ev->q);
132 MUTEX_EXIT(&freeEvents.lock);
135 memset(ev, 0, sizeof(struct rxevent));
136 rx_atomic_set(&ev->refcnt, 1);
142 rxevent_free(struct rxevent *ev) {
143 MUTEX_ENTER(&freeEvents.lock);
144 opr_queue_Prepend(&freeEvents.list, &ev->q);
145 MUTEX_EXIT(&freeEvents.lock);
149 rxevent_put(struct rxevent *ev) {
150 if (rx_atomic_dec_and_read(&ev->refcnt) == 0) {
156 rxevent_Put(struct rxevent *ev) {
160 static_inline struct rxevent *
161 rxevent_get(struct rxevent *ev) {
162 rx_atomic_inc(&ev->refcnt);
167 rxevent_Get(struct rxevent *ev) {
168 return rxevent_get(ev);
171 /* Called if the time now is older than the last time we recorded running an
172 * event. This test catches machines where the system time has been set
173 * backwards, and avoids RX completely stalling when timers fail to fire.
175 * Take the different between now and the last event time, and subtract that
176 * from the timing of every event on the system. This does a relatively slow
177 * walk of the completely eventTree, but time-travel will hopefully be a pretty
180 * This can only safely be called from the event thread, as it plays with the
187 struct opr_rbtree_node *node;
188 struct clock adjTime, now;
190 MUTEX_ENTER(&eventTree.lock);
191 /* Time adjustment is expensive, make absolutely certain that we have
192 * to do it, by getting an up to date time to base our decision on
193 * once we've acquired the relevant locks.
196 if (!clock_Lt(&now, &eventSchedule.last))
199 adjTime = eventSchedule.last;
200 clock_Zero(&eventSchedule.last);
202 clock_Sub(&adjTime, &now);
204 node = opr_rbtree_first(&eventTree.head);
206 struct rxevent *event = opr_containerof(node, struct rxevent, node);
208 clock_Sub(&event->eventTime, &adjTime);
209 node = opr_rbtree_next(node);
211 eventSchedule.next = eventTree.first->eventTime;
214 MUTEX_EXIT(&eventTree.lock);
217 static int initialised = 0;
219 rxevent_Init(int nEvents, void (*scheduler)(void))
227 MUTEX_INIT(&eventTree.lock, "event tree lock", MUTEX_DEFAULT, 0);
228 opr_rbtree_init(&eventTree.head);
230 MUTEX_INIT(&freeEvents.lock, "free events lock", MUTEX_DEFAULT, 0);
231 opr_queue_Init(&freeEvents.list);
232 freeEvents.mallocs = NULL;
237 clock_Zero(&eventSchedule.next);
238 clock_Zero(&eventSchedule.last);
239 eventSchedule.raised = 0;
240 eventSchedule.func = scheduler;
244 rxevent_Post(struct clock *when, struct clock *now,
245 void (*func) (struct rxevent *, void *, void *, int),
246 void *arg, void *arg1, int arg2)
248 struct rxevent *ev, *event;
249 struct opr_rbtree_node **childptr, *parent = NULL;
251 ev = rxevent_alloc();
252 ev->eventTime = *when;
258 if (clock_Lt(now, &eventSchedule.last))
261 MUTEX_ENTER(&eventTree.lock);
263 /* Work out where in the tree we'll be storing this */
264 childptr = &eventTree.head.root;
267 event = opr_containerof((*childptr), struct rxevent, node);
270 if (clock_Lt(when, &event->eventTime))
271 childptr = &(*childptr)->left;
272 else if (clock_Gt(when, &event->eventTime))
273 childptr = &(*childptr)->right;
275 opr_queue_Append(&event->q, &ev->q);
279 opr_queue_Init(&ev->q);
280 opr_rbtree_insert(&eventTree.head, parent, childptr, &ev->node);
282 if (eventTree.first == NULL ||
283 clock_Lt(when, &(eventTree.first->eventTime))) {
284 eventTree.first = ev;
285 eventSchedule.raised = 1;
286 clock_Zero(&eventSchedule.next);
287 MUTEX_EXIT(&eventTree.lock);
288 if (eventSchedule.func != NULL)
289 (*eventSchedule.func)();
290 return rxevent_get(ev);
294 MUTEX_EXIT(&eventTree.lock);
295 return rxevent_get(ev);
298 /* We're going to remove ev from the tree, so set the first pointer to the
299 * next event after it */
301 resetFirst(struct rxevent *ev)
303 struct opr_rbtree_node *next = opr_rbtree_next(&ev->node);
305 eventTree.first = opr_containerof(next, struct rxevent, node);
307 eventTree.first = NULL;
311 rxevent_Cancel(struct rxevent **evp, struct rx_call *call, int type)
313 struct rxevent *event;
320 MUTEX_ENTER(&eventTree.lock);
322 if (!event->handled) {
323 /* We're a node on the red/black tree. If our list is non-empty,
324 * then swap the first element in the list in in our place,
325 * promoting it to the list head */
326 if (event->node.parent == NULL
327 && eventTree.head.root != &event->node) {
328 /* Not in the rbtree, therefore must be a list element */
329 opr_queue_Remove(&event->q);
331 if (!opr_queue_IsEmpty(&event->q)) {
332 struct rxevent *next;
334 next = opr_queue_First(&event->q, struct rxevent, q);
335 opr_queue_Remove(&next->q); /* Remove ourselves from list */
336 if (event->q.prev == &event->q) {
337 next->q.prev = next->q.next = &next->q;
340 next->q.prev->next = &next->q;
341 next->q.next->prev = &next->q;
344 opr_rbtree_replace(&eventTree.head, &event->node,
347 if (eventTree.first == event)
348 eventTree.first = next;
351 if (eventTree.first == event)
354 opr_rbtree_remove(&eventTree.head, &event->node);
358 rxevent_put(event); /* Dispose of eventTree reference */
361 MUTEX_EXIT(&eventTree.lock);
364 rxevent_put(event); /* Dispose of caller's reference */
367 CALL_RELE(call, type);
370 /* Process all events which have expired. If events remain, then the relative
371 * time until the next event is returned in the parameter 'wait', and the
372 * function returns 1. If no events currently remain, the function returns 0
374 * If the current time is older than that of the last event processed, then we
375 * assume that time has gone backwards (for example, due to a system time reset)
376 * When this happens, all events in the current queue are rescheduled, using
377 * the difference between the current time and the last event time as a delta
381 rxevent_RaiseEvents(struct clock *wait)
384 struct rxevent *event;
389 /* Check for time going backwards */
390 if (clock_Lt(&now, &eventSchedule.last))
392 eventSchedule.last = now;
394 MUTEX_ENTER(&eventTree.lock);
395 /* Lock our event tree */
396 while (eventTree.first != NULL
397 && clock_Lt(&eventTree.first->eventTime, &now)) {
399 /* Grab the next node, either in the event's list, or in the tree node
400 * itself, and remove it from the event tree */
401 event = eventTree.first;
402 if (!opr_queue_IsEmpty(&event->q)) {
403 event = opr_queue_Last(&event->q, struct rxevent, q);
404 opr_queue_Remove(&event->q);
407 opr_rbtree_remove(&eventTree.head, &event->node);
410 MUTEX_EXIT(&eventTree.lock);
412 /* Fire the event, then free the structure */
413 event->func(event, event->arg, event->arg1, event->arg2);
416 MUTEX_ENTER(&eventTree.lock);
419 /* Figure out when we next need to be scheduled */
420 if (eventTree.first != NULL) {
421 *wait = eventSchedule.next = eventTree.first->eventTime;
422 ret = eventSchedule.raised = 1;
423 clock_Sub(wait, &now);
425 ret = eventSchedule.raised = 0;
428 MUTEX_EXIT(&eventTree.lock);
434 shutdown_rxevent(void)
436 struct malloclist *mrec, *nmrec;
441 MUTEX_DESTROY(&eventTree.lock);
443 #if !defined(AFS_AIX32_ENV) || !defined(KERNEL)
444 MUTEX_DESTROY(&freeEvents.lock);
445 mrec = freeEvents.mallocs;
448 osi_Free(mrec->mem, mrec->size);
449 osi_Free(mrec, sizeof(struct malloclist));