From ed3f7d82632357b7ba925cb742cd9ef61ceed26d Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Oct 2008 11:08:29 +0000 Subject: work around epoll spurious notifications --- Changes | 2 ++ ev.c | 2 +- ev.pod | 11 +++++++---- ev_epoll.c | 20 ++++++++++++++++---- 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/Changes b/Changes index 72824fc..24be5db 100644 --- a/Changes +++ b/Changes @@ -5,6 +5,8 @@ WISH? monotonic clocks times/GetTickCount for coarse corrections? - further optimise away the EPOLL_CTL_ADD/MOD combo in the epoll backend by assuming the kernel event mask hasn't changed if ADD fails with EEXIST. + - work around spurious event notification bugs in epoll by using + an 8-bit generation counter. - use memset to initialise most arrays now and do away with the init functions. - expand time-out strategies into a "Be smart about timeouts" section. diff --git a/ev.c b/ev.c index bcb49b5..66a70ec 100644 --- a/ev.c +++ b/ev.c @@ -452,7 +452,7 @@ typedef struct unsigned char events; unsigned char reify; unsigned char emask; /* the epoll backend stores the actual kernel mask in here */ - unsigned char unused; /* currently unused padding */ + unsigned char egen; /* generation counter to counter epoll bugs */ #if EV_SELECT_IS_WINSOCKET SOCKET handle; #endif diff --git a/ev.pod b/ev.pod index b0cfaf5..b4e416f 100644 --- a/ev.pod +++ b/ev.pod @@ -391,16 +391,19 @@ of shortcomings, such as silently dropping events in some hard-to-detect cases and requiring a system call per fd change, no fork support and bad support for dup. +Epoll is also notoriously buggy - embedding epoll fds should work, but +of course doesn't, and epoll just loves to report events for totally +I file descriptors (even already closed ones) than registered +in the set (especially on SMP systems). Libev tries to counter these +spurious notifications by employing an additional generation counter and +comparing that against the events to filter out spurious ones. + While stopping, setting and starting an I/O watcher in the same iteration will result in some caching, there is still a system call per such incident (because the fd could point to a different file description now), so its best to avoid that. Also, C'ed file descriptors might not work very well if you register events for both fds. -Please note that epoll sometimes generates spurious notifications, so you -need to use non-blocking I/O or other means to avoid blocking when no data -(or space) is available. - Best performance from this backend is achieved by not unregistering all watchers for a file descriptor until it has been closed, if possible, i.e. keep at least one watcher active per fd at all times. Stopping and diff --git a/ev_epoll.c b/ev_epoll.c index e36dbcc..4c36cfa 100644 --- a/ev_epoll.c +++ b/ev_epoll.c @@ -85,7 +85,8 @@ epoll_modify (EV_P_ int fd, int oev, int nev) oldmask = anfds [fd].emask; anfds [fd].emask = nev; - ev.data.u64 = fd; /* use u64 to fully initialise the struct, for nicer strace etc. */ + /* store the generation counter in the upper 32 bits */ + ev.data.u64 = fd | ((uint64_t)++anfds [fd].egen << 32); ev.events = (nev & EV_READ ? EPOLLIN : 0) | (nev & EV_WRITE ? EPOLLOUT : 0); @@ -96,7 +97,7 @@ epoll_modify (EV_P_ int fd, int oev, int nev) { /* if ENOENT then the fd went away, so try to do the right thing */ if (!nev) - return; + goto dec_egen; if (!epoll_ctl (backend_fd, EPOLL_CTL_ADD, fd, &ev)) return; @@ -105,11 +106,18 @@ epoll_modify (EV_P_ int fd, int oev, int nev) { /* EEXIST means we ignored a previous DEL, but the fd is still active */ /* if the kernel mask is the same as the new mask, we assume it hasn't changed */ - if (oldmask == nev || !epoll_ctl (backend_fd, EPOLL_CTL_MOD, fd, &ev)) + if (oldmask == nev) + goto dec_egen; + + if (!epoll_ctl (backend_fd, EPOLL_CTL_MOD, fd, &ev)) return; } fd_kill (EV_A_ fd); + +dec_egen: + /* we didn't successfully call epoll_ctl, so decrement the generation counter again */ + --anfds [fd].egen; } static void @@ -130,11 +138,15 @@ epoll_poll (EV_P_ ev_tstamp timeout) { struct epoll_event *ev = epoll_events + i; - int fd = ev->data.u64; + int fd = (uint32_t)ev->data.u64; /* mask out the lower 32 bits */ int got = (ev->events & (EPOLLOUT | EPOLLERR | EPOLLHUP) ? EV_WRITE : 0) | (ev->events & (EPOLLIN | EPOLLERR | EPOLLHUP) ? EV_READ : 0); int want = anfds [fd].events; + if (anfds [fd].egen != (unsigned char)(ev->data.u64 >> 32)) + /*fprintf (stderr, "spurious notification fd %d, %d vs %d\n", fd, (int)(ev->data.u64 >> 32), anfds [fd].egen);*/ + continue; + if (expect_false (got & ~want)) { anfds [fd].emask = want; -- cgit v1.2.3