summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorroot <root>2008-10-27 11:08:29 +0000
committerroot <root>2008-10-27 11:08:29 +0000
commited3f7d82632357b7ba925cb742cd9ef61ceed26d (patch)
tree1c03ccb4ac30a7c147b4b5be39f99b5fe1325e14
parent591339236dd05d5184c977fea78de5f5d82e70e1 (diff)
work around epoll spurious notifications
-rw-r--r--Changes2
-rw-r--r--ev.c2
-rw-r--r--ev.pod11
-rw-r--r--ev_epoll.c20
4 files changed, 26 insertions, 9 deletions
diff --git a/Changes b/Changes
index 72824fc..24be5db 100644
--- a/Changes
+++ b/Changes
@@ -5,6 +5,8 @@ WISH? monotonic clocks times/GetTickCount for coarse corrections?
- further optimise away the EPOLL_CTL_ADD/MOD combo in the epoll
backend by assuming the kernel event mask hasn't changed if
ADD fails with EEXIST.
+ - work around spurious event notification bugs in epoll by using
+ an 8-bit generation counter.
- use memset to initialise most arrays now and do away with the
init functions.
- expand time-out strategies into a "Be smart about timeouts" section.
diff --git a/ev.c b/ev.c
index bcb49b5..66a70ec 100644
--- a/ev.c
+++ b/ev.c
@@ -452,7 +452,7 @@ typedef struct
unsigned char events;
unsigned char reify;
unsigned char emask; /* the epoll backend stores the actual kernel mask in here */
- unsigned char unused; /* currently unused padding */
+ unsigned char egen; /* generation counter to counter epoll bugs */
#if EV_SELECT_IS_WINSOCKET
SOCKET handle;
#endif
diff --git a/ev.pod b/ev.pod
index b0cfaf5..b4e416f 100644
--- a/ev.pod
+++ b/ev.pod
@@ -391,16 +391,19 @@ of shortcomings, such as silently dropping events in some hard-to-detect
cases and requiring a system call per fd change, no fork support and bad
support for dup.
+Epoll is also notoriously buggy - embedding epoll fds should work, but
+of course doesn't, and epoll just loves to report events for totally
+I<different> file descriptors (even already closed ones) than registered
+in the set (especially on SMP systems). Libev tries to counter these
+spurious notifications by employing an additional generation counter and
+comparing that against the events to filter out spurious ones.
+
While stopping, setting and starting an I/O watcher in the same iteration
will result in some caching, there is still a system call per such incident
(because the fd could point to a different file description now), so its
best to avoid that. Also, C<dup ()>'ed file descriptors might not work
very well if you register events for both fds.
-Please note that epoll sometimes generates spurious notifications, so you
-need to use non-blocking I/O or other means to avoid blocking when no data
-(or space) is available.
-
Best performance from this backend is achieved by not unregistering all
watchers for a file descriptor until it has been closed, if possible,
i.e. keep at least one watcher active per fd at all times. Stopping and
diff --git a/ev_epoll.c b/ev_epoll.c
index e36dbcc..4c36cfa 100644
--- a/ev_epoll.c
+++ b/ev_epoll.c
@@ -85,7 +85,8 @@ epoll_modify (EV_P_ int fd, int oev, int nev)
oldmask = anfds [fd].emask;
anfds [fd].emask = nev;
- ev.data.u64 = fd; /* use u64 to fully initialise the struct, for nicer strace etc. */
+ /* store the generation counter in the upper 32 bits */
+ ev.data.u64 = fd | ((uint64_t)++anfds [fd].egen << 32);
ev.events = (nev & EV_READ ? EPOLLIN : 0)
| (nev & EV_WRITE ? EPOLLOUT : 0);
@@ -96,7 +97,7 @@ epoll_modify (EV_P_ int fd, int oev, int nev)
{
/* if ENOENT then the fd went away, so try to do the right thing */
if (!nev)
- return;
+ goto dec_egen;
if (!epoll_ctl (backend_fd, EPOLL_CTL_ADD, fd, &ev))
return;
@@ -105,11 +106,18 @@ epoll_modify (EV_P_ int fd, int oev, int nev)
{
/* EEXIST means we ignored a previous DEL, but the fd is still active */
/* if the kernel mask is the same as the new mask, we assume it hasn't changed */
- if (oldmask == nev || !epoll_ctl (backend_fd, EPOLL_CTL_MOD, fd, &ev))
+ if (oldmask == nev)
+ goto dec_egen;
+
+ if (!epoll_ctl (backend_fd, EPOLL_CTL_MOD, fd, &ev))
return;
}
fd_kill (EV_A_ fd);
+
+dec_egen:
+ /* we didn't successfully call epoll_ctl, so decrement the generation counter again */
+ --anfds [fd].egen;
}
static void
@@ -130,11 +138,15 @@ epoll_poll (EV_P_ ev_tstamp timeout)
{
struct epoll_event *ev = epoll_events + i;
- int fd = ev->data.u64;
+ int fd = (uint32_t)ev->data.u64; /* mask out the lower 32 bits */
int got = (ev->events & (EPOLLOUT | EPOLLERR | EPOLLHUP) ? EV_WRITE : 0)
| (ev->events & (EPOLLIN | EPOLLERR | EPOLLHUP) ? EV_READ : 0);
int want = anfds [fd].events;
+ if (anfds [fd].egen != (unsigned char)(ev->data.u64 >> 32))
+ /*fprintf (stderr, "spurious notification fd %d, %d vs %d\n", fd, (int)(ev->data.u64 >> 32), anfds [fd].egen);*/
+ continue;
+
if (expect_false (got & ~want))
{
anfds [fd].emask = want;