From ed3f7d82632357b7ba925cb742cd9ef61ceed26d Mon Sep 17 00:00:00 2001
From: root <root>
Date: Mon, 27 Oct 2008 11:08:29 +0000
Subject: work around epoll spurious notifications

---
 Changes    |  2 ++
 ev.c       |  2 +-
 ev.pod     | 11 +++++++----
 ev_epoll.c | 20 ++++++++++++++++----
 4 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/Changes b/Changes
index 72824fc..24be5db 100644
--- a/Changes
+++ b/Changes
@@ -5,6 +5,8 @@ WISH? monotonic clocks times/GetTickCount for coarse corrections?
 	- further optimise away the EPOLL_CTL_ADD/MOD combo in the epoll
           backend by assuming the kernel event mask hasn't changed if
           ADD fails with EEXIST.
+        - work around spurious event notification bugs in epoll by using
+          an 8-bit generation counter.
         - use memset to initialise most arrays now and do away with the
           init functions.
         - expand time-out strategies into a "Be smart about timeouts" section.
diff --git a/ev.c b/ev.c
index bcb49b5..66a70ec 100644
--- a/ev.c
+++ b/ev.c
@@ -452,7 +452,7 @@ typedef struct
   unsigned char events;
   unsigned char reify;
   unsigned char emask; /* the epoll backend stores the actual kernel mask in here */
-  unsigned char unused; /* currently unused padding */
+  unsigned char egen;  /* generation counter to counter epoll bugs */
 #if EV_SELECT_IS_WINSOCKET
   SOCKET handle;
 #endif
diff --git a/ev.pod b/ev.pod
index b0cfaf5..b4e416f 100644
--- a/ev.pod
+++ b/ev.pod
@@ -391,16 +391,19 @@ of shortcomings, such as silently dropping events in some hard-to-detect
 cases and requiring a system call per fd change, no fork support and bad
 support for dup.
 
+Epoll is also notoriously buggy - embedding epoll fds should work, but
+of course doesn't, and epoll just loves to report events for totally
+I<different> file descriptors (even already closed ones) than registered
+in the set (especially on SMP systems). Libev tries to counter these
+spurious notifications by employing an additional generation counter and
+comparing that against the events to filter out spurious ones.
+
 While stopping, setting and starting an I/O watcher in the same iteration
 will result in some caching, there is still a system call per such incident
 (because the fd could point to a different file description now), so its
 best to avoid that. Also, C<dup ()>'ed file descriptors might not work
 very well if you register events for both fds.
 
-Please note that epoll sometimes generates spurious notifications, so you
-need to use non-blocking I/O or other means to avoid blocking when no data
-(or space) is available.
-
 Best performance from this backend is achieved by not unregistering all
 watchers for a file descriptor until it has been closed, if possible,
 i.e. keep at least one watcher active per fd at all times. Stopping and
diff --git a/ev_epoll.c b/ev_epoll.c
index e36dbcc..4c36cfa 100644
--- a/ev_epoll.c
+++ b/ev_epoll.c
@@ -85,7 +85,8 @@ epoll_modify (EV_P_ int fd, int oev, int nev)
   oldmask = anfds [fd].emask;
   anfds [fd].emask = nev;
 
-  ev.data.u64 = fd; /* use u64 to fully initialise the struct, for nicer strace etc. */
+  /* store the generation counter in the upper 32 bits */
+  ev.data.u64 = fd | ((uint64_t)++anfds [fd].egen << 32);
   ev.events   = (nev & EV_READ  ? EPOLLIN  : 0)
               | (nev & EV_WRITE ? EPOLLOUT : 0);
 
@@ -96,7 +97,7 @@ epoll_modify (EV_P_ int fd, int oev, int nev)
     {
       /* if ENOENT then the fd went away, so try to do the right thing */
       if (!nev)
-        return;
+        goto dec_egen;
 
       if (!epoll_ctl (backend_fd, EPOLL_CTL_ADD, fd, &ev))
         return;
@@ -105,11 +106,18 @@ epoll_modify (EV_P_ int fd, int oev, int nev)
     {
       /* EEXIST means we ignored a previous DEL, but the fd is still active */
       /* if the kernel mask is the same as the new mask, we assume it hasn't changed */
-      if (oldmask == nev || !epoll_ctl (backend_fd, EPOLL_CTL_MOD, fd, &ev))
+      if (oldmask == nev)
+        goto dec_egen;
+
+      if (!epoll_ctl (backend_fd, EPOLL_CTL_MOD, fd, &ev))
         return;
     }
 
   fd_kill (EV_A_ fd);
+
+dec_egen:
+  /* we didn't successfully call epoll_ctl, so decrement the generation counter again */
+  --anfds [fd].egen;
 }
 
 static void
@@ -130,11 +138,15 @@ epoll_poll (EV_P_ ev_tstamp timeout)
     {
       struct epoll_event *ev = epoll_events + i;
 
-      int fd   = ev->data.u64;
+      int fd   = (uint32_t)ev->data.u64; /* mask out the lower 32 bits */
       int got  = (ev->events & (EPOLLOUT | EPOLLERR | EPOLLHUP) ? EV_WRITE : 0)
                | (ev->events & (EPOLLIN  | EPOLLERR | EPOLLHUP) ? EV_READ  : 0);
       int want = anfds [fd].events;
 
+      if (anfds [fd].egen != (unsigned char)(ev->data.u64 >> 32))
+        /*fprintf (stderr, "spurious notification fd %d, %d vs %d\n", fd, (int)(ev->data.u64 >> 32), anfds [fd].egen);*/
+        continue;
+
       if (expect_false (got & ~want))
         {
           anfds [fd].emask = want;
-- 
cgit v1.2.3