From 9da8fdcb33373b4b2e1de2a8b7af3ed4b5811245 Mon Sep 17 00:00:00 2001
From: rpj <rpj>
Date: Fri, 8 Oct 2004 12:03:18 +0000
Subject: Mutex speedups

---
 ChangeLog                          |  27 ++++++
 GNUmakefile                        |  11 +--
 Makefile                           |   6 +-
 implement.h                        |  11 ++-
 private.c                          |   3 +-
 pthread_barrier_wait.c             |   2 +-
 pthread_mutex_destroy.c            |   1 -
 pthread_mutex_init.c               |   4 -
 pthread_mutex_lock.c               |  69 +++++++++++----
 pthread_mutex_timedlock.c          | 175 ++++++++++++++++---------------------
 pthread_mutex_trylock.c            |   2 +-
 pthread_mutex_unlock.c             |  22 ++---
 pthread_spin_destroy.c             |   2 +-
 pthread_spin_lock.c                |   2 +-
 pthread_spin_trylock.c             |   2 +-
 pthread_spin_unlock.c              |   2 +-
 ptw32_InterlockedCompareExchange.c |  12 ++-
 tests/rwlock7.c                    |   2 +-
 18 files changed, 197 insertions(+), 158 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 9a3cd4f..ec65d84 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,30 @@
+2004-10-08  Ross Johnson  <rpj at callisto.canberra.edu.au>
+
+	* pthread_mutex_destroy.c (pthread_mutex_destroy): Critical Section
+	element is no longer required.
+	* pthread_mutex_init.c (pthread_mutex_init): Likewise.
+	* pthread_mutex_lock.c (pthread_mutex_lock): New algorithm following Drepper's
+	paper at http://people.redhat.com/drepper/futex.pdf, but using the existing
+	semaphore in place of the futex described in the paper. Idea suggested by
+	Alexander Terekhov - see:
+	http://sources.redhat.com/ml/pthreads-win32/2003/msg00108.html
+	* pthread_mutex_timedlock.c (pthread_mutex_timedlock): Similarly.
+	* pthread_mutex_trylock.c (pthread_mutex_trylock): Similarly.
+	* pthread_mutex_unlock.c (pthread_mutex_unlock): Similarly.
+	* pthread_barrier_wait.c (pthread_barrier_wait): Use inlined version of
+	InterlockedCompareExchange() if possible - determined at build-time.
+	* pthread_spin_destroy.c (pthread_spin_destroy): Likewise.
+	* pthread_spin_lock.c (pthread_spin_lock): Likewise.
+	* pthread_spin_trylock.c (pthread_spin_trylock): Likewise.
+	* pthread_spin_unlock.c (pthread_spin_unlock): Likewise.
+	* ptw32_InterlockedCompareExchange.c: Sets up macro for inlined use.
+	* implement.h (pthread_mutex_t_): Remove Critical Section element.
+	(PTW32_INTERLOCKED_COMPARE_EXCHANGE): Set to default non-inlined version of
+	InterlockedCompareExchange().
+	* private.c: Include ptw32_InterlockedCompareExchange.c first for inlining.
+	* GNUmakefile: Add commandline option to use inlined InterlockedCompareExchange().
+	* Makefile: Likewise.
+
 2004-09-27  Ross Johnson  <rpj at callisto.canberra.edu.au>
 
 	* pthread_mutex_lock.c (pthread_mutex_lock): Separate PTHREAD_MUTEX_NORMAL
diff --git a/GNUmakefile b/GNUmakefile
index b07420f..f479778 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -49,6 +49,7 @@ AR	= ar
 #OPT	= -g
 #OPT	= -O3 -DTEST_ICE
 OPT	= -O3 -finline-functions
+XOPT	=
 
 LFLAGS		= -lwsock32
 
@@ -57,7 +58,7 @@ GCE_CFLAGS	= -D__CLEANUP_CXX -mthreads
 
 ## Mingw32
 MAKE		= make
-CFLAGS	= $(OPT) -I. -DHAVE_CONFIG_H -Wall
+CFLAGS	= $(OPT) $(XOPT) -I. -DHAVE_CONFIG_H -Wall
 
 DLL_INLINED_OBJS	= \
 		pthread.o
@@ -414,10 +415,10 @@ GCE:
 		$(MAKE) CC=g++ CLEANUP_FLAGS="$(GCE_CFLAGS)" OBJ="$(DLL_OBJS)" $(GCE_DLL)
 
 GC-inlined:
-		$(MAKE) CC=gcc CLEANUP_FLAGS="$(GC_CFLAGS)" OBJ="$(DLL_INLINED_OBJS)" $(GC_INLINED_STAMP)
+		$(MAKE) CC=gcc XOPT="-DPTW32_BUILD_INLINED" CLEANUP_FLAGS="$(GC_CFLAGS)" OBJ="$(DLL_INLINED_OBJS)" $(GC_INLINED_STAMP)
 
 GCE-inlined:
-		$(MAKE) CC=g++ CLEANUP_FLAGS="$(GCE_CFLAGS)" OBJ="$(DLL_INLINED_OBJS)" $(GCE_INLINED_STAMP)
+		$(MAKE) CC=g++ XOPT="-DPTW32_BUILD_INLINED" CLEANUP_FLAGS="$(GCE_CFLAGS)" OBJ="$(DLL_INLINED_OBJS)" $(GCE_INLINED_STAMP)
 
 tests:
 	@ cd tests
@@ -445,13 +446,13 @@ $(GCE_DLL): $(DLL_OBJS)
 	dlltool -k --dllname $@ --output-lib $(GCE_LIB) --def $(PTHREAD_DEF)
 
 $(GC_INLINED_STAMP): $(DLL_INLINED_OBJS)
-	$(CC) $(OPT) -shared -o $(GC_DLL) $(DLL_INLINED_OBJS) $(LFLAGS)
+	$(CC) $(OPT) $(XOPT) -shared -o $(GC_DLL) $(DLL_INLINED_OBJS) $(LFLAGS)
 	dlltool -z pthread.def $(DLL_INLINED_OBJS)
 	dlltool -k --dllname $(GC_DLL) --output-lib $(GC_LIB) --def $(PTHREAD_DEF)
 	echo touched > $(GC_INLINED_STAMP)
 
 $(GCE_INLINED_STAMP): $(DLL_INLINED_OBJS)
-	$(CC) $(OPT) -mthreads -shared -o $(GCE_DLL) $(DLL_INLINED_OBJS)  $(LFLAGS)
+	$(CC) $(OPT) $(XOPT) -mthreads -shared -o $(GCE_DLL) $(DLL_INLINED_OBJS)  $(LFLAGS)
 	dlltool -z pthread.def $(DLL_INLINED_OBJS)
 	dlltool -k --dllname $(GCE_DLL) --output-lib $(GCE_LIB) --def $(PTHREAD_DEF)
 	echo touched > $(GCE_INLINED_STAMP)
diff --git a/Makefile b/Makefile
index f8650b2..9fb97bb 100644
--- a/Makefile
+++ b/Makefile
@@ -375,13 +375,13 @@ VC:
 # inlining optimisation turned on.
 #
 VCE-inlined:
-	@ nmake /nologo EHFLAGS="/O2 /Ob1 $(VCEFLAGS)" pthreadVCE.stamp
+	@ nmake /nologo EHFLAGS="/O2 /Ob1 $(VCEFLAGS) /DPTW32_BUILD_INLINED" pthreadVCE.stamp
 
 VSE-inlined:
-	@ nmake /nologo EHFLAGS="/O2 /Ob1 $(VSEFLAGS)" pthreadVSE.stamp
+	@ nmake /nologo EHFLAGS="/O2 /Ob1 $(VSEFLAGS) /DPTW32_BUILD_INLINED" pthreadVSE.stamp
 
 VC-inlined:
-	@ nmake /nologo EHFLAGS="/O2 /Ob1 $(VCFLAGS)" pthreadVC.stamp
+	@ nmake /nologo EHFLAGS="/O2 /Ob1 $(VCFLAGS) /DPTW32_BUILD_INLINED" pthreadVC.stamp
 
 realclean: clean
 	if exist *.dll del *.dll
diff --git a/implement.h b/implement.h
index 400598c..2e320bb 100644
--- a/implement.h
+++ b/implement.h
@@ -192,8 +192,6 @@ struct pthread_mutex_t_
   pthread_t ownerThread;
   sem_t wait_sema;		/* Mutex release notification to waiting
 				   threads. */
-  CRITICAL_SECTION wait_cs;	/* Serialise lock_idx decrement after mutex
-				   timeout. */
 };
 
 struct pthread_mutexattr_t_
@@ -574,6 +572,15 @@ extern "C"
 #   include <process.h>
 #endif
 
+
+/*
+ * Default to the non-inlined routine when not building the inlined dll.
+ */
+#ifndef PTW32_INTERLOCKED_COMPARE_EXCHANGE
+#define PTW32_INTERLOCKED_COMPARE_EXCHANGE ptw32_interlocked_compare_exchange
+#endif
+
+
 /*
  * Check for old and new versions of cygwin. See the FAQ file:
  *
diff --git a/private.c b/private.c
index 6f59e5c..d6171b4 100644
--- a/private.c
+++ b/private.c
@@ -38,6 +38,8 @@
 #include "pthread.h"
 #include "implement.h"
 
+/* Must be first to define HAVE_INLINABLE_INTERLOCKED_CMPXCHG */
+#include "ptw32_InterlockedCompareExchange.c"
 
 #include "ptw32_is_attr.c"
 #include "ptw32_processInitialize.c"
@@ -50,5 +52,4 @@
 #include "ptw32_semwait.c"
 #include "ptw32_timespec.c"
 #include "ptw32_throw.c"
-#include "ptw32_InterlockedCompareExchange.c"
 #include "ptw32_getprocessors.c"
diff --git a/pthread_barrier_wait.c b/pthread_barrier_wait.c
index 200a02d..37c243b 100644
--- a/pthread_barrier_wait.c
+++ b/pthread_barrier_wait.c
@@ -86,7 +86,7 @@ pthread_barrier_wait (pthread_barrier_t * barrier)
   if (0 == result)
     {
       result = ((PTW32_INTERLOCKED_LONG) step ==
-		ptw32_interlocked_compare_exchange ((PTW32_INTERLOCKED_LPLONG)
+		PTW32_INTERLOCKED_COMPARE_EXCHANGE ((PTW32_INTERLOCKED_LPLONG)
 						    & (b->iStep),
 						    (PTW32_INTERLOCKED_LONG)
 						    (1L - step),
diff --git a/pthread_mutex_destroy.c b/pthread_mutex_destroy.c
index c2289d0..1ff9ebe 100644
--- a/pthread_mutex_destroy.c
+++ b/pthread_mutex_destroy.c
@@ -83,7 +83,6 @@ pthread_mutex_destroy (pthread_mutex_t * mutex)
 	      if (result == 0)
 		{
 		  (void) sem_destroy (&mx->wait_sema);
-		  DeleteCriticalSection (&mx->wait_cs);
 		  free (mx);
 		}
 	      else
diff --git a/pthread_mutex_init.c b/pthread_mutex_init.c
index 93504ad..fdb6017 100644
--- a/pthread_mutex_init.c
+++ b/pthread_mutex_init.c
@@ -92,10 +92,6 @@ pthread_mutex_init (pthread_mutex_t * mutex, const pthread_mutexattr_t * attr)
 	  free (mx);
 	  mx = NULL;
 	}
-      else
-	{
-	  InitializeCriticalSection (&mx->wait_cs);
-	}
     }
 
   *mutex = mx;
diff --git a/pthread_mutex_lock.c b/pthread_mutex_lock.c
index b733801..6695907 100644
--- a/pthread_mutex_lock.c
+++ b/pthread_mutex_lock.c
@@ -44,6 +44,7 @@ int
 pthread_mutex_lock (pthread_mutex_t * mutex)
 {
   int result = 0;
+  LONG c;
   pthread_mutex_t mx;
 
   /*
@@ -68,29 +69,48 @@ pthread_mutex_lock (pthread_mutex_t * mutex)
 
   if (mx->kind == PTHREAD_MUTEX_NORMAL)
     {
-      if (0 != InterlockedIncrement (&mx->lock_idx))
+      if ((c = (LONG) PTW32_INTERLOCKED_COMPARE_EXCHANGE(
+		        (PTW32_INTERLOCKED_LPLONG) &mx->lock_idx,
+		        (PTW32_INTERLOCKED_LONG) 0,
+		        (PTW32_INTERLOCKED_LONG) -1)) != -1)
 	{
-	  if (ptw32_semwait (&mx->wait_sema) != 0)
+	  do
 	    {
-	      result = errno;
+	      if (c == 1 ||
+		  (LONG) PTW32_INTERLOCKED_COMPARE_EXCHANGE(
+		           (PTW32_INTERLOCKED_LPLONG) &mx->lock_idx,
+		           (PTW32_INTERLOCKED_LONG) 1,
+		           (PTW32_INTERLOCKED_LONG) 0) != -1)
+		{
+		  if (ptw32_semwait (&mx->wait_sema) != 0)
+		    {
+		      result = errno;
+		      break;
+		    }
+		}
 	    }
+	  while ((c = (LONG) PTW32_INTERLOCKED_COMPARE_EXCHANGE(
+                               (PTW32_INTERLOCKED_LPLONG) &mx->lock_idx,
+		               (PTW32_INTERLOCKED_LONG) 1,
+		               (PTW32_INTERLOCKED_LONG) -1)) != -1);
 	}
     }
   else
     {
-      if (0 == InterlockedIncrement (&mx->lock_idx))
+      pthread_t self = pthread_self();
+
+      if ((c = (LONG) PTW32_INTERLOCKED_COMPARE_EXCHANGE(
+                        (PTW32_INTERLOCKED_LPLONG) &mx->lock_idx,
+		        (PTW32_INTERLOCKED_LONG) 0,
+		        (PTW32_INTERLOCKED_LONG) -1)) == -1)
 	{
 	  mx->recursive_count = 1;
-	  mx->ownerThread = pthread_self ();
+	  mx->ownerThread = self;
 	}
       else
 	{
-	  pthread_t self = pthread_self();
-
 	  if (pthread_equal (mx->ownerThread, self))
 	    {
-	      (void) InterlockedDecrement (&mx->lock_idx);
-	      
 	      if (mx->kind == PTHREAD_MUTEX_RECURSIVE)
 		{
 		  mx->recursive_count++;
@@ -102,15 +122,30 @@ pthread_mutex_lock (pthread_mutex_t * mutex)
 	    }
 	  else
 	    {
-	      if (ptw32_semwait (&mx->wait_sema) == 0)
+	      do
 		{
-		  mx->recursive_count = 1;
-		  mx->ownerThread = self;
-		}
-	      else
-		{
-		  result = errno;
-		}
+		  if (c == 1 ||
+		      (LONG) PTW32_INTERLOCKED_COMPARE_EXCHANGE(
+                               (PTW32_INTERLOCKED_LPLONG) &mx->lock_idx,
+		               (PTW32_INTERLOCKED_LONG) 1,
+		               (PTW32_INTERLOCKED_LONG) 0) != -1)
+	            {
+		      if (ptw32_semwait (&mx->wait_sema) == 0)
+		        {
+		          mx->recursive_count = 1;
+		          mx->ownerThread = self;
+		        }
+		      else
+		        {
+		          result = errno;
+		          break;
+		        }
+		    }
+	        }
+	      while ((c = (LONG) PTW32_INTERLOCKED_COMPARE_EXCHANGE(
+                                   (PTW32_INTERLOCKED_LPLONG) &mx->lock_idx,
+		                   (PTW32_INTERLOCKED_LONG) 1,
+		                   (PTW32_INTERLOCKED_LONG) -1)) != -1);
 	    }
 	}
     }
diff --git a/pthread_mutex_timedlock.c b/pthread_mutex_timedlock.c
index 58f5613..cf929bc 100644
--- a/pthread_mutex_timedlock.c
+++ b/pthread_mutex_timedlock.c
@@ -206,7 +206,7 @@ int
 pthread_mutex_timedlock (pthread_mutex_t * mutex,
 			 const struct timespec *abstime)
 {
-  int result = 0;
+  LONG c;
   pthread_mutex_t mx;
 
 #ifdef NEED_SEM
@@ -226,6 +226,8 @@ pthread_mutex_timedlock (pthread_mutex_t * mutex,
    */
   if (*mutex >= PTHREAD_ERRORCHECK_MUTEX_INITIALIZER)
     {
+      int result;
+
       if ((result = ptw32_mutex_check_need_init (mutex)) != 0)
 	{
 	  return (result);
@@ -236,63 +238,51 @@ pthread_mutex_timedlock (pthread_mutex_t * mutex,
 
   if (mx->kind == PTHREAD_MUTEX_NORMAL)
     {
-      if (0 != InterlockedIncrement (&mx->lock_idx))
+      if ((c = (LONG) PTW32_INTERLOCKED_COMPARE_EXCHANGE(
+		        (PTW32_INTERLOCKED_LPLONG) &mx->lock_idx,
+		        (PTW32_INTERLOCKED_LONG) 0,
+		        (PTW32_INTERLOCKED_LONG) -1)) != -1)
 	{
-	  switch (ptw32_timed_semwait (&mx->wait_sema, abstime))
-	    {
-	      case 0:	/* We got the mutex. */
-		{
-		  break;
-		}
-	      case 1:	/* Timed out. */
-	      case 2:	/* abstime passed before we started to wait. */
-		{
-		  /*
-		   * If we timeout, it is up to us to adjust lock_idx to say
-		   * we're no longer waiting.
-		   *
-		   * The owner thread may still have posted wait_sema thinking
-		   * we were waiting. We must check but then NOT do any
-		   * programmed work if we have acquired the mutex because
-		   * we don't know how long ago abstime was. We MUST just release it
-		   * immediately.
-		   */
-		  EnterCriticalSection (&mx->wait_cs);
-
-		  result = ETIMEDOUT;
-
-		  if (-1 == sem_trywait (&mx->wait_sema))
+          do
+            {
+              if (c == 1 ||
+                  (LONG) PTW32_INTERLOCKED_COMPARE_EXCHANGE(
+                           (PTW32_INTERLOCKED_LPLONG) &mx->lock_idx,
+                           (PTW32_INTERLOCKED_LONG) 1,
+                           (PTW32_INTERLOCKED_LONG) 0) != -1)
+                {
+		  switch (ptw32_timed_semwait (&mx->wait_sema, abstime))
 		    {
-		      (void) InterlockedDecrement (&mx->lock_idx);
-		    }
-		  else
-		    {
-		      if (InterlockedDecrement (&mx->lock_idx) >= 0)
-			{
-			  /* Someone else is waiting on that mutex */
-			  if (sem_post (&mx->wait_sema) != 0)
-			    {
-			      result = errno;
-			    }
-			}
-		    }
-
-		  LeaveCriticalSection (&mx->wait_cs);
-		  break;
-		}
-	      default:
-		{
-		  result = errno;
-		  break;
+		    case 0:	/* We got woken up so try get the lock again. */
+		      {
+		        break;
+		      }
+		    case 1:	/* Timed out. */
+		    case 2:	/* abstime passed before we started to wait. */
+		      {
+		        return ETIMEDOUT;
+		      }
+		    default:
+		      {
+		        return errno;
+		      }
+		  }
 		}
-	    }
+            }
+          while ((c = (LONG) PTW32_INTERLOCKED_COMPARE_EXCHANGE(
+                               (PTW32_INTERLOCKED_LPLONG) &mx->lock_idx,
+                               (PTW32_INTERLOCKED_LONG) 1,
+                               (PTW32_INTERLOCKED_LONG) -1)) != -1);
 	}
     }
   else
     {
       pthread_t self = pthread_self();
 
-      if (0 == InterlockedIncrement (&mx->lock_idx))
+      if ((c = (LONG) PTW32_INTERLOCKED_COMPARE_EXCHANGE(
+                        (PTW32_INTERLOCKED_LPLONG) &mx->lock_idx,
+		        (PTW32_INTERLOCKED_LONG) 0,
+		        (PTW32_INTERLOCKED_LONG) -1)) == -1)
 	{
 	  mx->recursive_count = 1;
 	  mx->ownerThread = self;
@@ -301,72 +291,53 @@ pthread_mutex_timedlock (pthread_mutex_t * mutex,
 	{
 	  if (pthread_equal (mx->ownerThread, self))
 	    {
-	      (void) InterlockedDecrement (&mx->lock_idx);
-
 	      if (mx->kind == PTHREAD_MUTEX_RECURSIVE)
 		{
 		  mx->recursive_count++;
 		}
 	      else
 		{
-		  result = EDEADLK;
+		  return EDEADLK;
 		}
 	    }
 	  else
 	    {
-	      switch (ptw32_timed_semwait (&mx->wait_sema, abstime))
-		{
-		  case 0:	/* We got the mutex. */
-		    {
-		      mx->recursive_count = 1;
-		      mx->ownerThread = self;
-		      break;
+              do
+                {
+                  if (c == 1 ||
+                      (LONG) PTW32_INTERLOCKED_COMPARE_EXCHANGE(
+                               (PTW32_INTERLOCKED_LPLONG) &mx->lock_idx,
+                               (PTW32_INTERLOCKED_LONG) 1,
+                               (PTW32_INTERLOCKED_LONG) 0) != -1)
+                    {
+		      switch (ptw32_timed_semwait (&mx->wait_sema, abstime))
+		        {
+		        case 0:	/* We got woken up so try get the lock again. */
+		          {
+		            break;
+		          }
+		        case 1:	/* Timed out. */
+		        case 2:	/* abstime passed before we started to wait. */
+		          {
+		            return ETIMEDOUT;
+		          }
+		        default:
+		          {
+		            return errno;
+		          }
+		      }
 		    }
-		  case 1:	/* Timedout. */
-		  case 2:	/* abstime passed before we started to wait. */
-		    {
-		      /*
-		       * If we timeout, it is up to us to adjust lock_idx to say
-		       * we're no longer waiting.
-		       *
-		       * The owner thread may still have posted wait_sema thinking
-		       * we were waiting. We must check but then NOT do any
-		       * programmed work if we have acquired the mutex because
-		       * we don't know how long ago abstime was. We MUST just release it
-		       * immediately.
-		       */
-		      EnterCriticalSection (&mx->wait_cs);
-
-		      result = ETIMEDOUT;
-
-		      if (-1 == sem_trywait (&mx->wait_sema))
-			{
-			  (void) InterlockedDecrement (&mx->lock_idx);
-			}
-		     else
-			{
-			  if (InterlockedDecrement (&mx->lock_idx) >= 0)
-			    {
-			      /* Someone else is waiting on that mutex */
-			      if (sem_post (&mx->wait_sema) != 0)
-				{
-				  result = errno;
-				}
-			    }
-			}
-
-		      LeaveCriticalSection (&mx->wait_cs);
-		      break;
-		    }
-		  default:
-		    {
-		      result = errno;
-		      break;
-		    }
-		}
+                }
+              while ((c = (LONG) PTW32_INTERLOCKED_COMPARE_EXCHANGE(
+                                   (PTW32_INTERLOCKED_LPLONG) &mx->lock_idx,
+                                   (PTW32_INTERLOCKED_LONG) 1,
+                                   (PTW32_INTERLOCKED_LONG) -1)) != -1);
+
+	      mx->recursive_count = 1;
+	      mx->ownerThread = self;
 	    }
 	}
     }
 
-  return (result);
+  return 0;
 }
diff --git a/pthread_mutex_trylock.c b/pthread_mutex_trylock.c
index bba8ed2..e8ea57b 100644
--- a/pthread_mutex_trylock.c
+++ b/pthread_mutex_trylock.c
@@ -65,7 +65,7 @@ pthread_mutex_trylock (pthread_mutex_t * mutex)
   mx = *mutex;
 
   if ((PTW32_INTERLOCKED_LONG) -1 ==
-      ptw32_interlocked_compare_exchange ((PTW32_INTERLOCKED_LPLONG) &
+      PTW32_INTERLOCKED_COMPARE_EXCHANGE ((PTW32_INTERLOCKED_LPLONG) &
 					  mx->lock_idx,
 					  (PTW32_INTERLOCKED_LONG) 0,
 					  (PTW32_INTERLOCKED_LONG) -1))
diff --git a/pthread_mutex_unlock.c b/pthread_mutex_unlock.c
index 7b20d79..d853178 100644
--- a/pthread_mutex_unlock.c
+++ b/pthread_mutex_unlock.c
@@ -61,7 +61,7 @@ pthread_mutex_unlock (pthread_mutex_t * mutex)
 	{
 	  LONG idx;
 
-	  idx = (LONG) ptw32_interlocked_compare_exchange ((PTW32_INTERLOCKED_LPLONG)
+	  idx = (LONG) PTW32_INTERLOCKED_COMPARE_EXCHANGE ((PTW32_INTERLOCKED_LPLONG)
 	  					           &mx->lock_idx,
 						           (PTW32_INTERLOCKED_LONG) -1,
 						           (PTW32_INTERLOCKED_LONG) 0);
@@ -70,18 +70,12 @@ pthread_mutex_unlock (pthread_mutex_t * mutex)
 	    {
 	      if (idx > 0)
 		{
-		  EnterCriticalSection (&mx->wait_cs);
-
-		  if (InterlockedDecrement (&mx->lock_idx) >= 0)
+		  mx->lock_idx = -1;
+		  /* Someone may be waiting on that mutex */
+		  if (sem_post (&mx->wait_sema) != 0)
 		    {
-		      /* Someone is waiting on that mutex */
-		        if (sem_post (&mx->wait_sema) != 0)
-			{
-			  result = errno;
-	 		}
+		      result = errno;
 		    }
-
-		  LeaveCriticalSection (&mx->wait_cs);
 	        }
 	      else
 		{
@@ -100,18 +94,16 @@ pthread_mutex_unlock (pthread_mutex_t * mutex)
 		  || 0 == --mx->recursive_count)
 		{
 		  mx->ownerThread = NULL;
-		  EnterCriticalSection (&mx->wait_cs);
 
 		  if (InterlockedDecrement (&mx->lock_idx) >= 0)
 		    {
-		      /* Someone is waiting on that mutex */
+		      /* Someone may be waiting on that mutex */
+		      mx->lock_idx = -1;
 		      if (sem_post (&mx->wait_sema) != 0)
 			{
 			  result = errno;
 			}
 		    }
-
-		  LeaveCriticalSection (&mx->wait_cs);
 		}
 	    }
 	  else
diff --git a/pthread_spin_destroy.c b/pthread_spin_destroy.c
index 3347597..1b84b0f 100644
--- a/pthread_spin_destroy.c
+++ b/pthread_spin_destroy.c
@@ -56,7 +56,7 @@ pthread_spin_destroy (pthread_spinlock_t * lock)
 	  result = pthread_mutex_destroy (&(s->u.mutex));
 	}
       else if ((PTW32_INTERLOCKED_LONG) PTW32_SPIN_UNLOCKED !=
-	       ptw32_interlocked_compare_exchange ((PTW32_INTERLOCKED_LPLONG)
+	       PTW32_INTERLOCKED_COMPARE_EXCHANGE ((PTW32_INTERLOCKED_LPLONG)
 						   & (s->interlock),
 						   (PTW32_INTERLOCKED_LONG)
 						   PTW32_OBJECT_INVALID,
diff --git a/pthread_spin_lock.c b/pthread_spin_lock.c
index 29c6a62..d8b088f 100644
--- a/pthread_spin_lock.c
+++ b/pthread_spin_lock.c
@@ -61,7 +61,7 @@ pthread_spin_lock (pthread_spinlock_t * lock)
   s = *lock;
 
   while ((PTW32_INTERLOCKED_LONG) PTW32_SPIN_LOCKED ==
-	 ptw32_interlocked_compare_exchange ((PTW32_INTERLOCKED_LPLONG) &
+	 PTW32_INTERLOCKED_COMPARE_EXCHANGE ((PTW32_INTERLOCKED_LPLONG) &
 					     (s->interlock),
 					     (PTW32_INTERLOCKED_LONG)
 					     PTW32_SPIN_LOCKED,
diff --git a/pthread_spin_trylock.c b/pthread_spin_trylock.c
index 040a2b0..6f13cbc 100644
--- a/pthread_spin_trylock.c
+++ b/pthread_spin_trylock.c
@@ -61,7 +61,7 @@ pthread_spin_trylock (pthread_spinlock_t * lock)
   s = *lock;
 
   switch ((long)
-	  ptw32_interlocked_compare_exchange ((PTW32_INTERLOCKED_LPLONG) &
+	  PTW32_INTERLOCKED_COMPARE_EXCHANGE ((PTW32_INTERLOCKED_LPLONG) &
 					      (s->interlock),
 					      (PTW32_INTERLOCKED_LONG)
 					      PTW32_SPIN_LOCKED,
diff --git a/pthread_spin_unlock.c b/pthread_spin_unlock.c
index 7b40f7f..298e61f 100644
--- a/pthread_spin_unlock.c
+++ b/pthread_spin_unlock.c
@@ -56,7 +56,7 @@ pthread_spin_unlock (pthread_spinlock_t * lock)
     }
 
   switch ((long)
-	  ptw32_interlocked_compare_exchange ((PTW32_INTERLOCKED_LPLONG) &
+	  PTW32_INTERLOCKED_COMPARE_EXCHANGE ((PTW32_INTERLOCKED_LPLONG) &
 					      (s->interlock),
 					      (PTW32_INTERLOCKED_LONG)
 					      PTW32_SPIN_UNLOCKED,
diff --git a/ptw32_InterlockedCompareExchange.c b/ptw32_InterlockedCompareExchange.c
index 299fc36..e3c5162 100644
--- a/ptw32_InterlockedCompareExchange.c
+++ b/ptw32_InterlockedCompareExchange.c
@@ -51,7 +51,7 @@
  * we can call it through a pointer.
  */
 
-PTW32_INTERLOCKED_LONG WINAPI
+INLINE PTW32_INTERLOCKED_LONG WINAPI
 ptw32_InterlockedCompareExchange (PTW32_INTERLOCKED_LPLONG location,
 				  PTW32_INTERLOCKED_LONG value,
 				  PTW32_INTERLOCKED_LONG comparand)
@@ -69,6 +69,7 @@ ptw32_InterlockedCompareExchange (PTW32_INTERLOCKED_LPLONG location,
 #if defined(_M_IX86) || defined(_X86_)
 
 #if defined(_MSC_VER) || defined(__WATCOMC__)
+#define HAVE_INLINABLE_INTERLOCKED_CMPXCHG
 
   _asm {
     PUSH         ecx
@@ -86,6 +87,7 @@ ptw32_InterlockedCompareExchange (PTW32_INTERLOCKED_LPLONG location,
   }
 
 #elif defined(__BORLANDC__)
+#define HAVE_INLINABLE_INTERLOCKED_CMPXCHG
 
   _asm {
     PUSH	 ecx
@@ -103,6 +105,7 @@ ptw32_InterlockedCompareExchange (PTW32_INTERLOCKED_LPLONG location,
   }
 
 #elif defined(__GNUC__)
+#define HAVE_INLINABLE_INTERLOCKED_CMPXCHG
 
   __asm__
     (
@@ -139,3 +142,10 @@ ptw32_InterlockedCompareExchange (PTW32_INTERLOCKED_LPLONG location,
 #endif
 
 }
+
+#if 0
+#if defined(PTW32_BUILD_INLINED) && defined(HAVE_INLINABLE_INTERLOCKED_CMPXCHG)
+#undef PTW32_INTERLOCKED_COMPARE_EXCHANGE
+#define PTW32_INTERLOCKED_COMPARE_EXCHANGE ptw32_InterlockedCompareExchange
+#endif
+#endif
diff --git a/tests/rwlock7.c b/tests/rwlock7.c
index 9c6a5f1..8706d4a 100644
--- a/tests/rwlock7.c
+++ b/tests/rwlock7.c
@@ -151,7 +151,7 @@ main (int argc, char *argv[])
     {
       assert(pthread_join (threads[count].thread_id, NULL) == 0);
       thread_updates += threads[count].updates;
-      printf ("%02d: interval %d, updates %d, reads %d\n",
+      printf ("\n%02d: interval %d, updates %d, reads %d\n",
               count, threads[count].interval,
               threads[count].updates, threads[count].reads);
     }
-- 
cgit v1.2.3