/*
 * ptw32_InterlockedCompareExchange.c
 *
 * Description:
 * This translation unit implements routines which are private to
 * the implementation and may be used throughout it.
 *
 * --------------------------------------------------------------------------
 *
 *      Pthreads-win32 - POSIX Threads Library for Win32
 *      Copyright(C) 1998 John E. Bossom
 *      Copyright(C) 1999,2005 Pthreads-win32 contributors
 *
 *      Contact Email: rpj@callisto.canberra.edu.au
 *
 *      The current list of contributors is contained
 *      in the file CONTRIBUTORS included with the source
 *      code distribution. The list can also be seen at the
 *      following World Wide Web location:
 *      http://sources.redhat.com/pthreads-win32/contributors.html
 *
 *      This library is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU Lesser General Public
 *      License as published by the Free Software Foundation; either
 *      version 2 of the License, or (at your option) any later version.
 *
 *      This library is distributed in the hope that it will be useful,
 *      but WITHOUT ANY WARRANTY; without even the implied warranty of
 *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *      Lesser General Public License for more details.
 *
 *      You should have received a copy of the GNU Lesser General Public
 *      License along with this library in the file COPYING.LIB;
 *      if not, write to the Free Software Foundation, Inc.,
 *      59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 */

#include "pthread.h"
#include "implement.h"


/*
 * ptw32_InterlockedCompareExchange --
 *
 * Originally needed because W9x doesn't support InterlockedCompareExchange.
 * We now use this version wherever possible so we can inline it.
 */

PTW32_INTERLOCKED_LONG WINAPI
ptw32_InterlockedCompareExchange (PTW32_INTERLOCKED_LPLONG location,
                                  PTW32_INTERLOCKED_LONG value,
                                  PTW32_INTERLOCKED_LONG comparand)
{

#if defined(__WATCOMC__)
/* Don't report that result is not assigned a value before being referenced */
#pragma disable_message (200)
#endif

  PTW32_INTERLOCKED_LONG result;

  /*
   * Using the LOCK prefix on uni-processor machines is significantly slower
   * and it is not necessary. The overhead of the conditional below is
   * negligible in comparison. Since an optimised DLL will inline this
   * routine, this will be faster than calling the system supplied
   * Interlocked routine, which appears to avoid the LOCK prefix on
   * uniprocessor systems. So one DLL works for all systems.
   */
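
  /*
   * For reference, both CMPXCHG sequences below implement (atomically)
   * the logic already noted alongside the GNU C asm, i.e. roughly the
   * following, expressed non-atomically in C:
   *
   *   result = *location;
   *   if (result == comparand)
   *     *location = value;
   *   return result;
   */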

  if (ptw32_smp_system)

/* *INDENT-OFF* */

#if defined(_M_IX86) || defined(_X86_)

#if defined(_MSC_VER) || defined(__WATCOMC__) || (defined(__BORLANDC__) && defined(HAVE_TASM32))
#define HAVE_INLINABLE_INTERLOCKED_CMPXCHG

    {
      _asm {
        PUSH         ecx
        PUSH         edx
        MOV          ecx,dword ptr [location]
        MOV          edx,dword ptr [value]
        MOV          eax,dword ptr [comparand]
        LOCK CMPXCHG dword ptr [ecx],edx
        MOV          dword ptr [result], eax
        POP          edx
        POP          ecx
      }
    }
  else
    {
      _asm {
        PUSH         ecx
        PUSH         edx
        MOV          ecx,dword ptr [location]
        MOV          edx,dword ptr [value]
        MOV          eax,dword ptr [comparand]
        CMPXCHG      dword ptr [ecx],edx
        MOV          dword ptr [result], eax
        POP          edx
        POP          ecx
      }
    }

#elif defined(__GNUC__)
#define HAVE_INLINABLE_INTERLOCKED_CMPXCHG

    {
      __asm__ __volatile__
        (
         "lock\n\t"
         "cmpxchgl       %2,%1"      /* if (EAX == [location])  */
                                     /*   [location] = value    */
                                     /* else                    */
                                     /*   EAX = [location]      */
         :"=a" (result)
         :"m"  (*location), "r" (value), "a" (comparand));
    }
  else
    {
      __asm__ __volatile__
        (
         "cmpxchgl       %2,%1"      /* if (EAX == [location])  */
                                     /*   [location] = value    */
                                     /* else                    */
                                     /*   EAX = [location]      */
         :"=a" (result)
         :"m"  (*location), "r" (value), "a" (comparand));
    }

#endif

#else

  /*
   * If execution gets to here then we're running on a currently
   * unsupported processor or compiler.
   */

  result = 0;

#endif

/* *INDENT-ON* */

  return result;

#if defined(__WATCOMC__)
#pragma enable_message (200)
#endif

}
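
/*
 * Illustrative usage sketch only (not part of the library source): other
 * translation units reach the routine above through the
 * PTW32_INTERLOCKED_COMPARE_EXCHANGE macro, typically in a test-and-set
 * style loop. The 'lock' variable and the Sleep() back-off below are
 * hypothetical and only show the calling pattern:
 *
 *   long lock = 0;
 *
 *   while (PTW32_INTERLOCKED_COMPARE_EXCHANGE ((PTW32_INTERLOCKED_LPLONG) &lock,
 *                                              (PTW32_INTERLOCKED_LONG) 1,
 *                                              (PTW32_INTERLOCKED_LONG) 0) != 0)
 *     {
 *       Sleep (0);
 *     }
 */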

/*
 * ptw32_InterlockedExchange --
 *
 * We now use this version wherever possible so we can inline it.
 */

LONG WINAPI
ptw32_InterlockedExchange (LPLONG location,
                           LONG value)
{
#if defined(__WATCOMC__)
/* Don't report that result is not assigned a value before being referenced */
#pragma disable_message (200)
#endif

  LONG result;

  /*
   * The XCHG instruction always locks the bus with or without the
   * LOCK prefix. This makes it significantly slower than CMPXCHG on
   * uni-processor machines. The Windows InterlockedExchange function
   * is nearly 3 times faster than the XCHG instruction, so this routine
   * is not yet very useful for speeding up pthreads.
   */

  if (ptw32_smp_system)

/* *INDENT-OFF* */

#if defined(_M_IX86) || defined(_X86_)

#if defined(_MSC_VER) || defined(__WATCOMC__) || (defined(__BORLANDC__) && defined(HAVE_TASM32))
#define HAVE_INLINABLE_INTERLOCKED_XCHG

    {
      _asm {
        PUSH         ecx
        MOV          ecx,dword ptr [location]
        MOV          eax,dword ptr [value]
        XCHG         dword ptr [ecx],eax
        MOV          dword ptr [result], eax
        POP          ecx
      }
    }
  else
    {
      /*
       * Faster version of XCHG for uni-processor systems because
       * it doesn't lock the bus. If an interrupt or context switch
       * occurs between the MOV and the CMPXCHG then the value in
       * 'location' may have changed, in which case we will loop
       * back to do the MOV again.
       *
       * FIXME! Need memory barriers for the MOV+CMPXCHG combo?
       *
       * Tests show that this routine has almost identical timing
       * to Win32's InterlockedExchange(), which is much faster than
       * using the inlined 'xchg' instruction above, so it's probably
       * doing something similar to this (on UP systems).
       *
       * Can we do without the PUSH/POP instructions?
       */
      _asm {
        PUSH         ecx
        PUSH         edx
        MOV          ecx,dword ptr [location]
        MOV          edx,dword ptr [value]
    L1: MOV          eax,dword ptr [ecx]
        CMPXCHG      dword ptr [ecx],edx
        JNZ          L1
        MOV          dword ptr [result], eax
        POP          edx
        POP          ecx
      }
    }

#elif defined(__GNUC__)
#define HAVE_INLINABLE_INTERLOCKED_XCHG

    {
      __asm__ __volatile__
        (
         "xchgl          %2,%1"
         :"=r" (result)
         :"m"  (*location), "0" (value));
    }
  else
    {
      /*
       * Faster version of XCHG for uni-processor systems because
       * it doesn't lock the bus. If an interrupt or context switch
       * occurs between the movl and the cmpxchgl then the value in
       * 'location' may have changed, in which case we will loop
       * back to do the movl again.
       *
       * FIXME! Need memory barriers for the MOV+CMPXCHG combo?
       *
       * Tests show that this routine has almost identical timing
       * to Win32's InterlockedExchange(), which is much faster than
       * using an inlined 'xchg' instruction, so it's probably
       * doing something similar to this (on UP systems).
       */
      __asm__ __volatile__
        (
         "0:\n\t"
         "movl           %1,%%eax\n\t"
         "cmpxchgl       %2,%1\n\t"
         "jnz            0b"
         :"=&a" (result)
         :"m"  (*location), "r" (value));
    }

#endif

#else

  /*
   * If execution gets to here then we're running on a currently
   * unsupported processor or compiler.
   */

  result = 0;

#endif

/* *INDENT-ON* */

  return result;

#if defined(__WATCOMC__)
#pragma enable_message (200)
#endif

}


#if 1
#if defined(PTW32_BUILD_INLINED) && defined(HAVE_INLINABLE_INTERLOCKED_CMPXCHG)
#undef PTW32_INTERLOCKED_COMPARE_EXCHANGE
#define PTW32_INTERLOCKED_COMPARE_EXCHANGE ptw32_InterlockedCompareExchange
#endif

#if defined(PTW32_BUILD_INLINED) && defined(HAVE_INLINABLE_INTERLOCKED_XCHG)
#undef PTW32_INTERLOCKED_EXCHANGE
#define PTW32_INTERLOCKED_EXCHANGE ptw32_InterlockedExchange
#endif
#endif
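
/*
 * Explanatory note: when building the inlined version of the library
 * (PTW32_BUILD_INLINED) and an inlinable implementation was compiled above,
 * these redefinitions route later uses of the PTW32_INTERLOCKED_* macros in
 * this translation unit to the routines in this file, so an optimising
 * compiler can inline them as described in the comments above. For example,
 * a (hypothetical) later call such as
 *
 *   PTW32_INTERLOCKED_EXCHANGE ((LPLONG) &flag, (LONG) 1)
 *
 * would then compile as a direct call to ptw32_InterlockedExchange rather
 * than through the macros' default definitions (presumably supplied by
 * implement.h).
 */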