/*
 * ptw32_InterlockedCompareExchange.c
 *
 * Description:
 * This translation unit implements routines which are private to
 * the implementation and may be used throughout it.
 *
 * --------------------------------------------------------------------------
 *
 *      Pthreads-win32 - POSIX Threads Library for Win32
 *      Copyright(C) 1998 John E. Bossom
 *      Copyright(C) 1999,2005 Pthreads-win32 contributors
 *
 *      Contact Email: rpj@callisto.canberra.edu.au
 *
 *      The current list of contributors is contained
 *      in the file CONTRIBUTORS included with the source
 *      code distribution. The list can also be seen at the
 *      following World Wide Web location:
 *      http://sources.redhat.com/pthreads-win32/contributors.html
 *
 *      This library is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU Lesser General Public
 *      License as published by the Free Software Foundation; either
 *      version 2 of the License, or (at your option) any later version.
 *
 *      This library is distributed in the hope that it will be useful,
 *      but WITHOUT ANY WARRANTY; without even the implied warranty of
 *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *      Lesser General Public License for more details.
 *
 *      You should have received a copy of the GNU Lesser General Public
 *      License along with this library in the file COPYING.LIB;
 *      if not, write to the Free Software Foundation, Inc.,
 *      59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 */

#include "pthread.h"
#include "implement.h"


/*
 * ptw32_InterlockedCompareExchange --
 *
 * Originally needed because W9x doesn't support InterlockedCompareExchange.
 * We now use this version wherever possible so we can inline it.
 */

PTW32_INTERLOCKED_LONG WINAPI
ptw32_InterlockedCompareExchange (PTW32_INTERLOCKED_LPLONG location,
                                  PTW32_INTERLOCKED_LONG value,
                                  PTW32_INTERLOCKED_LONG comparand)
{

#if defined(__WATCOMC__)
/* Don't report that result is not assigned a value before being referenced */
#pragma disable_message (200)
#endif

  PTW32_INTERLOCKED_LONG result;

  /*
   * Using the LOCK prefix on uni-processor machines is significantly slower
   * and it is not necessary. The overhead of the conditional below is
   * negligible in comparison. Since an optimised DLL will inline this
   * routine, this will be faster than calling the system supplied
   * Interlocked routine, which appears to avoid the LOCK prefix on
   * uniprocessor systems. So one DLL works for all systems.
   */
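
  /*
   * For reference, both CMPXCHG sequences below implement (atomically)
   * the logic already noted alongside the GNU C asm, i.e. roughly the
   * following, expressed non-atomically in C:
   *
   *   result = *location;
   *   if (result == comparand)
   *     *location = value;
   *   return result;
   */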

  if (ptw32_smp_system)

/* *INDENT-OFF* */

#if defined(_M_IX86) || defined(_X86_)

#if defined(_MSC_VER) || defined(__WATCOMC__) || (defined(__BORLANDC__) && defined(HAVE_TASM32))
#define HAVE_INLINABLE_INTERLOCKED_CMPXCHG

    {
      _asm {
        PUSH         ecx
        PUSH         edx
        MOV          ecx,dword ptr [location]
        MOV          edx,dword ptr [value]
        MOV          eax,dword ptr [comparand]
        LOCK CMPXCHG dword ptr [ecx],edx
        MOV          dword ptr [result], eax
        POP          edx
        POP          ecx
      }
    }
  else
    {
      _asm {
        PUSH         ecx
        PUSH         edx
        MOV          ecx,dword ptr [location]
        MOV          edx,dword ptr [value]
        MOV          eax,dword ptr [comparand]
        CMPXCHG      dword ptr [ecx],edx
        MOV          dword ptr [result], eax
        POP          edx
        POP          ecx
      }
    }

#elif defined(__GNUC__)
#define HAVE_INLINABLE_INTERLOCKED_CMPXCHG

    {
      __asm__ __volatile__
        (
         "lock\n\t"
         "cmpxchgl       %2,%1"      /* if (EAX == [location])  */
                                     /*   [location] = value    */
                                     /* else                    */
                                     /*   EAX = [location]      */
         :"=a" (result)
         :"m"  (*location), "r" (value), "a" (comparand));
    }
  else
    {
      __asm__ __volatile__
        (
         "cmpxchgl       %2,%1"      /* if (EAX == [location])  */
                                     /*   [location] = value    */
                                     /* else                    */
                                     /*   EAX = [location]      */
         :"=a" (result)
         :"m"  (*location), "r" (value), "a" (comparand));
    }

#endif

#else

  /*
   * If execution gets to here then we're running on a currently
   * unsupported processor or compiler.
   */

  result = 0;

#endif

/* *INDENT-ON* */

  return result;

#if defined(__WATCOMC__)
#pragma enable_message (200)
#endif

}
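
/*
 * Illustrative usage sketch only (not part of the library source): other
 * translation units reach the routine above through the
 * PTW32_INTERLOCKED_COMPARE_EXCHANGE macro, typically in a test-and-set
 * style loop. The 'lock' variable and the Sleep() back-off below are
 * hypothetical and only show the calling pattern:
 *
 *   long lock = 0;
 *
 *   while (PTW32_INTERLOCKED_COMPARE_EXCHANGE ((PTW32_INTERLOCKED_LPLONG) &lock,
 *                                              (PTW32_INTERLOCKED_LONG) 1,
 *                                              (PTW32_INTERLOCKED_LONG) 0) != 0)
 *     {
 *       Sleep (0);
 *     }
 */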

/*
 * ptw32_InterlockedExchange --
 *
 * We now use this version wherever possible so we can inline it.
 */

LONG WINAPI
ptw32_InterlockedExchange (LPLONG location,
                           LONG value)
{
#if defined(__WATCOMC__)
/* Don't report that result is not assigned a value before being referenced */
#pragma disable_message (200)
#endif

  LONG result;

  /*
   * The XCHG instruction always locks the bus with or without the
   * LOCK prefix. This makes it significantly slower than CMPXCHG on
   * uni-processor machines. The Windows InterlockedExchange function
   * is nearly 3 times faster than the XCHG instruction, so this routine
   * is not yet very useful for speeding up pthreads.
   */

  if (ptw32_smp_system)

/* *INDENT-OFF* */

#if defined(_M_IX86) || defined(_X86_)

#if defined(_MSC_VER) || defined(__WATCOMC__) || (defined(__BORLANDC__) && defined(HAVE_TASM32))
#define HAVE_INLINABLE_INTERLOCKED_XCHG

    {
      _asm {
        PUSH         ecx
        MOV          ecx,dword ptr [location]
        MOV          eax,dword ptr [value]
        XCHG         dword ptr [ecx],eax
        MOV          dword ptr [result], eax
        POP          ecx
      }
    }
  else
    {
      /*
       * Faster version of XCHG for uni-processor systems because
       * it doesn't lock the bus. If an interrupt or context switch
       * occurs between the MOV and the CMPXCHG then the value in
       * 'location' may have changed, in which case we will loop
       * back to do the MOV again.
       *
       * FIXME! Need memory barriers for the MOV+CMPXCHG combo?
       *
       * Tests show that this routine has almost identical timing
       * to Win32's InterlockedExchange(), which is much faster than
       * using the inlined 'xchg' instruction above, so it's probably
       * doing something similar to this (on UP systems).
       *
       * Can we do without the PUSH/POP instructions?
       */
      _asm {
        PUSH         ecx
        PUSH         edx
        MOV          ecx,dword ptr [location]
        MOV          edx,dword ptr [value]
    L1: MOV          eax,dword ptr [ecx]
        CMPXCHG      dword ptr [ecx],edx
        JNZ          L1
        MOV          dword ptr [result], eax
        POP          edx
        POP          ecx
      }
    }

#elif defined(__GNUC__)
#define HAVE_INLINABLE_INTERLOCKED_XCHG

    {
      __asm__ __volatile__
        (
         "xchgl          %2,%1"
         :"=r" (result)
         :"m"  (*location), "0" (value));
    }
  else
    {
      /*
       * Faster version of XCHG for uni-processor systems because
       * it doesn't lock the bus. If an interrupt or context switch
       * occurs between the movl and the cmpxchgl then the value in
       * 'location' may have changed, in which case we will loop
       * back to do the movl again.
       *
       * FIXME! Need memory barriers for the MOV+CMPXCHG combo?
       *
       * Tests show that this routine has almost identical timing
       * to Win32's InterlockedExchange(), which is much faster than
       * using an inlined 'xchg' instruction, so it's probably
       * doing something similar to this (on UP systems).
       */
      __asm__ __volatile__
        (
         "0:\n\t"
         "movl           %1,%%eax\n\t"
         "cmpxchgl       %2,%1\n\t"
         "jnz            0b"
         :"=&a" (result)
         :"m"  (*location), "r" (value));
    }

#endif

#else

  /*
   * If execution gets to here then we're running on a currently
   * unsupported processor or compiler.
   */

  result = 0;

#endif

/* *INDENT-ON* */

  return result;

#if defined(__WATCOMC__)
#pragma enable_message (200)
#endif

}


#if 1
#if defined(PTW32_BUILD_INLINED) && defined(HAVE_INLINABLE_INTERLOCKED_CMPXCHG)
#undef PTW32_INTERLOCKED_COMPARE_EXCHANGE
#define PTW32_INTERLOCKED_COMPARE_EXCHANGE ptw32_InterlockedCompareExchange
#endif

#if defined(PTW32_BUILD_INLINED) && defined(HAVE_INLINABLE_INTERLOCKED_XCHG)
#undef PTW32_INTERLOCKED_EXCHANGE
#define PTW32_INTERLOCKED_EXCHANGE ptw32_InterlockedExchange
#endif
#endif
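
/*
 * Explanatory note: when building the inlined version of the library
 * (PTW32_BUILD_INLINED) and an inlinable implementation was compiled above,
 * these redefinitions route later uses of the PTW32_INTERLOCKED_* macros in
 * this translation unit to the routines in this file, so an optimising
 * compiler can inline them as described in the comments above. For example,
 * a (hypothetical) later call such as
 *
 *   PTW32_INTERLOCKED_EXCHANGE ((LPLONG) &flag, (LONG) 1)
 *
 * would then compile as a direct call to ptw32_InterlockedExchange rather
 * than through the macros' default definitions (presumably supplied by
 * implement.h).
 */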