/*  Copyright (c) January 2005 Jean Gressmann (jsg@rz.uni-potsdam.de)
 *
 *  This is free software; you can redistribute it and/or modify
 *	it under the terms of the GNU General Public License as published by
 *	the Free Software Foundation; either version 2 of the License, or
 *	(at your option) any later version. 
 * 
 *	This file is distributed in the hope that it will be useful,
 *	but WITHOUT ANY WARRANTY; without even the implied warranty of
 *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *	GNU General Public License for more details.
 *
 *	You should have received a copy of the GNU General Public License
 *	along with this file; if not, write to the Free Software
 *	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#ifndef X86_64_GCC_H
#define X86_64_GCC_H

#ifndef __GNUC__
#	error "You must use a GNU C++ compatible compiler in order to use this header file!"
#endif


#include <cassert>

PT_NAMESPACE_BEGIN
// Integer aliases used by the primitives in this header.
// NOTE(review): int64/uint64 are aliases for long / unsigned long, which is
// 64 bits wide only on LP64 targets -- confirm before building with an
// LLP64 toolchain.
typedef signed char int8;
typedef unsigned char uint8;
typedef short int16;
typedef unsigned short uint16;
typedef int int32;
typedef unsigned int uint32;
typedef long int64;
typedef unsigned long uint64;

// Most of these functions are due to the Linux kernel.

inline void pt_atomic_clear_lock(volatile uint8* lock)
{
	const uint8 clear = 0;
	__asm__ __volatile__("xchgb %b0, %1" : : "q"(clear), "m" (*lock) : "memory");
}

inline bool pt_atomic_set_lock(volatile uint8* lock)
{
	uint8 prev = 0;
	const uint8 set = 1;
	__asm__ __volatile__("lock; cmpxchgb %b1, %2"
				    : "=a"(prev)
				    : "q"(set), "m"(*lock), "0"(prev)
				    : "memory");
	
	return prev == 0;
}

// Memory barrier: executes the x86 mfence instruction. The "memory" clobber
// additionally keeps the compiler from moving memory accesses across this point.
inline void pt_barrier()
{
	__asm__ __volatile__(
		"mfence"
		: /* no outputs */
		: /* no inputs */
		: "memory"
	);
}

// Atomically adds value to *counter using a locked xadd.
//
// counter:	address of the 64-bit counter to modify.
// value:	amount to add (may be negative).
// Returns the value of *counter after the addition, i.e. the previous
// content plus value.
inline int64 pt_atomic_add(volatile int64* counter, int64 value)
{
	// xadd stores the sum into memory and hands the previous memory content
	// back in the register operand.
	int64 previous = value;
	__asm__ __volatile__(
		"lock; xaddq %0, %1"
		: "+r"(previous), "+m"(*counter)	// *counter is written, so it must be an output
		:
		: "memory", "cc"					// compiler barrier; the addition updates the flags
	);
	return previous + value;
}

// Atomically subtracts value from *counter by adding its negation via
// pt_atomic_add and returns that call's result.
inline int64 pt_atomic_sub(volatile int64* counter, int64 value)
{
	const int64 negated = -value;
	return pt_atomic_add(counter, negated);
}

// Atomically adds 1 to *counter via pt_atomic_add and returns that call's result.
inline int64 pt_atomic_inc(volatile int64* counter)
{
	const int64 step = 1;
	return pt_atomic_add(counter, step);
}

// Atomically adds -1 to *counter via pt_atomic_add and returns that call's result.
inline int64 pt_atomic_dec(volatile int64* counter)
{
	const int64 step = -1;
	return pt_atomic_add(counter, step);
}

/*
 * Atomic compare-and-swap on a 64-bit word.
 * If *inMemory equals oldValue, newValue is stored into *inMemory.
 * The content *inMemory had before the operation is returned in either
 * case; the swap took place exactly when that content equals oldValue.
 */
inline int64 pt_atomic_cas_return_memory(volatile int64* inMemory, int64 newValue, int64 oldValue)
{
	// cmpxchg takes the expected value in RAX and leaves the value it found
	// in memory there, hence the "a" output tied to the oldValue input.
	int64 observed;
	__asm__ __volatile__(
		"lock; cmpxchgq %[desired], %[target]"
		: "=a"(observed)
		: [desired] "q"(newValue), [target] "m"(*inMemory), "0"(oldValue)
		: "memory"
	);
	return observed;
}

inline bool pt_atomic_cas(volatile int64* mem, int64 nv, int64 ov)
{
	return pt_atomic_cas_return_memory(mem, nv, ov) == ov;
}

// Atomically exchanges *inMemory with newValue using xchg.
// Returns the value *inMemory held before the exchange.
inline int64 pt_atomic_set(volatile int64* inMemory, int64 newValue)
{
	// The register starts out holding newValue and receives the old memory
	// content from xchg.
	int64 exchanged;
	__asm__ __volatile__(
		"xchgq %[reg], %[target]"
		: [reg] "=r"(exchanged)
		: [target] "m"(*inMemory), "0"(newValue)
		: "memory"
	);
	return exchanged;
}

// Rotates the 64-bit value to the left with the rolq instruction.
//
// value:	bit pattern to rotate.
// shift:	rotation count, handed to the instruction in CL (defaults to 1).
// Returns the rotated bit pattern.
inline int64 pt_rol(int64 value, unsigned char shift = 1)
{
	int64 rotated;
	__asm__ (
		"rolq %[count], %[bits]"
		: [bits] "=r"(rotated)
		: [count] "c"(shift), "0"(value)
		: "cc"
	);
	return rotated;
}
// Rotates the 64-bit value to the right with the rorq instruction.
//
// value:	bit pattern to rotate.
// shift:	rotation count, handed to the instruction in CL (defaults to 1).
// Returns the rotated bit pattern.
inline int64 pt_ror(int64 value, unsigned char shift = 1)
{
	int64 rotated;
	__asm__ (
		"rorq %[count], %[bits]"
		: [bits] "=r"(rotated)
		: [count] "c"(shift), "0"(value)
		: "cc"
	);
	return rotated;
}

// Reads the CPU time-stamp counter with rdtsc.
//
// Returns the 64-bit counter value assembled from the two 32-bit halves
// the instruction delivers.
inline uint64 pt_ticks()
{
	uint32 low, high;
	// volatile is required: the statement has no inputs, so without it the
	// compiler may treat the result as a constant and merge or hoist
	// repeated reads. rdtsc delivers the low half in EAX and the high half
	// in EDX, so the outputs are bound to those registers directly.
	__asm__ __volatile__(
		"rdtsc"
		: "=a"(low), "=d"(high)
	);

	return (static_cast<uint64>(high) << 32) | low;
}

// Produces a seed value: the current result of pt_ticks().
inline uint64 pt_seed()
{
	const uint64 ticks = pt_ticks();
	return ticks;
}


PT_NAMESPACE_END

#include <portablethreads/arch/arch-common.h>
#include <portablethreads/arch/manual-empty-bits-pointer-cas.h>
#include <portablethreads/arch/native-atomic-number.h>

PT_NAMESPACE_BEGIN

// AMD64 & Intel's EM64T:
// Both currently use 48 bits for addressing. Hence we may
// safely use the most significant 16 bits as well as the least significant 3 bits
// for reference counting. It is assumed that pointers only point to data aligned
// on at least 8-byte boundaries.
// 
// NOTE: Intel's EM64T cpus support cmpxchg16b, AMD's cpus not (yet).


namespace PTPrivate
{
	// Packs a pointer and a counter into a single integer of type pt_pointer_type.
	//
	// Parameter Shift:	number of high-order bits the pointer is assumed not to use;
	//					the pointer is shifted left by this many bits.
	// Parameter Offset:	number of low-order pointer bits assumed to be zero
	//					(alignment); these bits are reused for the counter.
	//
	// Layout of a multiplexed value, most significant bits first:
	//	[ pointer bits above Offset | Shift+Offset counter bits ]
	template<unsigned Shift, unsigned Offset = 0>
	struct ShiftLeftN
	{
		typedef pt_pointer_type int_t;

		// Combines pointer and count into one value.
		// Preconditions (checked with assert): count fits into Shift+Offset
		// bits and the lower Offset bits of pointer are zero.
		static inline int_t multiplex(int_t pointer, int_t count)
		{
			// sanity check
			// A 48 bit pointer moved up by Shift bits must still fit into
			// 64 bits. (Written without a subtraction so that Offset > Shift
			// cannot wrap around in unsigned arithmetic.)
			assert(Shift + 48 <= 64);

			// counter must fit into reserved bits
			assert(count <= MUX_MASK);

			// If we assume lower order bits are all zero make sure
			// the assumption holds
			assert((pointer & (((int_t)1<<(Offset))-1)) == 0);

			// Shift by Shift only. The Offset low bits of the pointer are
			// already zero and become the top Offset bits of the counter
			// field; shifting by Shift+Offset would push the uppermost
			// Offset bits of a 48 bit pointer out of the word.
			pointer <<= Shift;
			assert((pointer & MUX_MASK) == 0);
			return pointer | count;
		}

		// Extracts the counter part of a multiplexed value.
		static inline int_t count(int_t mux)
		{
			return mux & MUX_MASK;
		}

		// Extracts the pointer part of a multiplexed value.
		static inline int_t value(int_t mux)
		{
			// [pointer|count] -> [0..0|pointer|count-remainder] -> [0..0|pointer|offset zeros]
			return (mux >> Shift) & DEMUX_MASK;
		}
	private:
		// Selects the Shift+Offset counter bits.
		static const int_t MUX_MASK = (static_cast<int_t>(1)<<(Offset+Shift))-1;
		// Clears the Offset low bits, which hold counter remainder after demuxing.
		static const int_t DEMUX_MASK = ~((static_cast<int_t>(1)<<(Offset))-1);
	};
}

// Pointer CAS type for this platform: PointerCAS instantiated with
// ShiftLeftN using Shift = 16 and Offset = 3.
typedef PTPrivate::PointerCAS< PTPrivate::ShiftLeftN<16, 3> > PTPointerCAS;

PT_NAMESPACE_END

#endif
