#ifndef __TESTS_H_INCLUDED
#define __TESTS_H_INCLUDED

#include <stddef.h>
#include <sys/types.h>

#include <cuda_runtime.h>


// max(a, b): yields the larger of the two arguments, evaluating each argument
// exactly once.
//
// Built on a GNU statement expression plus __typeof__, so a GCC-compatible
// host compiler is required. Each argument is first copied into a temporary
// of its own type; the result is the copy of `a` when a > b, otherwise the
// copy of `b` (so equal values yield `b`).
//
// The temporaries use macro-specific names (max_lhs_ / max_rhs_). With short
// generic names an argument such as `_a` would be captured by the macro's own
// temporary and silently produce the wrong value.
//
// NOTE: this is a function-like macro named `max`, so every later `max(`
// in a file that includes this header is expanded by the preprocessor.
#define max(a,b) \
	({ __typeof__ (a) max_lhs_ = (a); \
	   __typeof__ (b) max_rhs_ = (b); \
	   max_lhs_ > max_rhs_ ? max_lhs_ : max_rhs_; })

// min(a, b): yields the smaller of the two arguments, evaluating each argument
// exactly once.
//
// Built on a GNU statement expression plus __typeof__, so a GCC-compatible
// host compiler is required. Each argument is first copied into a temporary
// of its own type; the result is the copy of `a` when a < b, otherwise the
// copy of `b` (so equal values yield `b`).
//
// The temporaries use macro-specific names (min_lhs_ / min_rhs_). With short
// generic names an argument such as `_a` would be captured by the macro's own
// temporary and silently produce the wrong value.
//
// NOTE: this is a function-like macro named `min`, so every later `min(`
// in a file that includes this header is expanded by the preprocessor.
#define min(a,b) \
	({ __typeof__ (a) min_lhs_ = (a); \
	   __typeof__ (b) min_rhs_ = (b); \
	   min_lhs_ < min_rhs_ ? min_lhs_ : min_rhs_; })

// Binary size units expressed in bytes. Each one is an unsigned long long
// constant expression, 2^10 times the previous unit.
#define KB (1ULL << 10)
#define MB (KB << 10)
#define GB (MB << 10)

// Single-precision floating-point constant (note the `f` suffix).
// NOTE(review): presumably the value test buffers are filled with and checked
// against - confirm in the test sources.
#define ELEM_VALUE  1.2345f

// Threshold constants.
// NOTE(review): presumably these pair with the `threshold` argument of the
// performance tests - confirm at the call sites.
//
// DEFAULT_THRESHOLD and NO_THRESHOLD both expand to -1, so code comparing a
// value against them cannot tell the two apart.
// NOTE(review): confirm that this aliasing is intended.
//
// The negative values are parenthesised so they expand as one operand in any
// surrounding expression.
#define MIN_THRESHOLD 0
#define DEFAULT_THRESHOLD (-1)
#define NO_THRESHOLD (-1)


// Numeric identifiers for the tests: one NR_<name> enumerator per test_<name>.
// Values are consecutive and start at 0.
enum test_numbs {
	// --- functional tests ---
	NR_test_memcpy = 0,
	NR_test_saxpy_unified,
	NR_test_host_async,
	NR_test_host_async_batched,
	NR_test_p2p,
	NR_test_dev_sync,

	// --- performance tests: the test has to be specified explicitly to run ---
	NR_test_p2p_bandwidth,
	NR_test_p2p_bandwidth_p2p_off,
	NR_test_p2p_bandwidth_kernel,

	// new NR_test_* entries go above this line

	// Sentinel: equals the number of enumerators listed before it; keep it last.
	TOTAL_TESTS_COUNT
};

// Functional tests.
//
// Every entry point below takes the same four arguments and returns a
// cudaError_t:
//   elems_numb      - presumably the number of elements to process (TODO confirm)
//   gpu_idx         - signed GPU index; NOTE(review): a negative value may mean
//                     "no specific GPU" - confirm against the definitions
//   gpu_count       - presumably the number of GPUs available (TODO confirm)
//   run_on_all_gpus - int flag; presumably non-zero runs on every GPU (TODO confirm)
cudaError_t test_memcpy(size_t elems_numb, ssize_t gpu_idx,
	int gpu_count, int run_on_all_gpus);
cudaError_t test_saxpy_unified(size_t elems_numb, ssize_t gpu_idx,
	int gpu_count, int run_on_all_gpus);
cudaError_t test_host_async(size_t elems_numb, ssize_t gpu_idx,
	int gpu_count, int run_on_all_gpus);
cudaError_t test_host_async_batched(size_t elems_numb, ssize_t gpu_idx,
	int gpu_count, int run_on_all_gpus);
cudaError_t test_p2p(size_t elems_numb, ssize_t gpu_idx,
	int gpu_count, int run_on_all_gpus);
cudaError_t test_dev_sync(size_t elems_numb, ssize_t gpu_idx,
	int gpu_count, int run_on_all_gpus);

// Performance tests - these run only when the test is specified explicitly.
//
// Same leading arguments as the functional tests, plus a trailing signed
// `threshold`, and the same cudaError_t return type.
// NOTE(review): presumably `threshold` is the pass/fail limit for the measured
// result - confirm against the definitions.
cudaError_t test_p2p_bandwidth(size_t elems_numb, ssize_t gpu_idx,
	int gpu_count, int run_on_all_gpus, ssize_t threshold);
cudaError_t test_p2p_bandwidth_p2p_off(size_t elems_numb, ssize_t gpu_idx,
	int gpu_count, int run_on_all_gpus, ssize_t threshold);
cudaError_t test_p2p_bandwidth_kernel(size_t elems_numb, ssize_t gpu_idx,
	int gpu_count, int run_on_all_gpus, ssize_t threshold);

// declare new 'test_*' functions above this line


#endif // __TESTS_H_INCLUDED
