#include "error.h"
#include "tests.cuh"

#include <cuda_runtime.h>

#include <assert.h>
#include <errno.h>
#include <float.h>
#include <malloc.h>
#include <math.h>
#include <string.h>


// Number of back-to-back copies issued inside the timed region of the p2p bandwidth test.
#define REPETITION_COUNT 5

// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
// Launch limits used by this file: largest grid x-dimension and the block size used for every launch.
#define MAX_GRID_X_DIM ((1ULL << 31) - 1)
#define MAX_BLOCK_SIZE 1024ULL

// Size in bytes of an array of 'n' elements.
// NOTE(review): the element size is taken from sizeof(ELEM_VALUE) while every buffer in this file is
// 'float *'; ELEM_VALUE is defined outside this file and is presumably a float constant - TODO confirm.
#define ARR_SIZE(n) ((n) * sizeof(ELEM_VALUE))

// Inputs of the saxpy test (y = alpha * x + y) and the value every element of y must hold afterwards.
#define SAXPY_ALPHA      ELEM_VALUE
#define SAXPY_VEC_X_ELEM (ELEM_VALUE * 2)
#define SAXPY_VEC_Y_ELEM (ELEM_VALUE * 3)
#define SAXPY_RES_ELEM   (SAXPY_ALPHA * SAXPY_VEC_X_ELEM + SAXPY_VEC_Y_ELEM)

// Async test: expected element value after doubling, number of streams (one slice of the array per stream),
// elements per slice (rounded up, so trailing slices may be shorter or empty) and bytes per full slice.
// NOTE(review): SLICE_SIZE takes the element size from the type of the expression (2 * ELEM_VALUE).
#define DOUBLED_ELEM_VALUE  (2 * ELEM_VALUE)
#define STREAMS_NUMB        8
#define SLICE_ELEMS_NUMB(n) (((n) + (STREAMS_NUMB) - 1) / STREAMS_NUMB)
#define SLICE_SIZE(n)       (SLICE_ELEMS_NUMB(n) * sizeof(DOUBLED_ELEM_VALUE))

// Number of blocks needed to give every element its own thread (ceiling division), capped at MAX_GRID_X_DIM.
#define __blocks_numb(elems_numb, block_size) min((((elems_numb) + (block_size) - 1) / (block_size)), MAX_GRID_X_DIM)

// Block count for a whole array of 'n' elements / for a single slice of such an array.
#define blocks_numb(n, block_size)        __blocks_numb((n), block_size)
#define blocks_numb_sliced(n, block_size) __blocks_numb(SLICE_ELEMS_NUMB(n), block_size)


// https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
// https://bitbashing.io/comparing-floats.html
//
// Relative floating point comparison.
// Returns non-zero when |a - b| does not exceed 'max_rel_diff' scaled by the larger of |a| and |b|,
// zero otherwise.
__host__ __device__ int almost_equal_rel(float a, float b, float max_rel_diff)
{
	const float abs_diff = fabsf(a - b);

	// the tolerance grows with the magnitude of the operands
	const float largest_magnitude = fmaxf(fabsf(a), fabsf(b));

	return abs_diff <= max_rel_diff * largest_magnitude;
}

// Store 'val' into each of the first 'elems_numb' floats of the host buffer 'buf'.
static void fill_host_buffer(float *buf, size_t elems_numb, float val)
{
	float *const end = buf + elems_numb;

	for (float *cur = buf; cur != end; ++cur)
		*cur = val;
}

// Kernel: device-side assert that each of the first 'elems_numb' floats of 'buf' is almost equal to 'val'
// (relative tolerance FLT_EPSILON). Every thread starts at its global index and advances by the total
// number of launched threads, so any 1D launch configuration covers the whole buffer.
static __global__ void check_dev_buffer(const float *buf, size_t elems_numb, float val)
{
	const size_t step = (size_t)blockDim.x * gridDim.x;
	size_t pos = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

	while (pos < elems_numb) {
		assert(almost_equal_rel(buf[pos], val, FLT_EPSILON));
		pos += step;
	}
}

// Compare each of the first 'elems_numb' floats of 'buf' against 'val' (relative tolerance FLT_EPSILON).
// Returns 0 when all of them match; otherwise returns -1 and, if 'fault_idx' is not NULL,
// stores the index of the first mismatching element there.
static int check_host_buffer(const float *buf, size_t elems_numb, float val, size_t *fault_idx)
{
	size_t pos = 0;

	// advance over the matching prefix
	while (pos < elems_numb && almost_equal_rel(buf[pos], val, FLT_EPSILON))
		pos++;

	if (pos == elems_numb)
		return 0;

	if (fault_idx != NULL)
		*fault_idx = pos;

	return -1;
}

/*
 * Host <-> device copy round trip on the current device.
 *
 * elems_numb: number of elements in the host and device buffers (ARR_SIZE(elems_numb) bytes each).
 *
 * Returns cudaSuccess when the data survives both directions. Otherwise returns the error of the failing
 * CUDA call, or cudaErrorUnknown when the host allocation fails or the data read back differs.
 * An error from the final 'cudaFree()' replaces any earlier one.
 */
cudaError_t do_test_memcpy(size_t elems_numb)
{
	cudaError_t err = cudaSuccess;
	float *dev_arr = NULL, *host_arr = NULL;
	size_t fault_idx = 0;

	err = cudaMalloc(&dev_arr, ARR_SIZE(elems_numb));
	if (err != cudaSuccess) {
		eprintfl("'cudaMalloc()' failed: %s\n", cudaGetErrorString(err));
		goto out;
	}

	/*
	 * Test steps:
	 * 1) memset(0) dev buf, fill host buf, copy host buf to dev, test dev buf data
	 * 2) memset(0) host buf, copy buf from dev to host, test host buf data
	 */
	// 1)
	err = cudaMemset(dev_arr, 0, ARR_SIZE(elems_numb));
	if (err != cudaSuccess) {
		eprintfl("'cudaMemset()' failed: %s\n", cudaGetErrorString(err));
		goto dev_malloc_out;
	}

	host_arr = (float *)malloc(ARR_SIZE(elems_numb));
	if (host_arr == NULL) {
		eprintfl("'malloc()' failed: %s\n", strerror(errno));
		err = cudaErrorUnknown;
		goto dev_malloc_out;
	}

	fill_host_buffer(host_arr, elems_numb, ELEM_VALUE);

	err = cudaMemcpy(dev_arr, host_arr, ARR_SIZE(elems_numb), cudaMemcpyHostToDevice);
	if (err != cudaSuccess) {
		eprintfl("'cudaMemcpy()' HostToDevice failed: %s\n", cudaGetErrorString(err));
		goto host_malloc_out;
	}

	// every call above was checked, so drop any stale error left by earlier code:
	// the check after the launch must describe this launch only
	(void)cudaGetLastError();

	check_dev_buffer<<<blocks_numb(elems_numb, MAX_BLOCK_SIZE), MAX_BLOCK_SIZE>>>(dev_arr, elems_numb, ELEM_VALUE);

	// a launch has no return value: a rejected launch (e.g. an empty grid) is only reported here,
	// without this check the verification kernel could be skipped and the test would still pass
	err = cudaGetLastError();
	if (err != cudaSuccess) {
		eprintfl("'check_dev_buffer()' kernel launch failed: %s\n", cudaGetErrorString(err));
		goto host_malloc_out;
	}

	err = cudaDeviceSynchronize();
	if (err == cudaErrorAssert) {
		eprintfl("'check_dev_buffer()' kernel failed, data copied to gpu differs from original one (%f): %s\n",
				ELEM_VALUE, cudaGetErrorString(err));
		goto host_malloc_out;
	} else if (err != cudaSuccess) {
		eprintfl("'cudaDeviceSynchronize()' failed: %s\n", cudaGetErrorString(err));
		goto host_malloc_out;
	}

	// 2)
	memset(host_arr, 0, ARR_SIZE(elems_numb));

	err = cudaMemcpy(host_arr, dev_arr, ARR_SIZE(elems_numb), cudaMemcpyDeviceToHost);
	if (err != cudaSuccess) {
		eprintfl("'cudaMemcpy()' DeviceToHost failed: %s\n", cudaGetErrorString(err));
		goto host_malloc_out;
	}

	if (check_host_buffer(host_arr, elems_numb, ELEM_VALUE, &fault_idx) != 0) {
		eprintfl("'check_host_buffer()' failed: data[%zu]=%f copied from gpu differs from original one: %f\n",
				fault_idx, host_arr[fault_idx], ELEM_VALUE);
		err = cudaErrorUnknown;
		goto host_malloc_out;
	}

host_malloc_out:
	free(host_arr);

dev_malloc_out:
	if (cudaFree(dev_arr) != cudaSuccess) {
		err = cudaPeekAtLastError();
		eprintfl("'cudaFree()' failed: %s\n", cudaGetErrorString(err));
	}

out:
	return err;
}

// Kernel: y[i] = a * x[i] + y[i] for every i in [0, n).
// Each thread starts at its global index and advances by the total number of launched threads.
static __global__ void saxpy(size_t n, float a, const float *x, float *y)
{
	const size_t step = (size_t)blockDim.x * gridDim.x;
	size_t pos = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

	while (pos < n) {
		y[pos] = a * x[pos] + y[pos];
		pos += step;
	}
}

/*
 * saxpy on managed ('cudaMallocManaged()') memory on the current device: both vectors are filled from the
 * host, updated by the 'saxpy' kernel and the result is verified from the host.
 *
 * elems_numb: number of elements in each vector.
 *
 * Returns cudaSuccess when every element of the result equals SAXPY_RES_ELEM. Otherwise returns the error
 * of the failing CUDA call, or cudaErrorUnknown on a data mismatch.
 * An error from a final 'cudaFree()' replaces any earlier one.
 */
cudaError_t do_test_saxpy_unified(size_t elems_numb)
{
	cudaError_t err = cudaSuccess;
	float *vec_x = NULL, *vec_y = NULL;
	size_t fault_idx = 0;

	err = cudaMallocManaged(&vec_x, ARR_SIZE(elems_numb));
	if (err != cudaSuccess) {
		eprintfl("'cudaMallocManaged()' failed for 'vec_x': %s\n", cudaGetErrorString(err));
		goto out;
	}

	err = cudaMallocManaged(&vec_y, ARR_SIZE(elems_numb));
	if (err != cudaSuccess) {
		eprintfl("'cudaMallocManaged()' failed for 'vec_y': %s\n", cudaGetErrorString(err));
		goto vec_x_malloc_out;
	}

	fill_host_buffer(vec_x, elems_numb, SAXPY_VEC_X_ELEM);
	fill_host_buffer(vec_y, elems_numb, SAXPY_VEC_Y_ELEM);

	// every call above was checked, so drop any stale error left by earlier code:
	// the check after the launch must describe this launch only
	(void)cudaGetLastError();

	saxpy<<<blocks_numb(elems_numb, MAX_BLOCK_SIZE), MAX_BLOCK_SIZE>>>(elems_numb, SAXPY_ALPHA, vec_x, vec_y);

	// a launch has no return value: a rejected launch is only reported through the last-error state
	err = cudaGetLastError();
	if (err != cudaSuccess) {
		eprintfl("'saxpy()' kernel launch failed: %s\n", cudaGetErrorString(err));
		goto vec_y_malloc_out;
	}

	// the host must not touch the vectors before the kernel has finished
	err = cudaDeviceSynchronize();
	if (err != cudaSuccess) {
		eprintfl("'cudaDeviceSynchronize()' failed: %s\n", cudaGetErrorString(err));
		goto vec_y_malloc_out;
	}

	if (check_host_buffer(vec_y, elems_numb, SAXPY_RES_ELEM, &fault_idx) != 0) {
		eprintfl("'check_host_buffer()' failed: data[%zu]=%f calculated on gpu differs from reference one: %f\n",
				fault_idx, vec_y[fault_idx], SAXPY_RES_ELEM);
		err = cudaErrorUnknown;
		goto vec_y_malloc_out;
	}

vec_y_malloc_out:
	if (cudaFree(vec_y) != cudaSuccess) {
		err = cudaPeekAtLastError();
		eprintfl("'cudaFree()' failed 'vec_y': %s\n", cudaGetErrorString(err));
	}

vec_x_malloc_out:
	if (cudaFree(vec_x) != cudaSuccess) {
		err = cudaPeekAtLastError();
		eprintfl("'cudaFree()' failed for 'vec_x': %s\n", cudaGetErrorString(err));
	}

out:
	return err;
}

/*
static inline int __is_multiple(size_t divident, size_t divisor)
{
	size_t remainder = divident % divisor;

	if (remainder) {
		eprintfl("WARNING: divident=%zu is not a multiple of divisor=%zu, remainder: %zu\n",
				divident, divisor, remainder);
		return 0;
	}

	return 1;
}
*/

// Kernel: multiply by two every element of one slice of 'x'.
// The slice is [slice_elems_numb * slice_idx, slice_elems_numb * (slice_idx + 1)) clipped to 'n';
// slice_elems_numb == 0 selects the whole range [0, n).
// Each thread starts at the slice start plus its global index and advances by the total number of
// launched threads.
static __global__ void double_sliced(size_t n, float *x, size_t slice_idx, size_t slice_elems_numb)
{
	const bool sliced = (slice_elems_numb != 0);
	const size_t first = sliced ? slice_elems_numb * slice_idx : 0;
	const size_t last = sliced ? min(slice_elems_numb * (slice_idx + 1), n) : n;
	const size_t step = (size_t)blockDim.x * gridDim.x;

	size_t pos = first + (size_t)blockIdx.x * blockDim.x + threadIdx.x;

	while (pos < last) {
		x[pos] *= 2;
		pos += step;
	}
}

// Number of elements in slice 'slice_idx' when 'elems_numb' elements are cut into consecutive slices of
// 'slice_elems' elements each: a full slice, a shorter tail slice, or 0 for a slice past the end.
static size_t host_async_slice_len(size_t elems_numb, size_t slice_elems, size_t slice_idx)
{
	const size_t offset = slice_idx * slice_elems;

	// without this guard 'elems_numb - offset' would wrap around for slices past the end
	if (offset >= elems_numb)
		return 0;

	const size_t left = elems_numb - offset;

	return (left < slice_elems) ? left : slice_elems;
}

/*
 * Asynchronous copy/compute test on the current device: the array is cut into STREAMS_NUMB slices and every
 * slice is copied to the device, doubled by 'double_sliced' and copied back on a stream of its own.
 *
 * elems_numb: number of elements in the array.
 * batched:    non-zero - issue all host-to-device copies, then all kernels, then all device-to-host copies;
 *             zero     - issue copy, kernel and copy back slice by slice.
 *
 * The slice size is rounded up, so trailing slices may be shorter than the others or empty;
 * empty slices get no work at all.
 *
 * Returns cudaSuccess when every element read back equals DOUBLED_ELEM_VALUE. Otherwise returns the error
 * of the failing CUDA call, or cudaErrorUnknown on a data mismatch.
 * An error from a cleanup call replaces any earlier one.
 */
cudaError_t do_test_host_async(size_t elems_numb, int batched)
{
	cudaError_t err = cudaSuccess;
	float *host_arr = NULL, *dev_arr = NULL;
	size_t fault_idx = 0;
	cudaStream_t streams[STREAMS_NUMB];
	const size_t slice_elems = SLICE_ELEMS_NUMB(elems_numb);

	for (size_t i = 0; i < STREAMS_NUMB; i++) {
		err = cudaStreamCreate(&streams[i]);
		if (err != cudaSuccess) {
			eprintfl("'cudaStreamCreate()' failed for idx=%zu: %s\n", i, cudaGetErrorString(err));
			// only the streams created so far have to be destroyed
			fault_idx = i;
			goto streams_out;
		}
	}

	// 'cudaMallocHost()' memory is used for the host side of the async copies
	err = cudaMallocHost(&host_arr, ARR_SIZE(elems_numb));
	if (err != cudaSuccess) {
		eprintfl("'cudaMallocHost()' failed for 'host_arr': %s\n", cudaGetErrorString(err));
		goto streams_all_out;
	}

	fill_host_buffer(host_arr, elems_numb, ELEM_VALUE);

	err = cudaMalloc(&dev_arr, ARR_SIZE(elems_numb));
	if (err != cudaSuccess) {
		eprintfl("'cudaMalloc()' failed for 'dev_arr': %s\n", cudaGetErrorString(err));
		goto host_malloc_out;
	}

	// every call above was checked, so drop any stale error left by earlier code:
	// the checks after the launches must describe those launches only
	(void)cudaGetLastError();

	if (batched) {
		for (size_t i = 0; i < STREAMS_NUMB; i++) {
			const size_t len = host_async_slice_len(elems_numb, slice_elems, i);
			const size_t offset = i * slice_elems;

			// slices are consecutive: once one is empty, all following ones are empty too
			if (len == 0)
				break;

			err = cudaMemcpyAsync(dev_arr + offset, host_arr + offset, len * sizeof(*host_arr), cudaMemcpyHostToDevice, streams[i]);
			if (err != cudaSuccess) {
				eprintfl("'cudaMemcpyAsync()' HostToDevice failed: %s\n", cudaGetErrorString(err));
				goto dev_malloc_out;
			}
		}

		for (size_t i = 0; i < STREAMS_NUMB; i++) {
			if (host_async_slice_len(elems_numb, slice_elems, i) == 0)
				break;

			double_sliced<<<blocks_numb_sliced(elems_numb, MAX_BLOCK_SIZE), MAX_BLOCK_SIZE, 0, streams[i]>>>(elems_numb, dev_arr, i, slice_elems);

			// a launch has no return value: a rejected launch is only reported through the last-error state
			err = cudaGetLastError();
			if (err != cudaSuccess) {
				eprintfl("'double_sliced()' kernel launch failed for idx=%zu: %s\n", i, cudaGetErrorString(err));
				goto dev_malloc_out;
			}
		}

		for (size_t i = 0; i < STREAMS_NUMB; i++) {
			const size_t len = host_async_slice_len(elems_numb, slice_elems, i);
			const size_t offset = i * slice_elems;

			if (len == 0)
				break;

			err = cudaMemcpyAsync(host_arr + offset, dev_arr + offset, len * sizeof(*host_arr), cudaMemcpyDeviceToHost, streams[i]);
			if (err != cudaSuccess) {
				eprintfl("'cudaMemcpyAsync()' DeviceToHost failed: %s\n", cudaGetErrorString(err));
				goto dev_malloc_out;
			}
		}
	} else {
		for (size_t i = 0; i < STREAMS_NUMB; i++) {
			const size_t len = host_async_slice_len(elems_numb, slice_elems, i);
			const size_t offset = i * slice_elems;

			// slices are consecutive: once one is empty, all following ones are empty too
			if (len == 0)
				break;

			err = cudaMemcpyAsync(dev_arr + offset, host_arr + offset, len * sizeof(*host_arr), cudaMemcpyHostToDevice, streams[i]);
			if (err != cudaSuccess) {
				eprintfl("'cudaMemcpyAsync()' HostToDevice failed: %s\n", cudaGetErrorString(err));
				goto dev_malloc_out;
			}

			double_sliced<<<blocks_numb_sliced(elems_numb, MAX_BLOCK_SIZE), MAX_BLOCK_SIZE, 0, streams[i]>>>(elems_numb, dev_arr, i, slice_elems);

			// a launch has no return value: a rejected launch is only reported through the last-error state
			err = cudaGetLastError();
			if (err != cudaSuccess) {
				eprintfl("'double_sliced()' kernel launch failed for idx=%zu: %s\n", i, cudaGetErrorString(err));
				goto dev_malloc_out;
			}

			err = cudaMemcpyAsync(host_arr + offset, dev_arr + offset, len * sizeof(*host_arr), cudaMemcpyDeviceToHost, streams[i]);
			if (err != cudaSuccess) {
				eprintfl("'cudaMemcpyAsync()' DeviceToHost failed: %s\n", cudaGetErrorString(err));
				goto dev_malloc_out;
			}
		}
	}

	// wait for all streams before the host reads the results
	err = cudaDeviceSynchronize();
	if (err != cudaSuccess) {
		eprintfl("'cudaDeviceSynchronize()' failed: %s\n", cudaGetErrorString(err));
		goto dev_malloc_out;
	}

	if (check_host_buffer(host_arr, elems_numb, DOUBLED_ELEM_VALUE, &fault_idx) != 0) {
		eprintfl("'check_host_buffer()' failed: data[%zu]=%f calculated on gpu differs from reference one: %f\n",
				fault_idx, host_arr[fault_idx], DOUBLED_ELEM_VALUE);
		err = cudaErrorUnknown;
		goto dev_malloc_out;
	}

dev_malloc_out:
	if (cudaFree(dev_arr) != cudaSuccess) {
		err = cudaPeekAtLastError();
		eprintfl("'cudaFree()' failed 'dev_arr': %s\n", cudaGetErrorString(err));
	}

host_malloc_out:
	if (cudaFreeHost(host_arr) != cudaSuccess) {
		err = cudaPeekAtLastError();
		eprintfl("'cudaFreeHost()' failed for 'host_arr': %s\n", cudaGetErrorString(err));
	}

streams_all_out:
	// from here on 'fault_idx' is the number of streams to destroy
	fault_idx = STREAMS_NUMB;
streams_out:
	for (size_t i = 0; i < fault_idx; i++) {
		if (cudaStreamDestroy(streams[i]) != cudaSuccess) {
			err = cudaPeekAtLastError();
			eprintfl("'cudaStreamDestroy()' failed for idx=%zu: %s\n", i, cudaGetErrorString(err));
		}
	}

	return err;
}

// Kernel: store 'val' into each of the first 'elems_numb' floats of 'buf'.
// Each thread starts at its global index and advances by the total number of launched threads.
static __global__ void fill_dev_buffer(float *buf, size_t elems_numb, float val)
{
	const size_t step = (size_t)blockDim.x * gridDim.x;
	size_t pos = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

	while (pos < elems_numb) {
		buf[pos] = val;
		pos += step;
	}
}

/*
 * Copy a buffer from one gpu to the next one with 'cudaMemcpyPeer()' and verify it on the destination.
 *
 * elems_numb: number of elements in each buffer.
 * cur_gpu:    source device; the source buffer is allocated on whatever device is current on entry,
 *             the caller in this file selects 'cur_gpu' before calling.
 * gpu_count:  number of devices; the destination is (cur_gpu + 1) % gpu_count, so it must not be 0.
 *
 * NOTE: once the switch to the destination device succeeded, that device stays current on return.
 *
 * Returns cudaSuccess when the destination buffer holds ELEM_VALUE everywhere, otherwise the error of the
 * failing CUDA call. An error from a final 'cudaFree()' replaces any earlier one.
 */
cudaError_t do_test_p2p(size_t elems_numb, size_t cur_gpu, int gpu_count)
{
	cudaError_t err = cudaSuccess;
	size_t next_gpu = (cur_gpu + 1) % gpu_count;
	float *cur_arr = NULL, *next_arr = NULL;

	err = cudaMalloc(&cur_arr, ARR_SIZE(elems_numb));
	if (err != cudaSuccess) {
		eprintfl("'cudaMalloc()' failed for 'cur_arr': %s\n", cudaGetErrorString(err));
		goto out;
	}

	// the call above was checked, so drop any stale error left by earlier code:
	// the checks after the launches must describe those launches only
	(void)cudaGetLastError();

	fill_dev_buffer<<<blocks_numb(elems_numb, MAX_BLOCK_SIZE), MAX_BLOCK_SIZE>>>(cur_arr, elems_numb, ELEM_VALUE);

	// a launch has no return value: a rejected launch is only reported through the last-error state
	err = cudaGetLastError();
	if (err != cudaSuccess) {
		eprintfl("'fill_dev_buffer()' kernel launch failed: %s\n", cudaGetErrorString(err));
		goto cur_malloc_out;
	}

	err = cudaSetDevice((int)next_gpu);
	if (err != cudaSuccess) {
		eprintfl("'cudaSetDevice(%zu)' failed: %s\n", next_gpu, cudaGetErrorString(err));
		goto cur_malloc_out;
	}

	err = cudaMalloc(&next_arr, ARR_SIZE(elems_numb));
	if (err != cudaSuccess) {
		eprintfl("'cudaMalloc()' failed for 'next_arr': %s\n", cudaGetErrorString(err));
		goto cur_malloc_out;
	}

	err = cudaMemcpyPeer(next_arr, (int)next_gpu, cur_arr, (int)cur_gpu, ARR_SIZE(elems_numb));
	if (err != cudaSuccess) {
		eprintfl("'cudaMemcpyPeer()' failed: %s\n", cudaGetErrorString(err));
		goto next_malloc_out;
	}

	// the destination device is current here, so the check runs where the copy landed
	check_dev_buffer<<<blocks_numb(elems_numb, MAX_BLOCK_SIZE), MAX_BLOCK_SIZE>>>(next_arr, elems_numb, ELEM_VALUE);

	// without this check a rejected launch would skip the verification and the test would still pass
	err = cudaGetLastError();
	if (err != cudaSuccess) {
		eprintfl("'check_dev_buffer()' kernel launch failed: %s\n", cudaGetErrorString(err));
		goto next_malloc_out;
	}

	err = cudaDeviceSynchronize();
	if (err == cudaErrorAssert) {
		eprintfl("'check_dev_buffer()' kernel failed, data copied to second gpu differs from original one (%f): %s\n",
				ELEM_VALUE, cudaGetErrorString(err));
		goto next_malloc_out;
	} else if (err != cudaSuccess) {
		eprintfl("'cudaDeviceSynchronize()' failed: %s\n", cudaGetErrorString(err));
		goto next_malloc_out;
	}

next_malloc_out:
	if (cudaFree(next_arr) != cudaSuccess) {
		err = cudaPeekAtLastError();
		eprintfl("'cudaFree()' failed 'next_arr': %s\n", cudaGetErrorString(err));
	}

cur_malloc_out:
	if (cudaFree(cur_arr) != cudaSuccess) {
		err = cudaPeekAtLastError();
		eprintfl("'cudaFree()' failed for 'cur_arr': %s\n", cudaGetErrorString(err));
	}

out:
	return err;
}

// Make 'from_gpu' the current device and enable its peer access to 'to_gpu'.
// Returns cudaSuccess when access was enabled. Otherwise returns the error of the failing CUDA call, or
// cudaErrorUnknown when the device reports that it cannot access the peer.
// if some error returned - access is not enabled
cudaError_t __enable_p2p_access(size_t from_gpu, size_t to_gpu)
{
	int peer_reachable = 0;
	cudaError_t status = cudaSetDevice(from_gpu);

	if (status != cudaSuccess) {
		eprintfl("'cudaSetDevice(%zu)' failed: %s\n",
				from_gpu, cudaGetErrorString(status));
		return status;
	}

	status = cudaDeviceCanAccessPeer(&peer_reachable, from_gpu, to_gpu);
	if (status != cudaSuccess) {
		eprintfl("'cudaDeviceCanAccessPeer(%zu -> %zu)' failed: %s\n",
				from_gpu, to_gpu, cudaGetErrorString(status));
		return status;
	}

	// the query itself worked, but the answer is "no"
	if (!peer_reachable) {
		eprintfl("'cudaDeviceCanAccessPeer(%zu -> %zu)': can't access gpu#%zu from gpu#%zu\n",
				from_gpu, to_gpu, to_gpu, from_gpu);
		return cudaErrorUnknown;
	}

	status = cudaDeviceEnablePeerAccess(to_gpu, 0);
	if (status != cudaSuccess)
		eprintfl("'cudaDeviceEnablePeerAccess(%zu -> %zu)' failed: %s\n",
				from_gpu, to_gpu, cudaGetErrorString(status));

	return status;
}

// Make 'from_gpu' the current device and disable its peer access to 'to_gpu'.
// Returns cudaSuccess when access was disabled, otherwise the error of the failing CUDA call.
// if some error returned - access is not disabled
cudaError_t __disable_p2p_access(size_t from_gpu, size_t to_gpu)
{
	cudaError_t status = cudaSetDevice(from_gpu);

	if (status != cudaSuccess) {
		eprintfl("'cudaSetDevice(%zu)' failed: %s\n", from_gpu, cudaGetErrorString(status));
		return status;
	}

	status = cudaDeviceDisablePeerAccess(to_gpu);
	if (status != cudaSuccess)
		eprintfl("'cudaDeviceDisablePeerAccess(%zu -> %zu)' failed: %s\n", from_gpu, to_gpu, cudaGetErrorString(status));

	return status;
}

// Enable peer access in both directions between 'cur_gpu' and 'next_gpu', then make 'cur_gpu' current.
// Returns cudaSuccess when all three steps worked, otherwise the error of the first failing step.
//
// in case of error - try our best to disable access back:
// the rollback is best effort, its own errors are not handled (not disabling p2p access doesn't seem
// to lead to issues) and the error of the failed step is the one returned
cudaError_t enable_p2p_access_both(size_t cur_gpu, size_t next_gpu)
{
	cudaError_t status = __enable_p2p_access(cur_gpu, next_gpu);

	if (status != cudaSuccess) {
		eprintfl("'__enable_p2p_access(%zu -> %zu)' failed for 'cur_gpu' -> 'next_gpu': %s\n",
				cur_gpu, next_gpu, cudaGetErrorString(status));
		return status;
	}

	status = __enable_p2p_access(next_gpu, cur_gpu);
	if (status != cudaSuccess) {
		eprintfl("'__enable_p2p_access(%zu -> %zu)' failed for 'next_gpu' -> 'cur_gpu': %s\n",
				next_gpu, cur_gpu, cudaGetErrorString(status));

		// undo the direction that was already enabled
		__disable_p2p_access(cur_gpu, next_gpu);
		return status;
	}

	status = cudaSetDevice(cur_gpu);
	if (status != cudaSuccess) {
		eprintfl("'cudaSetDevice(%zu)' failed to switch to 'cur_gpu': %s\n", cur_gpu, cudaGetErrorString(status));

		// undo both directions, the most recently enabled one first
		__disable_p2p_access(next_gpu, cur_gpu);
		__disable_p2p_access(cur_gpu, next_gpu);
		return status;
	}

	return status;
}

// Disable peer access in both directions between 'cur_gpu' and 'next_gpu', then make 'cur_gpu' current.
// Returns cudaSuccess when all three steps worked, otherwise the error of the first failing step.
//
// in case of error - try our best to enable access back:
// the rollback is best effort, its own errors are not handled and the error of the failed step
// is the one returned
cudaError_t disable_p2p_access_both(size_t cur_gpu, size_t next_gpu)
{
	cudaError_t status = __disable_p2p_access(cur_gpu, next_gpu);

	if (status != cudaSuccess) {
		eprintfl("'__disable_p2p_access(%zu -> %zu)' failed for 'cur_gpu' -> 'next_gpu': %s\n",
				cur_gpu, next_gpu, cudaGetErrorString(status));
		return status;
	}

	status = __disable_p2p_access(next_gpu, cur_gpu);
	if (status != cudaSuccess) {
		eprintfl("'__disable_p2p_access(%zu -> %zu)' failed for 'next_gpu' -> 'cur_gpu': %s\n",
				next_gpu, cur_gpu, cudaGetErrorString(status));

		// restore the direction that was already disabled
		__enable_p2p_access(cur_gpu, next_gpu);
		return status;
	}

	status = cudaSetDevice(cur_gpu);
	if (status != cudaSuccess) {
		eprintfl("'cudaSetDevice(%zu)' failed to switch to 'cur_gpu': %s\n", cur_gpu, cudaGetErrorString(status));

		// restore both directions, the most recently disabled one first
		__enable_p2p_access(next_gpu, cur_gpu);
		__enable_p2p_access(cur_gpu, next_gpu);
		return status;
	}

	return status;
}

// Kernel: dst[i] = src[i] for every i in [0, n).
// Each thread starts at its global index and advances by the total number of launched threads.
static __global__ void copyp2p(size_t n, float *dst, const float *src)
{
	const size_t step = (size_t)blockDim.x * gridDim.x;
	size_t pos = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

	while (pos < n) {
		dst[pos] = src[pos];
		pos += step;
	}
}

// How 'do_test_p2p_bandwidth()' moves the data between the two gpus inside its timed loop.
enum p2p_bandwidth_mode {
	// copy with 'cudaMemcpy()'
	P2P_BANDWIDTH_MEMCPY,
	// copy with the 'copyp2p' kernel
	P2P_BANDWIDTH_KERNEL,
	// copy with 'cudaMemcpyPeer()'
	P2P_BANDWIDTH_MEMCPY_PEER
};

/*
 * Measure the copy bandwidth between 'cur_gpu' and every other gpu, one peer at a time.
 *
 * For each peer a buffer is allocated on both devices, the buffer on 'cur_gpu' is filled with ELEM_VALUE
 * and then copied back and forth REPETITION_COUNT times between two events recorded on 'cur_gpu'.
 * The bandwidth (total bytes copied / elapsed time) is printed in GB/s and the buffer that received
 * the last copy is verified.
 *
 * elems_numb:        number of elements in each buffer.
 * cur_gpu:           device all peers are measured against.
 * gpu_count:         number of devices, must not be 0.
 * p2p_mode:          how the data is copied, see 'enum p2p_bandwidth_mode'.
 * enable_p2p_access: non-zero - enable peer access in both directions for the measurement and disable it
 *                    afterwards.
 * threshold:         unless NO_THRESHOLD, lowest acceptable bandwidth; it is compared with the measured
 *                    bytes-per-second value.
 *
 * Returns cudaSuccess when every peer passed. The first failing peer stops the test: the error of the
 * failing CUDA call is returned, or cudaErrorUnknown for an unknown 'p2p_mode' or a bandwidth below
 * 'threshold'. An error from a cleanup call replaces any earlier one.
 */
cudaError_t do_test_p2p_bandwidth(size_t elems_numb, size_t cur_gpu, int gpu_count, enum p2p_bandwidth_mode p2p_mode, int enable_p2p_access, ssize_t threshold)
{
	cudaError_t err = cudaSuccess;

	size_t next_gpu = (cur_gpu + 1) % gpu_count;
	size_t gpus[] = {cur_gpu, next_gpu};

	float *cur_arr = NULL, *next_arr = NULL;
	float **arrs[] = {&cur_arr, &next_arr};

	cudaEvent_t start, stop;

	float ms = 0;
	double sec = 0;
	double total_size = 0;
	double bandwidth = 0;


	printf("\n");
	for (size_t next_gpu_idx = 0; next_gpu_idx < (size_t)gpu_count; next_gpu_idx++) {
		next_gpu = (cur_gpu + 1 + next_gpu_idx) % gpu_count;
		gpus[1] = next_gpu;

		// NOTE: if needed, handle intergpu case here
		if (cur_gpu == next_gpu)
			continue;

		printf("copying gpu#%zu <-> gpu#%zu: ", cur_gpu, next_gpu);

		// one buffer per device: *arrs[0] on gpus[0] (cur_gpu), *arrs[1] on gpus[1] (next_gpu)
		for (size_t i = 0; i < 2; i++) {
			err = cudaSetDevice((int)gpus[i]);
			if (err != cudaSuccess) {
				eprintfl("'cudaSetDevice(%zu)' failed for '%s': %s\n",
						gpus[i], (!i) ? "cur_gpu" : "next_gpu", cudaGetErrorString(err));
				goto mallocs_out;
			}

			err = cudaMalloc(arrs[i], ARR_SIZE(elems_numb));
			if (err != cudaSuccess) {
				eprintfl("'cudaMalloc()' failed for '%s': %s\n", (!i) ? "cur_arr" : "next_arr", cudaGetErrorString(err));
				goto mallocs_out;
			}
		}

		if (enable_p2p_access) {
			err = enable_p2p_access_both(cur_gpu, next_gpu);
			if (err != cudaSuccess) {
				eprintfl("'enable_p2p_access_both(%zu -> %zu)' failed: %s\n", cur_gpu, next_gpu, cudaGetErrorString(err));
				goto mallocs_out;
			}
		}

		err = cudaSetDevice((int)cur_gpu);
		if (err != cudaSuccess) {
			eprintfl("'cudaSetDevice(%zu)' failed for 'cur_gpu': %s\n", cur_gpu, cudaGetErrorString(err));
			// the events do not exist yet, so they must not be destroyed
			goto enable_access_out;
		}
		err = cudaEventCreate(&start);
		if (err != cudaSuccess) {
			eprintfl("'cudaEventCreate(start)' failed: %s\n", cudaGetErrorString(err));
			goto enable_access_out;
		}
		err = cudaEventCreate(&stop);
		if (err != cudaSuccess) {
			eprintfl("'cudaEventCreate(stop)' failed: %s\n", cudaGetErrorString(err));
			goto start_event_out;
		}

		// every call above was checked, so drop any stale error left by earlier code:
		// the checks after the launches must describe those launches only
		(void)cudaGetLastError();

		fill_dev_buffer<<<blocks_numb(elems_numb, MAX_BLOCK_SIZE), MAX_BLOCK_SIZE>>>(cur_arr, elems_numb, ELEM_VALUE);

		// a launch has no return value: a rejected launch is only reported through the last-error state
		err = cudaGetLastError();
		if (err != cudaSuccess) {
			eprintfl("'fill_dev_buffer()' kernel launch failed: %s\n", cudaGetErrorString(err));
			goto stop_event_out;
		}

		//=====
		err = cudaEventRecord(start);
		if (err != cudaSuccess) {
			eprintfl("'cudaEventRecord(start)' failed: %s\n", cudaGetErrorString(err));
			goto stop_event_out;
		}
		//-----
		for (size_t i = 0; i < REPETITION_COUNT; i++) {
			// the direction alternates: even iterations copy cur -> next, odd ones next -> cur
			const size_t src = i % 2;
			const size_t dst = (i + 1) % 2;

			switch (p2p_mode)
			{
			case P2P_BANDWIDTH_MEMCPY:
				err = cudaMemcpy(*arrs[dst], *arrs[src], ARR_SIZE(elems_numb), (enable_p2p_access) ? cudaMemcpyDefault : cudaMemcpyDeviceToDevice);
				if (err != cudaSuccess) {
					eprintfl("'cudaMemcpy()' failed copying arrays from gpu#%zu to gpu#%zu: %s\n",
							gpus[src], gpus[dst], cudaGetErrorString(err));
					goto stop_event_out;
				}
				break;

			case P2P_BANDWIDTH_KERNEL:
				copyp2p<<<blocks_numb(elems_numb, MAX_BLOCK_SIZE), MAX_BLOCK_SIZE>>>(elems_numb, *arrs[dst], *arrs[src]);

				err = cudaGetLastError();
				if (err != cudaSuccess) {
					eprintfl("'copyp2p()' kernel launch failed copying arrays from gpu#%zu to gpu#%zu: %s\n",
							gpus[src], gpus[dst], cudaGetErrorString(err));
					goto stop_event_out;
				}
				break;

			case P2P_BANDWIDTH_MEMCPY_PEER:
				err = cudaMemcpyPeer(*arrs[dst], (int)gpus[dst], *arrs[src], (int)gpus[src], ARR_SIZE(elems_numb));
				if (err != cudaSuccess) {
					eprintfl("'cudaMemcpyPeer()' failed copying arrays from gpu#%zu to gpu#%zu: %s\n",
							gpus[src], gpus[dst], cudaGetErrorString(err));
					goto stop_event_out;
				}
				break;

			default:
				eprintfl("Unknown p2p bandwidth mode\n");
				// nothing was measured: this must not be reported as a passed test
				err = cudaErrorUnknown;
				goto stop_event_out;
			}

			if (p2p_mode != P2P_BANDWIDTH_KERNEL) {
				err = cudaSetDevice((int)gpus[dst]);
				if (err != cudaSuccess) {
					eprintfl("'cudaSetDevice(%zu)' failed for '%s': %s\n",
							gpus[dst], (dst) ? "next_gpu" : "cur_gpu", cudaGetErrorString(err));
					goto stop_event_out;
				}
			}
		}
		//-----
		err = cudaSetDevice((int)cur_gpu);
		if (err != cudaSuccess) {
			eprintfl("'cudaSetDevice(%zu)' failed for 'cur_gpu': %s\n", cur_gpu, cudaGetErrorString(err));
			goto stop_event_out;
		}
		if (cudaEventRecord(stop) != cudaSuccess) {
			err = cudaPeekAtLastError();
			eprintfl("'cudaEventRecord(stop)' failed: %s\n", cudaGetErrorString(err));
			goto stop_event_out;
		}
		//=====
		err = cudaEventSynchronize(stop);
		if (err != cudaSuccess) {
			eprintfl("'cudaEventSynchronize(stop)' failed: %s\n", cudaGetErrorString(err));
			goto stop_event_out;
		}

		err = cudaEventElapsedTime(&ms, start, stop);
		if (err != cudaSuccess) {
			eprintfl("'cudaEventElapsedTime(start, stop)' failed: %s\n", cudaGetErrorString(err));
			goto stop_event_out;
		}

		// bytes moved by all repetitions over the elapsed seconds
		total_size = ARR_SIZE(elems_numb) * REPETITION_COUNT;
		sec = ms / 1000;
		bandwidth = total_size / sec;
		// printf("Size (bytes): %f, time (sec): %f, bandwidth (GB/s): %f\n", total_size, sec, total_size / sec / GB);
		printf("%f GB/s\n", bandwidth / GB);

		if (threshold != NO_THRESHOLD) {
			if (bandwidth < (double)threshold) {
				eprintfl("bandwidth is lower than threshold: %f < %f\n", bandwidth, (double)threshold);
				err = cudaErrorUnknown;
				goto stop_event_out;
			}
		}

		// NOTE: be sure to check proper array
		// the last repetition (index REPETITION_COUNT - 1) wrote into arrs[REPETITION_COUNT % 2]
		err = cudaSetDevice((int)gpus[REPETITION_COUNT % 2]);
		if (err != cudaSuccess) {
			eprintfl("'cudaSetDevice(%zu)' failed: %s\n", gpus[REPETITION_COUNT % 2], cudaGetErrorString(err));
			goto stop_event_out;
		}
		check_dev_buffer<<<blocks_numb(elems_numb, MAX_BLOCK_SIZE), MAX_BLOCK_SIZE>>>(*arrs[REPETITION_COUNT % 2], elems_numb, ELEM_VALUE);

		// without this check a rejected launch would skip the verification and the test would still pass
		err = cudaGetLastError();
		if (err != cudaSuccess) {
			eprintfl("'check_dev_buffer()' kernel launch failed: %s\n", cudaGetErrorString(err));
			goto stop_event_out;
		}

		err = cudaDeviceSynchronize();
		if (err == cudaErrorAssert) {
			eprintfl("'check_dev_buffer()' kernel failed, data copied to second gpu differs from original one (%f): %s\n",
					ELEM_VALUE, cudaGetErrorString(err));
			goto stop_event_out;
		} else if (err != cudaSuccess) {
			eprintfl("'cudaDeviceSynchronize()' failed: %s\n", cudaGetErrorString(err));
			goto stop_event_out;
		}

		// cleanup for this peer: the success path falls through all labels below
stop_event_out:
		if (cudaEventDestroy(stop) != cudaSuccess) {
			err = cudaPeekAtLastError();
			eprintfl("'cudaEventDestroy(stop)' failed: %s\n", cudaGetErrorString(err));
		}

start_event_out:
		if (cudaEventDestroy(start) != cudaSuccess) {
			err = cudaPeekAtLastError();
			eprintfl("'cudaEventDestroy(start)' failed: %s\n", cudaGetErrorString(err));
		}

enable_access_out:
		if (enable_p2p_access) {
			if (disable_p2p_access_both(cur_gpu, next_gpu) != cudaSuccess) {
				err = cudaPeekAtLastError();
				eprintfl("'disable_p2p_access_both(%zu -> %zu)' failed: %s\n", cur_gpu, next_gpu, cudaGetErrorString(err));
			}
		}

		// If devPtr is 0, no operation is performed
mallocs_out:
		for (size_t i = 0; i < 2; i++) {
			if (cudaSetDevice((int)gpus[i]) != cudaSuccess) {
				err = cudaPeekAtLastError();
				eprintfl("'cudaSetDevice(%zu)' failed: %s\n", gpus[i], cudaGetErrorString(err));
			}

			if (cudaFree(*arrs[i]) != cudaSuccess) {
				err = cudaPeekAtLastError();
				eprintfl("'cudaFree()' failed for '%s': %s\n", (!i) ? "cur_arr" : "next_arr", cudaGetErrorString(err));
			}
			// the pointers are reused for the next peer
			*arrs[i] = NULL;
		}

		if (err != cudaSuccess)
			return err;
	}

	return err;
}

/*
 * Run one test either on a single gpu or on every gpu in turn.
 *
 * test_numb:       which test to run (NR_test_*).
 * elems_numb:      number of array elements handed to the test.
 * gpu_idx:         device to run on when 'run_on_all_gpus' is zero; must be in [0, gpu_count).
 * gpu_count:       number of devices.
 * run_on_all_gpus: non-zero - ignore 'gpu_idx' (apart from the upper bound check) and run on devices
 *                  0 .. gpu_count - 1.
 * threshold:       forwarded to the p2p bandwidth tests, not used by the other tests.
 *
 * The device under test is made current before the test is called. The p2p tests are only run with
 * 'run_on_all_gpus' set and at least two gpus; otherwise a message is printed and the test counts as passed.
 *
 * Returns cudaSuccess when the test passed on every selected device. The first failure stops the run:
 * the error of the failing test is returned, or cudaErrorUnknown for a bad 'gpu_idx' or an unknown
 * 'test_numb'.
 */
cudaError_t start_perf_test(enum test_numbs test_numb, size_t elems_numb, ssize_t gpu_idx, int gpu_count, int run_on_all_gpus, ssize_t threshold)
{
	cudaError_t err = cudaSuccess;

	if (gpu_idx > (gpu_count - 1)) {
		eprintfl("'start_test()' error: gpu_idx=%zd greater or equal than gpu_count=%d\n", gpu_idx, gpu_count);
		return cudaErrorUnknown;
	}

	// a negative index would make the device range below empty:
	// no test would run and success would still be reported
	if (!run_on_all_gpus && gpu_idx < 0) {
		eprintfl("'start_test()' error: gpu_idx=%zd is negative\n", gpu_idx);
		return cudaErrorUnknown;
	}

	// narrow the range [gpu_idx, gpu_count) to the devices to run on
	if (run_on_all_gpus)
		gpu_idx = 0;
	else
		gpu_count = gpu_idx + 1;

	for (size_t i = gpu_idx; i < (size_t)gpu_count; i++) {
		printf("running test on gpu #%zu: ", i);

		err = cudaSetDevice((int)i);
		if (err != cudaSuccess) {
			eprintfl("'cudaSetDevice(%zu)' failed: %s\n", i, cudaGetErrorString(err));
			return err;
		}

		switch (test_numb) {

		// functional tests
		case NR_test_memcpy:
			err = do_test_memcpy(elems_numb);
			break;

		case NR_test_saxpy_unified:
			err = do_test_saxpy_unified(elems_numb);
			break;

		case NR_test_host_async:
			err = do_test_host_async(elems_numb, 0);
			break;

		case NR_test_host_async_batched:
			err = do_test_host_async(elems_numb, 1);
			break;

		case NR_test_p2p:
			if (!run_on_all_gpus || (gpu_count < 2)) {
				eprintfl("running 'test_p2p' makes sense on at least two gpus\n");
				err = cudaSuccess;
				break;
			}

			printf("copying to gpu #%zu: ", (i + 1) % gpu_count);
			err = do_test_p2p(elems_numb, i, gpu_count);
			break;

		case NR_test_dev_sync:
			// cudaSetDevice(i) was called before switch-case, so call only cudaDeviceSynchronize() here
			err = cudaDeviceSynchronize();
			if (err != cudaSuccess)
				eprintfl("'cudaDeviceSynchronize()' failed: %s\n", cudaGetErrorString(err));

			break;

		// performance tests - need to specify test explicitly to run
		case NR_test_p2p_bandwidth:
			if (!run_on_all_gpus || (gpu_count < 2)) {
				eprintfl("running 'test_p2p_bandwidth' makes sense on at least two gpus\n");
				err = cudaSuccess;
				break;
			}

			err = do_test_p2p_bandwidth(elems_numb, i, gpu_count, P2P_BANDWIDTH_MEMCPY, 1, threshold);
			// err = do_test_p2p_bandwidth(elems_numb, i, gpu_count, P2P_BANDWIDTH_MEMCPY, 0, threshold);
			break;

		case NR_test_p2p_bandwidth_p2p_off:
			if (!run_on_all_gpus || (gpu_count < 2)) {
				eprintfl("running 'test_p2p_bandwidth_p2p_off' makes sense on at least two gpus\n");
				err = cudaSuccess;
				break;
			}

			// err = do_test_p2p_bandwidth(elems_numb, i, gpu_count, P2P_BANDWIDTH_MEMCPY_PEER, 1, threshold);
			err = do_test_p2p_bandwidth(elems_numb, i, gpu_count, P2P_BANDWIDTH_MEMCPY_PEER, 0, threshold);
			break;

		case NR_test_p2p_bandwidth_kernel:
			if (!run_on_all_gpus || (gpu_count < 2)) {
				eprintfl("running 'test_p2p_bandwidth_kernel' makes sense on at least two gpus\n");
				err = cudaSuccess;
				break;
			}

			err = do_test_p2p_bandwidth(elems_numb, i, gpu_count, P2P_BANDWIDTH_KERNEL, 1, threshold);
			// kernel doesn't work without p2p enabled
			// err = do_test_p2p_bandwidth(elems_numb, i, gpu_count, P2P_BANDWIDTH_KERNEL, 0, threshold);
			break;

		// add new 'test_*' above

		default:
			eprintfl("'start_test()' failed: unknown test number: %d\n", test_numb);
			err = cudaErrorUnknown;
			break;
		}

		if (err != cudaSuccess) {
			eprintfl("test failed on gpu #%zu\n", i);
			break;
		}

		printf("OK\n");
	}

	return err;
}

// Same as 'start_perf_test()' for callers that have no threshold to pass: 0 is forwarded in its place.
cudaError_t start_test(enum test_numbs test_numb, size_t elems_numb, ssize_t gpu_idx, int gpu_count, int run_on_all_gpus)
{
	const ssize_t threshold = 0;

	return start_perf_test(test_numb, elems_numb, gpu_idx, gpu_count, run_on_all_gpus, threshold);
}

// functional tests
// Entry point of the NR_test_memcpy test: forwards all arguments to 'start_test()'.
cudaError_t test_memcpy(size_t elems_numb, ssize_t gpu_idx, int gpu_count, int run_on_all_gpus)
{
	const enum test_numbs test = NR_test_memcpy;

	return start_test(test, elems_numb, gpu_idx, gpu_count, run_on_all_gpus);
}

// Entry point of the NR_test_saxpy_unified test: forwards all arguments to 'start_test()'.
cudaError_t test_saxpy_unified(size_t elems_numb, ssize_t gpu_idx, int gpu_count, int run_on_all_gpus)
{
	const enum test_numbs test = NR_test_saxpy_unified;

	return start_test(test, elems_numb, gpu_idx, gpu_count, run_on_all_gpus);
}

// Entry point of the NR_test_host_async test: forwards all arguments to 'start_test()'.
cudaError_t test_host_async(size_t elems_numb, ssize_t gpu_idx, int gpu_count, int run_on_all_gpus)
{
	const enum test_numbs test = NR_test_host_async;

	return start_test(test, elems_numb, gpu_idx, gpu_count, run_on_all_gpus);
}

// Entry point of the NR_test_host_async_batched test: forwards all arguments to 'start_test()'.
cudaError_t test_host_async_batched(size_t elems_numb, ssize_t gpu_idx, int gpu_count, int run_on_all_gpus)
{
	const enum test_numbs test = NR_test_host_async_batched;

	return start_test(test, elems_numb, gpu_idx, gpu_count, run_on_all_gpus);
}

// Entry point of the NR_test_p2p test: forwards all arguments to 'start_test()'.
cudaError_t test_p2p(size_t elems_numb, ssize_t gpu_idx, int gpu_count, int run_on_all_gpus)
{
	const enum test_numbs test = NR_test_p2p;

	return start_test(test, elems_numb, gpu_idx, gpu_count, run_on_all_gpus);
}

// Entry point of the NR_test_dev_sync test: forwards all arguments to 'start_test()'.
cudaError_t test_dev_sync(size_t elems_numb, ssize_t gpu_idx, int gpu_count, int run_on_all_gpus)
{
	const enum test_numbs test = NR_test_dev_sync;

	return start_test(test, elems_numb, gpu_idx, gpu_count, run_on_all_gpus);
}

// performance tests - need to specify test explicitly to run
// Entry point of the NR_test_p2p_bandwidth test: forwards all arguments, 'threshold' included,
// to 'start_perf_test()'.
cudaError_t test_p2p_bandwidth(size_t elems_numb, ssize_t gpu_idx, int gpu_count, int run_on_all_gpus, ssize_t threshold)
{
	const enum test_numbs test = NR_test_p2p_bandwidth;

	return start_perf_test(test, elems_numb, gpu_idx, gpu_count, run_on_all_gpus, threshold);
}

// Entry point of the NR_test_p2p_bandwidth_p2p_off test: forwards all arguments, 'threshold' included,
// to 'start_perf_test()'.
cudaError_t test_p2p_bandwidth_p2p_off(size_t elems_numb, ssize_t gpu_idx, int gpu_count, int run_on_all_gpus, ssize_t threshold)
{
	const enum test_numbs test = NR_test_p2p_bandwidth_p2p_off;

	return start_perf_test(test, elems_numb, gpu_idx, gpu_count, run_on_all_gpus, threshold);
}

// Entry point of the NR_test_p2p_bandwidth_kernel test: forwards all arguments, 'threshold' included,
// to 'start_perf_test()'.
cudaError_t test_p2p_bandwidth_kernel(size_t elems_numb, ssize_t gpu_idx, int gpu_count, int run_on_all_gpus, ssize_t threshold)
{
	const enum test_numbs test = NR_test_p2p_bandwidth_kernel;

	return start_perf_test(test, elems_numb, gpu_idx, gpu_count, run_on_all_gpus, threshold);
}

// add new 'test_*' above
