#include <cstring>

#include <stdio.h>

#include <stdlib.h>

#include "cutil.h"

#include "cudpp/cudpp.h"

/**
 * Sorts an array of unsigned integers on the GPU with CUDPP and reports timings.
 *
 * The keys are uploaded to a device buffer, sorted into a second device
 * buffer and copied back over `data`.
 *
 * @param data          Host array of `size` keys; overwritten with the sorted
 *                      keys on success. Not written if any step before the
 *                      copy-back failed.
 * @param size          Number of keys in `data`.
 * @param time          Out: value of the sort timer, which brackets plan
 *                      creation, the CUDPP sort call and the following
 *                      device synchronisation.
 * @param transferTime  Out: value of the transfer timer, which brackets device
 *                      allocation + upload, and download + device free +
 *                      plan destruction.
 * @param merge         true selects CUDPP_SORT_RADIX, false selects
 *                      CUDPP_SORT_RADIX_GLOBAL.
 *
 * Both timings are whatever cutGetTimerValue() reports (CUTIL timers are
 * understood to report milliseconds -- confirm against the CUTIL in use).
 * The function returns void, so failures are reported on stderr; the timing
 * outputs are still written and then only cover the work actually performed.
 */
void cudppSort(unsigned int* data, unsigned int size, double* time, double* transferTime, bool merge)
{
	unsigned int timer, transferTimer;
	cutCreateTimer(&timer);
	cutCreateTimer(&transferTimer);

	unsigned int* ddata1 = 0;	// device buffer holding the unsorted input
	unsigned int* ddata2 = 0;	// device buffer receiving the sorted output
	const size_t memSize = size * sizeof(unsigned int);

	// ---- Phase 1 (transfer timer): allocate device buffers and upload keys ----
	cutStartTimer(transferTimer);

	cudaError_t err = cudaMalloc((void**) &ddata1, memSize);
	if (err == cudaSuccess)
		err = cudaMalloc((void**) &ddata2, memSize);
	if (err == cudaSuccess)
		err = cudaMemcpy(ddata1, data, memSize, cudaMemcpyHostToDevice);

	cutStopTimer(transferTimer);

	bool ok = (err == cudaSuccess);
	if (!ok)
		fprintf(stderr, "cudppSort: device allocation/upload failed: %s\n", cudaGetErrorString(err));

	// ---- Phase 2 (sort timer): build the plan and run the sort ----
	CUDPPHandle sortplan = 0;
	bool planCreated = false;

	if (ok)
	{
		// Make sure no earlier device work leaks into the sort timing.
		cudaThreadSynchronize();
		cutStartTimer(timer);

		// Value-initialise so that any configuration field not set below is
		// zero rather than indeterminate when the struct is passed by value.
		CUDPPConfiguration config = CUDPPConfiguration();
		config.algorithm = merge ? CUDPP_SORT_RADIX : CUDPP_SORT_RADIX_GLOBAL;
		config.datatype = CUDPP_UINT;

		CUDPPResult result = cudppPlan(&sortplan, config, size, 1, 0);
		planCreated = (result == CUDPP_SUCCESS);

		if (planCreated)
		{
			cudppSort(sortplan, ddata2, ddata1, size);

			// Wait for the sort to finish so the timer covers it; the return
			// value also surfaces errors raised by the asynchronous work.
			err = cudaThreadSynchronize();
		}

		cutStopTimer(timer);

		if (!planCreated)
		{
			fprintf(stderr, "cudppSort: cudppPlan failed (CUDPPResult %d)\n", (int) result);
			ok = false;
		}
		else if (err != cudaSuccess)
		{
			fprintf(stderr, "cudppSort: sort failed: %s\n", cudaGetErrorString(err));
			ok = false;
		}
	}

	// ---- Phase 3 (transfer timer): download the result and release resources ----
	cutStartTimer(transferTimer);

	if (ok)
	{
		err = cudaMemcpy(data, ddata2, memSize, cudaMemcpyDeviceToHost);
		if (err != cudaSuccess)
			fprintf(stderr, "cudppSort: download of sorted keys failed: %s\n", cudaGetErrorString(err));
	}

	// Released on every path; cudaFree on a null pointer performs no operation.
	cudaFree(ddata1);
	cudaFree(ddata2);

	if (planCreated && cudppDestroyPlan(sortplan) != CUDPP_SUCCESS)
		fprintf(stderr, "cudppSort: cudppDestroyPlan failed\n");

	cutStopTimer(transferTimer);

	*time = cutGetTimerValue(timer);
	*transferTime = cutGetTimerValue(transferTimer);

	cutDeleteTimer(timer);
	cutDeleteTimer(transferTimer);
}


/**
 * Benchmark driver: fills an array with pseudo-random unsigned keys and runs
 * cudppSort() on it five times, printing the timings of each run.
 *
 * Usage: <program> [merge]
 *   With the optional argument "merge" the CUDPP_SORT_RADIX variant is used
 *   (merge == true); otherwise CUDPP_SORT_RADIX_GLOBAL is used.
 */
int main(int argc, char *argv[])
{
	const unsigned int size = 10000000;

	// cudppSort() requires an explicit choice of sort variant.
	const bool merge = (argc > 1 && std::strcmp(argv[1], "merge") == 0);

	// cudppSort() operates on unsigned int keys, so generate keys of that type.
	unsigned int* data = new unsigned int[size];
	for (unsigned int i = 0; i < size; i++)
		data[i] = (unsigned int) rand();

	double time = 0.0;
	double transferTime = 0.0;
	printf("\nCUDPP Radix with sync..\n");
	for (int i = 0; i < 5; i++)
	{
		// NOTE: the array is sorted in place, so every run after the first
		// operates on already-sorted keys.
		cudppSort(data, size, &time, &transferTime, merge);
		printf("Time: %f, transfer time: %f\n", time, transferTime);
	}

	delete [] data;

	return 0;
}