From 7d1cc671439ad1384f6eeba57215ccb2824ad9ca Mon Sep 17 00:00:00 2001
From: MitchellHansen <mitchellhansen0@gmail.com>
Date: Wed, 23 Dec 2015 00:22:02 -0800
Subject: [PATCH] I thiiiiink that the gpu is computing things correctly now,
 but I'm having trouble getting data back out of the gpu. More tomorrow

---
 Conway_OpenCL/HelloWorld.cpp       | 203 ++++++++++++++++-------------
 Conway_OpenCL/HelloWorld_Kernel.cl |  22 ++--
 2 files changed, 122 insertions(+), 103 deletions(-)
diff --git a/Conway_OpenCL/HelloWorld.cpp b/Conway_OpenCL/HelloWorld.cpp
index 81a962c..c47cb12 100644
--- a/Conway_OpenCL/HelloWorld.cpp
+++ b/Conway_OpenCL/HelloWorld.cpp
@@ -8,11 +8,27 @@
 #include <random>
 #include <ctime>
 #include <SFML/Graphics.hpp>
+#include <windows.h>
 
 #define SUCCESS 0
 #define FAILURE 1
 
-using namespace std;
+
+
+float elap_time() {
+	static __int64 start = 0;
+	static __int64 frequency = 0;
+
+	if (start == 0) {
+		QueryPerformanceCounter((LARGE_INTEGER*)&start);
+		QueryPerformanceFrequency((LARGE_INTEGER*)&frequency);
+		return 0.0f;
+	}
+
+	__int64 counter = 0;
+	QueryPerformanceCounter((LARGE_INTEGER*)&counter);
+	return (float)((counter - start) / double(frequency));
+}
 
 /* convert the kernel file into a string */
 int convertToString(const char *filename, std::string& s)
@@ -41,43 +57,30 @@ int convertToString(const char *filename, std::string& s)
 		delete[] str;
 		return 0;
 	}
-	cout<<"Error: failed to open file\n:"<<filename<<endl;
+	std::cout << "Error: failed to open file\n:" << filename << std::endl;
 	return FAILURE;
 }
 
 int main(int argc, char* argv[])
 {
+	int WINDOW_X = 1000;
+	int WINDOW_Y = 1000;
+	int GRID_WIDTH = 1000;
+	int GRID_HEIGHT = 1000;
+	int WORKER_SIZE = 1000;
 
-	
-	// 1000 x 1000 grid
-	std::mt19937 rng(time(NULL));
-	std::uniform_int_distribution<int> rgen(0, 4); // 25% chance
-
-	char* grid = new char[1000 * 1000 * 2];
-
-	for (int i = 0; i < 1000 * 1000 * 2; i += 2) {
-		if (rgen(rng) == 1) {
-			grid[i] = 1;
-			grid[i + 1] = 1;
-		}
-		else {
-			grid[i] = 0;
-			grid[i + 1] = 0;
-		}
-	}
-
+	// ============================== OpenCL Setup ==================================================================
 
 	/*Step1: Getting platforms and choose an available one.*/
 	cl_uint numPlatforms;	//the NO. of platforms
 	cl_platform_id platform = NULL;	//the chosen platform
 	cl_int	status = clGetPlatformIDs(0, NULL, &numPlatforms);
-	if (status != CL_SUCCESS)
-	{
-		cout << "Error: Getting platforms!" << endl;
+	if (status != CL_SUCCESS) {
+		std::cout << "Error: Getting platforms!" << std::endl;
 		return FAILURE;
 	}
 
-	/*For clarity, choose the first available platform. */
+	 // Choose the first available platform
 	if(numPlatforms > 0)
 	{
 		cl_platform_id* platforms = (cl_platform_id* )malloc(numPlatforms* sizeof(cl_platform_id));
@@ -90,16 +93,14 @@ int main(int argc, char* argv[])
 	cl_uint				numDevices = 0;
 	cl_device_id        *devices;
 	status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);	
-	if (numDevices == 0)	//no GPU available.
-	{
-		cout << "No GPU device available." << endl;
-		cout << "Choose CPU as default device." << endl;
+	if (numDevices == 0) { //no GPU available.
+		std::cout << "No GPU device available." << std::endl;
+		std::cout << "Choose CPU as default device." << std::endl;
 		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices);	
 		devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
 		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numDevices, devices, NULL);
 	}
-	else
-	{
+	else {
 		devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
 		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
 	}
@@ -111,51 +112,73 @@ int main(int argc, char* argv[])
 	/*Step 4: Creating command queue associate with the context.*/
 	cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
 
+
+	// ============================== Kernel Compilation, Setup ====================================================
+
 	/*Step 5: Create program object */
 	const char *filename = "HelloWorld_Kernel.cl";
-	string sourceStr;
+	std::string sourceStr;
 	status = convertToString(filename, sourceStr);
 	const char *source = sourceStr.c_str();
 	size_t sourceSize[] = {strlen(source)};
 	cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
 	
-	/*Step 6: Build program. */
+	// Build program and set kernel
 	status=clBuildProgram(program, 1,devices,NULL,NULL,NULL);
 
-	/*Step 7: Initial input,output for the host and create memory objects for the kernel*/
-	const char* input = "GdkknVnqkc";
-	size_t strlength = strlen(input);
-	cout << "input string:" << endl;
-	cout << input << endl;
-	char *output = (char*) malloc(strlength + 1);
+	if (status == CL_BUILD_PROGRAM_FAILURE) {
+		// Determine the size of the log
+		size_t log_size;
+		clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
 
-	cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, (strlength + 1) * sizeof(char),(void *) input, NULL);
-	cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY , (strlength + 1) * sizeof(char), NULL, NULL);
+		// Allocate memory for the log
+		char *log = (char *)malloc(log_size);
 
-	/*Step 8: Create kernel object */
-	cl_kernel kernel = clCreateKernel(program,"helloworld", NULL);
+		// Get the log
+		clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
 
-	/*Step 9: Sets Kernel arguments.*/
-	status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
-	status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
-	
-	// ======================================= START SFML ==========================================================
+		// Print the log
+		printf("%s\n", log);
+	}
 
+	cl_kernel kernel = clCreateKernel(program, "helloworld", NULL);
+
+	// ======================================= Setup grid =========================================================
+
+	// Setup the rng
+	std::mt19937 rng(time(NULL));
+	std::uniform_int_distribution<int> rgen(0, 4); // 25% chance
+
+	// Init the grid 
+	char* grid = new char[GRID_WIDTH * GRID_HEIGHT* 2];
+
+	for (int i = 0; i < 1000 * 1000 * 2; i += 2) {
+		if (rgen(rng) == 1) {
+			grid[i] = 1;
+			grid[i + 1] = 1;
+		}
+		else {
+			grid[i] = 0;
+			grid[i + 1] = 0;
+		}
+	}
+
+	// ====================================== Setup SFML ==========================================================
 
 	// Spites for drawing, probably where the biggest slowdown is
 	sf::RectangleShape live_node;
 	live_node.setFillColor(sf::Color(145, 181, 207));
-	live_node.setSize(sf::Vector2f(WINDOW_X / Node::x_bound, WINDOW_Y / Node::y_bound));
+	live_node.setSize(sf::Vector2f(1, 1));
 
 	// Init window, and loop data
-	sf::RenderWindow window(sf::VideoMode(WINDOW_X, WINDOW_Y), "Classic Games");
+	sf::RenderWindow window(sf::VideoMode(GRID_WIDTH, GRID_HEIGHT), "Classic Games");
 
 	float step_size = 0.0005f;
 	double frame_time = 0.0, elapsed_time = 0.0, delta_time = 0.0, accumulator_time = 0.0, current_time = 0.0;
 	int frame_count = 0;
 
-	std::stack<std::thread> thread_stack;
 
+	// ===================================== Loop ==================================================================
 	while (window.isOpen()) {
 
 		sf::Event event;
@@ -165,7 +188,7 @@ int main(int argc, char* argv[])
 		}
 
 		// Time keeping
-		elapsed_time = elap_time();
+		//elapsed_time = elap_time();
 		delta_time = elapsed_time - current_time;
 		current_time = elapsed_time;
 		if (delta_time > 0.02f)
@@ -174,77 +197,73 @@ int main(int argc, char* argv[])
 
 		while ((accumulator_time - step_size) >= step_size) {
 			accumulator_time -= step_size;
-
 			// Do nothing, FPS tied update()
 		}
 
-		// Implicit dead node color
-		window.clear(sf::Color(49, 68, 72));
+		// ======================================= OpenCL Shtuff =============================================
 
-		for (int i = 0; i < 12; i++) {
-			thread_stack.emplace(updateRange, &node_vec, (node_vec.size() / 12)* i, (node_vec.size() / 12)* (i + 1));
-		}
-		while (!thread_stack.empty()) {
-			thread_stack.top().join();
-			thread_stack.pop();
-		}
+		int err = 0;
+		cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, GRID_WIDTH * GRID_HEIGHT * 2 * sizeof(char), (void*)grid, &err);
+		cl_mem workerCountBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int), &WORKER_SIZE, &err);
+		cl_mem gridWidthBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int), &GRID_WIDTH, &err);
+		cl_mem gridHeightBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int), &GRID_HEIGHT, &err);
+
+		status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
+		status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&workerCountBuffer);
+		status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&gridWidthBuffer);
+		status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&gridHeightBuffer);
+
+		// One work item per group, don't really know if this impacts performance
+		size_t global_work_size[1] = { 1 };
 
-		//for (int i = 0; i < node_vec.size(); i++) {
-		//	node_vec.at(i).Update(&node_vec);
-		//}
+		// Run the kernel
+		status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
 
-		for (int i = 0; i < node_vec.size(); i++) {
-			node_vec[i].ShiftState();
+		// Get output, put back into grid
+		cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, GRID_WIDTH * GRID_HEIGHT * 2 * sizeof(char), NULL, NULL);
+		status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, GRID_WIDTH * GRID_HEIGHT * 2 * sizeof(char), grid, 0, NULL, NULL);
+
+		// Temporary
+		status = clReleaseMemObject(inputBuffer);
+		status = clReleaseMemObject(workerCountBuffer);
+		status = clReleaseMemObject(gridWidthBuffer);
+		status = clReleaseMemObject(gridHeightBuffer);
+
+		// Swap status's
+		for (int i = 0; i < GRID_WIDTH * GRID_HEIGHT * 2; i += 2) {
+			grid[i] = grid[i + 1];
 		}
 
-		for (int i = 0; i < node_vec.size(); i++) {
-			if (node_vec.at(i).CurrentState() == true) {
-				live_node.setPosition((i % Node::x_bound) * live_node.getGlobalBounds().width, (i / Node::x_bound) * live_node.getGlobalBounds().height);
+		for (int i = 0; i < GRID_WIDTH * GRID_HEIGHT * 2; i += 2) {
+			if (!grid[i]) {
+				live_node.setPosition(sf::Vector2f((i % GRID_WIDTH) * (i / GRID_WIDTH), i / GRID_WIDTH));
 				window.draw(live_node);
 			}
-			else {
-				//dead_node.setPosition(i % Node::x_bound * dead_node.getGlobalBounds().width, i / Node::x_bound * dead_node.getGlobalBounds().height);
-				//window.draw(live_node);
-			}
 		}
 
+		// Implicit dead node color
+		window.clear(sf::Color(49, 68, 72));
+
+
 		frame_count++;
 		window.display();
 
+	}
 
-	// ======================================= END SFML ==========================================================
 
-	/*Step 10: Running the kernel.*/
-	size_t global_work_size[1] = {strlength};
-	status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
 
-	/*Step 11: Read the cout put back to host memory.*/
-	status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, strlength * sizeof(char), output, 0, NULL, NULL);
-	
-	output[strlength] = '\0';	//Add the terminal character to the end of output.
-	cout << "\noutput string:" << endl;
-	cout << output << endl;
 
 	/*Step 12: Clean the resources.*/
 	status = clReleaseKernel(kernel);				//Release kernel.
 	status = clReleaseProgram(program);				//Release the program object.
-	status = clReleaseMemObject(inputBuffer);		//Release mem object.
-	status = clReleaseMemObject(outputBuffer);
 	status = clReleaseCommandQueue(commandQueue);	//Release  Command queue.
 	status = clReleaseContext(context);				//Release context.
 
-	if (output != NULL)
-	{
-		free(output);
-		output = NULL;
-	}
-
 	if (devices != NULL)
 	{
 		free(devices);
 		devices = NULL;
 	}
 
-	std::cout<<"Passed!\n";
 	return SUCCESS;
 }
\ No newline at end of file
diff --git a/Conway_OpenCL/HelloWorld_Kernel.cl b/Conway_OpenCL/HelloWorld_Kernel.cl
index 22bb4bd..c3a884f 100644
--- a/Conway_OpenCL/HelloWorld_Kernel.cl
+++ b/Conway_OpenCL/HelloWorld_Kernel.cl
@@ -1,11 +1,11 @@
-__kernel void helloworld(__global char* in, __global int num_workers, __global int grid_width, __global int grid_height)
+__kernel void helloworld(__global char* in, __global int* num_workers, __global int* grid_width, __global int* grid_height)
 {
 	// Caclulate the start and end range that this worker will be calculating
 
-	int data_length = grid_width * grid_height;
+	int data_length = *grid_width * *grid_height;
 
-	int start_range = (data_length / num_workers) * get_global_id(0) * 2; // * 2 = padding
-	int end_range = (data_length / num_workers) * (get_global_id(0) + 1) * 2;
+	int start_range = (data_length / *num_workers) * get_global_id(0) * 2; // * 2 = padding
+	int end_range = (data_length / *num_workers) * (get_global_id(0) + 1) * 2;
 	
 	// x, y + 1
 
@@ -14,30 +14,30 @@ __kernel void helloworld(__global char* in, __global int num_workers, __global i
 	for (int i = start_range; i < end_range; i += 2){
 		
 		// add all 8 blocks to neghbors
-
+		neighbors = 0;
 		// Top
-		neighbors += in[i - grid_width * 2];
+		neighbors += in[i - *grid_width * 2];
 
 		// Top right
-		neightbors += in[i - grid_width * 2 + 2];
+		neighbors += in[i - *grid_width * 2 + 2];
 
 		// Right
 		neighbors += in[i + 2];
 
 		// Bottom Right
-		neighbors += in[i + grid_width * 2 + 2];
+		neighbors += in[i + *grid_width * 2 + 2];
 
 		// Bottom
-		neighbors += in[i + grid_width * 2];
+		neighbors += in[i + *grid_width * 2];
 
 		// Bottom Left
-		neighbors += in[i + grid_width * 2 - 2];
+		neighbors += in[i + *grid_width * 2 - 2];
 
 		// Left
 		neighbors += in[i - 2];
 
 		// Top left
-		neighbors += in[i - grid_width * 2 - 2];
+		neighbors += in[i - *grid_width * 2 - 2];
 
 		// push living status to the padded second char