OpenCL, C++: Unexpected Results of simple sum float vector program -


This is a simple program that reads two files of float4 vectors and calculates the sum of the corresponding numbers. The result is not what I expected!

the main file:

#include <limits.h> #include <stdio.h> #include <stdlib.h> #include <iostream> #include <iomanip> #include <array> #include <fstream> #include <sstream> #include <string> #include <algorithm> #include <iterator>   #ifdef __apple__ #include <opencl/opencl.h> #else #include <cl/cl.h> #include <time.h> #endif    const int number_of_points = 16;  // number of points in both  , b files (number of rows) const int number_of_axis = 4;     // number of points axis in both  , b files (number of columns)   using namespace std;  void checkerror(cl_int err, const char *operation) {   if (err != cl_success)   {     fprintf(stderr, "error during operation '%s': %d\n", operation, err);     exit(1);   } }  int main(int argc, char *argv[]) {     clock_t tstart = clock();     // create 2 input vectors     // working variables     int i;     ifstream input_filea, input_fileb;  // input files     string line;    // transfer row file array     float x;        // transfer word file array     int row = 0;    // number of rows of file a,b (= array)     int col = 0;    // number of rows of file a,b (= array)      // working arrays      // working arrays //  int mem_size_tempa = number_of_points * number_of_axis * sizeof(cl_float); //  int mem_size_tempb = number_of_points * number_of_axis * sizeof(cl_float);      float tempaarray[number_of_points][number_of_axis]={{0}};   // array contains file data     float tempbarray[number_of_points][number_of_axis]={{0}};   // array contains file b data        int mem_size_inputa = number_of_points ;     int mem_size_inputb = number_of_points ;     int mem_size_output = number_of_points ;      float *inputaarray = (float*) malloc(number_of_points*sizeof(cl_float4));   // array contains file data     float *inputbarray = (float*) malloc(number_of_points*sizeof(cl_float4));   // array contains file b data     float *outputarray = (float*) malloc(number_of_points*sizeof(cl_float4));   // array contains file b data       // import input files     
input_filea.open(argv[1]);     input_fileb.open(argv[2]);       // transfer input files data array     // input file arraya     row = 0;     while (getline(input_filea, line))     {          istringstream streama(line);         col = 0;         while(streama >> x){             tempaarray[row][col] = x;             col++;         }         row++;     }      // input file b arrayb     row = 0;     while (getline(input_fileb, line))     {          istringstream streamb(line);         col = 0;         while(streamb >> x){             tempbarray[row][col] = x;             col++;         }         row++;     }      // switch columns of b array     for(int row_of_arrayb = 0; row_of_arrayb < number_of_points; row_of_arrayb++ )     {         float temporary = tempbarray[row_of_arrayb][2];         tempbarray[row_of_arrayb][2] = tempbarray[row_of_arrayb][1];         tempbarray[row_of_arrayb][1] = temporary;     }      // array 3d vectors //    (int row_of_array = 0; row_of_array<number_of_points; row_of_array++) //    { //      inputaarray[row_of_array] = (tempaarray[row_of_array][0], tempaarray[row_of_array][1], tempaarray[row_of_array][2],0); //      inputbarray[row_of_array] = (tempbarray[row_of_array][0], tempbarray[row_of_array][1], tempbarray[row_of_array][2],0); //    }      (int row_of_array=0; row_of_array < number_of_points; row_of_array++)     {          inputaarray[row_of_array*4+0] = tempaarray[row_of_array][0];         inputaarray[row_of_array*4+1] = tempaarray[row_of_array][1];         inputaarray[row_of_array*4+2] = tempaarray[row_of_array][2];         inputaarray[row_of_array*4+3] = 0.0f;  //      inputaarray[row_of_array]= float(4) (tempaarray[row_of_array][0], tempaarray[row_of_array][1], tempaarray[row_of_array][2], 0.0f);          inputbarray[row_of_array*4+0] = tempbarray[row_of_array][0];         inputbarray[row_of_array*4+1] = tempbarray[row_of_array][1];         inputbarray[row_of_array*4+2] = tempbarray[row_of_array][2];         
inputbarray[row_of_array*4+3] = 0.0f;          outputarray[row_of_array*4+0] = 0.0f;         outputarray[row_of_array*4+1] = 0.0f;         outputarray[row_of_array*4+2] = 0.0f;         outputarray[row_of_array*4+3] = 0.0f; //      inputbarray[row_of_array] = (tempbarray[row_of_array][0], tempbarray[row_of_array][1], tempbarray[row_of_array][2],0);      } //    (int row_of_array=0; row_of_array < number_of_points; row_of_array++) //    { //      printf("0: %f, 1: %f, 2: %f, 3: %f \n", inputaarray[row_of_array*number_of_points+0], inputaarray[row_of_array*number_of_points+1], //              inputaarray[row_of_array*number_of_points+2], inputaarray[row_of_array*number_of_points+3]); //    }     // close input files     input_filea.close();     input_fileb.close();         // load kernel source code array source_str     file *fp;     char *source_str;     size_t source_size;      fp = fopen("calculate_bottom_snm_kernel.cl", "r");     if (!fp) {         fprintf(stderr, "failed load kernel.\n");         exit(1);     }      fseek(fp, 0, seek_end);     size_t programlength = ftell(fp);     rewind(fp);      source_str = (char*)malloc(programlength+1);     source_size = fread( source_str, 1, programlength, fp);     source_str[programlength] = '\0';     fclose( fp );      // platform , device information     cl_platform_id platform_id = null;     cl_device_id device_id = null;     cl_uint ret_num_devices;     cl_uint ret_num_platforms;     cl_int ret = clgetplatformids(1, &platform_id, &ret_num_platforms);     ret = clgetdeviceids( platform_id, cl_device_type_all, 1,             &device_id, &ret_num_devices);      // create opencl context     cl_context context = clcreatecontext( null, 1, &device_id, null, null, &ret);      // create command queue     cl_command_queue command_queue = clcreatecommandqueue(context, device_id, 0, &ret);      // create memory buffers on device each vector     cl_mem inputa_mem_obj = clcreatebuffer(context, cl_mem_read_only,             
mem_size_inputa*sizeof(cl_float4) , null, &ret);     cl_mem inputb_mem_obj = clcreatebuffer(context, cl_mem_read_only,             mem_size_inputb*sizeof(cl_float4), null, &ret);      cl_mem output_mem_obj = clcreatebuffer(context, cl_mem_write_only,             mem_size_output*sizeof(cl_float4), null, &ret);       // copy lists , b respective memory buffers     ret = clenqueuewritebuffer(command_queue, inputa_mem_obj, cl_true, 0,             mem_size_inputa*sizeof(cl_float4), inputaarray, 0, null, null);     ret = clenqueuewritebuffer(command_queue, inputb_mem_obj, cl_true, 0,             mem_size_inputb*sizeof(cl_float4), inputbarray, 0, null, null);       // create program kernel source     cl_program program = clcreateprogramwithsource(context, 1,             (const char **)&source_str, (const size_t *)&source_size, &ret);      // build program      ret = clbuildprogram(program, 1, &device_id, null, null, null);     if (ret == cl_build_program_failure)       {         // size of build log         size_t logsize;         ret = clgetprogrambuildinfo(program, device_id, cl_program_build_log,                                     0, null, &logsize);         checkerror(ret, "getting build log size");          // build log         char log[logsize];         ret = clgetprogrambuildinfo(program, device_id, cl_program_build_log,                                     logsize, log, null);         checkerror(ret, "getting build log");          printf("opencl program build log:\n%s\n", log);         exit(1);       }       // create opencl kernel     cl_kernel kernel = clcreatekernel(program, "calculate_bottom_snm", &ret);      // set arguments of kernel     ret = clsetkernelarg(kernel, 0, sizeof(cl_mem), (void *)&inputa_mem_obj);     ret = clsetkernelarg(kernel, 1, sizeof(cl_mem), (void *)&inputb_mem_obj);     ret = clsetkernelarg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj);      // execute opencl kernel on list     size_t global_item_size = number_of_points; // process 
entire lists     size_t local_item_size = 4; // process in groups of 64      ret = clenqueuendrangekernel(command_queue, kernel, 1, null,             &global_item_size, &local_item_size, 0, null, null);      // read memory buffer c on device local variable c //    int *c = (int*)malloc(sizeof(int)*number_of_points);   //    float *c = (float*)malloc(sizeof(float)*number_of_points);     ret = clenqueuereadbuffer(command_queue, output_mem_obj, cl_true, 0,             mem_size_output, outputarray, 0, null, null);       // display result screen //    float buttomsnm = 0;     for(i = 0; < number_of_points; i++)     {             printf("%f + %f = %f, \n",inputaarray[i*4+0],inputbarray[i*4+0], outputarray[i*4+0]);     }      // clean     ret = clflush(command_queue);     ret = clfinish(command_queue);     ret = clreleasekernel(kernel);     ret = clreleaseprogram(program);     ret = clreleasememobject(inputa_mem_obj);     ret = clreleasememobject(inputb_mem_obj);     ret = clreleasememobject(output_mem_obj);     ret = clreleasecommandqueue(command_queue);     ret = clreleasecontext(context);     free (inputaarray);     free (inputbarray);     free (outputarray);  printf("all time taken: %.2fs\n", (double)(clock() - tstart)/clocks_per_sec);     return 0; } 

kernel:

// Element-wise sum of two float4 arrays: outputarray[i] = inputaarray[i] + inputbarray[i].
// Launch with one work-item per float4 element (1-D NDRange).
__kernel void calculate_bottom_snm(__global float4 *inputaarray,
                                   __global float4 *inputbarray,
                                   __global float4 *outputarray)
{
    // index of the current element
    const size_t i = get_global_id(0);

    // float4 addition is component-wise, so this covers x, y, z and w
    // (first through fourth component) in one statement.
    outputarray[i] = inputaarray[i] + inputbarray[i];
}

the first input file a:

0   0.000000e+00    9.998994e-01     1   1.000000e-03    9.998981e-01     2   2.000000e-03    9.998967e-01     3   3.000000e-03    9.998953e-01     4   4.000000e-03    9.998939e-01     5   5.000000e-03    9.998925e-01     6   6.000000e-03    9.998911e-01     7   7.000000e-03    9.998896e-01     8   8.000000e-03    9.998881e-01     9   9.000000e-03    9.998865e-01     10  1.000000e-02    9.998850e-01     11  1.100000e-02    9.998834e-01     12  1.200000e-02    9.998817e-01     13  1.300000e-02    9.998800e-01     14  1.400000e-02    9.998783e-01     15  1.500000e-02    9.998766e-01 

the second input file b:

0   0.000000e+00    9.998966e-01     1   1.000000e-03    9.998953e-01     2   2.000000e-03    9.998939e-01     3   3.000000e-03    9.998925e-01     4   4.000000e-03    9.998911e-01     5   5.000000e-03    9.998896e-01     6   6.000000e-03    9.998881e-01     7   7.000000e-03    9.998866e-01     8   8.000000e-03    9.998850e-01     9   9.000000e-03    9.998834e-01     10  1.000000e-02    9.998818e-01     11  1.100000e-02    9.998801e-01     12  1.200000e-02    9.998785e-01     13  1.300000e-02    9.998767e-01     14  1.400000e-02    9.998750e-01     15  1.500000e-02    9.998732e-01 

The output should be the element-wise sum of the two files above. I printed only the first column; the other columns show the same behavior:

the output:

0.000000 + 0.000000 = 0.000000,  1.000000 + 1.000000 = 0.000000,  2.000000 + 2.000000 = 0.000000,  3.000000 + 3.000000 = 0.000000,  4.000000 + 4.000000 = 0.000000,  5.000000 + 5.000000 = 0.000000,  6.000000 + 6.000000 = 0.000000,  7.000000 + 7.000000 = 0.000000,  8.000000 + 8.000000 = 0.000000,  9.000000 + 9.000000 = 0.000000,  10.000000 + 10.000000 = 0.000000,  11.000000 + 11.000000 = 0.000000,  12.000000 + 12.000000 = 0.000000,  13.000000 + 13.000000 = 0.000000,  14.000000 + 14.000000 = 0.000000,  15.000000 + 15.000000 = 0.000000,  time taken: 0.07s 

thanks in advance,

You are not copying the correct number of bytes from the device to the host:

int mem_size_output = number_of_points ;  ...  ret = clenqueuereadbuffer(command_queue, output_mem_obj, cl_true, 0,         mem_size_output, outputarray, 0, null, null); 

The amount of data in the buffer is number_of_points * sizeof(cl_float4), so that is the size (in bytes) that must be passed to the read call.


Comments

Popular posts from this blog

How to run C# code using mono without Xamarin in Android? -

c# - SharpSsh Command Execution -

python - Specify path of savefig with pylab or matplotlib -