How to implement nested loops in cuda thrust -


i have run nested loop follow:

for(int = 0; < n; i++){     for(int j = i+1; j <= n; j++){         compute(...)//some calculation here     } } 

i've tried leaving first loop in cpu , second loop in gpu. results too many memory access. there other ways it? example thrust::reduce_by_key?

the whole program here:

#include <thrust/device_vector.h> #include <thrust/host_vector.h> #include <thrust/generate.h> #include <thrust/sort.h> #include <thrust/binary_search.h> #include <thrust/iterator/counting_iterator.h> #include <thrust/random.h> #include <cmath> #include <iostream> #include <iomanip>  #define n 1000000  // define 2d point pair typedef thrust::tuple<float, float> point;  // return random point in [0,1)^2 point make_point(void) {   static thrust::default_random_engine rng(12345);   static thrust::uniform_real_distribution<float> dist(0.0f, 1.0f);   float x = dist(rng);   float y = dist(rng);   return point(x,y); }  struct sqrt_dis: public thrust::unary_function<point, double> {   float x, y;   double tmp;   sqrt_dis(float _x, float _y): x(_x), y(_y){}   __host__ __device__   float operator()(point a)  {     tmp =(thrust::get<0>(a)-x)*(thrust::get<0>(a)-x)+\     (thrust::get<1>(a)-y)*(thrust::get<1>(a)-y);     tmp = -1.0*(sqrt(tmp));     return (1.0/tmp);  }  };   int main(void) {   clock_t t1, t2;   double result;    t1 = clock();   // allocate random points in unit square on host   thrust::host_vector<point> h_points(n);   thrust::generate(h_points.begin(), h_points.end(), make_point);    // transfer device   thrust::device_vector<point> points = h_points;    thrust::plus<double> binary_op;   float init = 0;    for(int = 0; < n; i++){     point tmp_i = points[i];     float x = thrust::get<0>(tmp_i);     float y = thrust::get<1>(tmp_i);     result += thrust::transform_reduce(points.begin()+i,\                                        points.end(),sqrt_dis(x,y),\                                        init,binary_op);     std::cout<<"result"<<i<<": "<<result<<std::endl;   }   t2 = clock()-t1;   std::cout<<"result: ";   std::cout.precision(10);   std::cout<< result <<std::endl;   std::cout<<"run time: "<<t2/clocks_per_sec<<"s"<<std::endl;   return 0;  } 

edit: have posted example, here how solve it:

you have n 2d points stored in linear array (here n=4)

points = [p0 p1 p2 p3] 

based on code assume want calculate:

result = f(p0, p1) + f(p0, p2) + f(p0, p3) +          f(p1, p2) + f(p1, p3) +          f(p2, p3) 

where f() distance function needs executed m times in total:

m = (n-1)*n/2 

in example: m=6

you can @ problem triangular matrix:

[ p0 p1 p2 p3 ]  [    p1 p2 p3 ] [       p2 p3 ] [          p3 ] 

transforming matrix linear vector m elements while leaving out diagonal elements results in:

[p1 p2 p3 p2 p3 p3] 

the index of element in vector k = [0,m-1]. index k can remapped columns , rows of triangular matrix k -> (i,j):

i = n - 2 - floor(sqrt(-8*k + 4*n*(n-1)-7)/2.0 - 0.5) j = k + + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2 

i row , j column.

in our example:

0 -> (0, 1) 1 -> (0, 2) 2 -> (0, 3) 3 -> (1, 2) 4 -> (1, 3) 5 -> (2, 3) 

now can put , execute modified distance functor m times applies aforementioned mapping corresponding pairs based on index , sum everything.

i modified code accordingly:

#include <thrust/device_vector.h> #include <thrust/generate.h> #include <thrust/iterator/counting_iterator.h> #include <thrust/transform_reduce.h> #include <thrust/random.h> #include <math.h> #include <iostream> #include <stdio.h> #include <stdint.h>  #define print_debug  typedef float float;  // define 2d point pair typedef thrust::tuple<float, float> point;  // return random point in [0,1)^2 point make_point(void) {     static thrust::default_random_engine rng(12345);     static thrust::uniform_real_distribution<float> dist(0.0, 1.0);     float x = dist(rng);     float y = dist(rng);     return point(x,y); }   struct sqrt_dis_new {     typedef thrust::device_ptr<point> devptr;      devptr points;     const uint64_t n;      __host__     sqrt_dis_new(uint64_t n, devptr p) : n(n), points(p)     {     }      __device__      float operator()(uint64_t k) const     {         // calculate indices in triangular matrix         const uint64_t = n - 2 - floor(sqrt((double)(-8*k + 4*n*(n-1)-7))/2.0 - 0.5);         const uint64_t j = k + + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2;  #ifdef print_debug         printf("%llu -> (%llu, %llu)\n", k,i,j); #endif          const point& p1 = *(points.get()+j);         const point& p2 = *(points.get()+i);          const float xm = thrust::get<0>(p1)-thrust::get<0>(p2);         const float ym = thrust::get<1>(p1)-thrust::get<1>(p2);          return 1.0/(-1.0 * sqrt(xm*xm + ym*ym));     } };   int main() {     const uint64_t n = 4;      // allocate random points in unit square on host     thrust::host_vector<point> h_points(n);     thrust::generate(h_points.begin(), h_points.end(), make_point);      // transfer device     thrust::device_vector<point> d_points = h_points;      const uint64_t count = (n-1)*n/2;      std::cout << count << std::endl;       thrust::plus<float> binary_op;     const float init = 0.0;      float result = thrust::transform_reduce(thrust::make_counting_iterator((uint64_t)0),                                             thrust::make_counting_iterator(count),                                             sqrt_dis_new(n, d_points.data()),                                             init,                                             binary_op);      std::cout.precision(10);       std::cout<<"result: " << result << std::endl;      return 0; }     

it depends on compute function not specify. unroll loops , launch kernel in 2d manner every combination of i , j if computations independent. have @ thrust examples , identify similar use cases problem.


Comments

Popular posts from this blog

google chrome - Developer tools - How to inspect the elements which are added momentarily (by JQuery)? -

angularjs - Showing an empty as first option in select tag -

php - Cloud9 cloud IDE and CakePHP -