How to implement nested loops in cuda thrust -
i have run nested loop follow:
for(int = 0; < n; i++){ for(int j = i+1; j <= n; j++){ compute(...)//some calculation here } }
i've tried leaving first loop in cpu
, second loop in gpu
. results too many memory access
. there other ways it? example thrust::reduce_by_key
?
the whole program here:
#include <thrust/device_vector.h> #include <thrust/host_vector.h> #include <thrust/generate.h> #include <thrust/sort.h> #include <thrust/binary_search.h> #include <thrust/iterator/counting_iterator.h> #include <thrust/random.h> #include <cmath> #include <iostream> #include <iomanip> #define n 1000000 // define 2d point pair typedef thrust::tuple<float, float> point; // return random point in [0,1)^2 point make_point(void) { static thrust::default_random_engine rng(12345); static thrust::uniform_real_distribution<float> dist(0.0f, 1.0f); float x = dist(rng); float y = dist(rng); return point(x,y); } struct sqrt_dis: public thrust::unary_function<point, double> { float x, y; double tmp; sqrt_dis(float _x, float _y): x(_x), y(_y){} __host__ __device__ float operator()(point a) { tmp =(thrust::get<0>(a)-x)*(thrust::get<0>(a)-x)+\ (thrust::get<1>(a)-y)*(thrust::get<1>(a)-y); tmp = -1.0*(sqrt(tmp)); return (1.0/tmp); } }; int main(void) { clock_t t1, t2; double result; t1 = clock(); // allocate random points in unit square on host thrust::host_vector<point> h_points(n); thrust::generate(h_points.begin(), h_points.end(), make_point); // transfer device thrust::device_vector<point> points = h_points; thrust::plus<double> binary_op; float init = 0; for(int = 0; < n; i++){ point tmp_i = points[i]; float x = thrust::get<0>(tmp_i); float y = thrust::get<1>(tmp_i); result += thrust::transform_reduce(points.begin()+i,\ points.end(),sqrt_dis(x,y),\ init,binary_op); std::cout<<"result"<<i<<": "<<result<<std::endl; } t2 = clock()-t1; std::cout<<"result: "; std::cout.precision(10); std::cout<< result <<std::endl; std::cout<<"run time: "<<t2/clocks_per_sec<<"s"<<std::endl; return 0; }
edit: have posted example, here how solve it:
you have n
2d points stored in linear array (here n=4
)
points = [p0 p1 p2 p3]
based on code assume want calculate:
result = f(p0, p1) + f(p0, p2) + f(p0, p3) + f(p1, p2) + f(p1, p3) + f(p2, p3)
where f()
distance function needs executed m
times in total:
m = (n-1)*n/2
in example: m=6
you can @ problem triangular matrix:
[ p0 p1 p2 p3 ] [ p1 p2 p3 ] [ p2 p3 ] [ p3 ]
transforming matrix linear vector m
elements while leaving out diagonal elements results in:
[p1 p2 p3 p2 p3 p3]
the index of element in vector k = [0,m-1]
. index k
can remapped columns , rows of triangular matrix k -> (i,j)
:
i = n - 2 - floor(sqrt(-8*k + 4*n*(n-1)-7)/2.0 - 0.5) j = k + + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2
i
row , j
column.
in our example:
0 -> (0, 1) 1 -> (0, 2) 2 -> (0, 3) 3 -> (1, 2) 4 -> (1, 3) 5 -> (2, 3)
now can put , execute modified distance functor m
times applies aforementioned mapping corresponding pairs based on index , sum everything.
i modified code accordingly:
#include <thrust/device_vector.h> #include <thrust/generate.h> #include <thrust/iterator/counting_iterator.h> #include <thrust/transform_reduce.h> #include <thrust/random.h> #include <math.h> #include <iostream> #include <stdio.h> #include <stdint.h> #define print_debug typedef float float; // define 2d point pair typedef thrust::tuple<float, float> point; // return random point in [0,1)^2 point make_point(void) { static thrust::default_random_engine rng(12345); static thrust::uniform_real_distribution<float> dist(0.0, 1.0); float x = dist(rng); float y = dist(rng); return point(x,y); } struct sqrt_dis_new { typedef thrust::device_ptr<point> devptr; devptr points; const uint64_t n; __host__ sqrt_dis_new(uint64_t n, devptr p) : n(n), points(p) { } __device__ float operator()(uint64_t k) const { // calculate indices in triangular matrix const uint64_t = n - 2 - floor(sqrt((double)(-8*k + 4*n*(n-1)-7))/2.0 - 0.5); const uint64_t j = k + + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2; #ifdef print_debug printf("%llu -> (%llu, %llu)\n", k,i,j); #endif const point& p1 = *(points.get()+j); const point& p2 = *(points.get()+i); const float xm = thrust::get<0>(p1)-thrust::get<0>(p2); const float ym = thrust::get<1>(p1)-thrust::get<1>(p2); return 1.0/(-1.0 * sqrt(xm*xm + ym*ym)); } }; int main() { const uint64_t n = 4; // allocate random points in unit square on host thrust::host_vector<point> h_points(n); thrust::generate(h_points.begin(), h_points.end(), make_point); // transfer device thrust::device_vector<point> d_points = h_points; const uint64_t count = (n-1)*n/2; std::cout << count << std::endl; thrust::plus<float> binary_op; const float init = 0.0; float result = thrust::transform_reduce(thrust::make_counting_iterator((uint64_t)0), thrust::make_counting_iterator(count), sqrt_dis_new(n, d_points.data()), init, binary_op); std::cout.precision(10); std::cout<<"result: " << result << std::endl; return 0; }
it depends on compute
function not specify. unroll loops , launch kernel in 2d manner every combination of i
, j
if computations independent. have @ thrust examples , identify similar use cases problem.
Comments
Post a Comment