ext/cumo/narray/index.c in cumo-0.2.1 vs ext/cumo/narray/index.c in cumo-0.2.2
- lines only in cumo-0.2.1 (old)
+ lines only in cumo-0.2.2 (new)
@@ -389,16 +389,18 @@
for (; i>0; i--) {
strides[i-1] = strides[i] * na->base.shape[i];
}
}
+void cumo_na_index_aref_nadata_index_stride_kernel_launch(size_t *idx, ssize_t s1, uint64_t n);
+
static void
cumo_na_index_aref_nadata(cumo_narray_data_t *na1, cumo_narray_view_t *na2,
cumo_na_index_arg_t *q, ssize_t elmsz, int ndim, int keep_dim)
{
int i, j;
- ssize_t size, k, total=1;
+ ssize_t size, total=1;
ssize_t stride1;
ssize_t *strides_na1;
size_t *index;
ssize_t beg, step;
VALUE m;
@@ -423,19 +425,14 @@
na2->base.reduce = rb_funcall(m,'|',1,na2->base.reduce);
}
// array index
if (q[i].idx != NULL) {
- CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_nadata", "any");
- cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
-
index = q[i].idx;
CUMO_SDX_SET_INDEX(na2->stridx[j],index);
q[i].idx = NULL;
- for (k=0; k<size; k++) {
- index[k] = index[k] * stride1;
- }
+ cumo_na_index_aref_nadata_index_stride_kernel_launch(index, stride1, size);
} else {
beg = q[i].beg;
step = q[i].step;
na2->offset += stride1*beg;
CUMO_SDX_SET_STRIDE(na2->stridx[j], stride1*step);
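Both hunks above make the same change: the host-side loop that scaled each gathered index by the source stride, together with the cudaDeviceSynchronize() it forced, is replaced by a kernel launch that rewrites the index array directly on the device. The launch wrapper is only forward-declared in index.c; its definition presumably lives in a separate CUDA translation unit. A minimal sketch of what that definition could look like, assuming a plain grid-stride kernel and an illustrative launch configuration (the kernel name, block size, and extern "C" linkage are assumptions, not cumo's actual code):

#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>   // ssize_t

// Hypothetical device kernel: scale each gathered index by the source
// array's element stride, using a grid-stride loop over n elements.
__global__ void index_stride_kernel(size_t *idx, ssize_t s1, uint64_t n)
{
    for (uint64_t i = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += (uint64_t)blockDim.x * gridDim.x) {
        idx[i] = idx[i] * s1;
    }
}

// Hypothetical wrapper matching the forward declaration used in index.c.
extern "C" void cumo_na_index_aref_nadata_index_stride_kernel_launch(
        size_t *idx, ssize_t s1, uint64_t n)
{
    if (n == 0) return;
    const unsigned int block = 128;   // illustrative block size
    const unsigned int grid = (unsigned int)((n + block - 1) / block);
    index_stride_kernel<<<grid, block>>>(idx, s1, n);
}
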
@@ -445,10 +442,15 @@
}
na2->base.size = total;
}
+void cumo_na_index_aref_naview_index_index_kernel_launch(size_t *idx, size_t *idx1, uint64_t n);
+void cumo_na_index_aref_naview_index_stride_last_kernel_launch(size_t *idx, ssize_t s1, size_t last, uint64_t n);
+void cumo_na_index_aref_naview_index_stride_kernel_launch(size_t *idx, ssize_t s1, uint64_t n);
+void cumo_na_index_aref_naview_index_index_beg_step_kernel_launch(size_t *idx, size_t *idx1, size_t beg, ssize_t step, uint64_t n);
+
static void
cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
cumo_na_index_arg_t *q, ssize_t elmsz, int ndim, int keep_dim)
{
int i, j;
@@ -479,67 +481,44 @@
// new dimension
CUMO_SDX_SET_STRIDE(na2->stridx[j], elmsz);
}
else if (q[i].idx != NULL && CUMO_SDX_IS_INDEX(sdx1)) {
// index <- index
- int k;
size_t *index = q[i].idx;
-
- CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
- cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
-
+ size_t *index1 = CUMO_SDX_GET_INDEX(sdx1);
CUMO_SDX_SET_INDEX(na2->stridx[j], index);
q[i].idx = NULL;
-
- for (k=0; k<size; k++) {
- index[k] = CUMO_SDX_GET_INDEX(sdx1)[index[k]];
- }
+ cumo_na_index_aref_naview_index_index_kernel_launch(index, index1, size);
}
else if (q[i].idx != NULL && CUMO_SDX_IS_STRIDE(sdx1)) {
// index <- step
ssize_t stride1 = CUMO_SDX_GET_STRIDE(sdx1);
size_t *index = q[i].idx;
-
- CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
- cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
-
CUMO_SDX_SET_INDEX(na2->stridx[j],index);
q[i].idx = NULL;
if (stride1<0) {
size_t last;
- int k;
stride1 = -stride1;
last = na1->base.shape[q[i].orig_dim] - 1;
if (na2->offset < last * stride1) {
rb_raise(rb_eStandardError,"bug: negative offset");
}
na2->offset -= last * stride1;
- for (k=0; k<size; k++) {
- index[k] = (last - index[k]) * stride1;
- }
+ cumo_na_index_aref_naview_index_stride_last_kernel_launch(index, stride1, last, size);
} else {
- int k;
- for (k=0; k<size; k++) {
- index[k] = index[k] * stride1;
- }
+ cumo_na_index_aref_naview_index_stride_kernel_launch(index, stride1, size);
}
}
else if (q[i].idx == NULL && CUMO_SDX_IS_INDEX(sdx1)) {
// step <- index
- int k;
size_t beg = q[i].beg;
ssize_t step = q[i].step;
// size_t *index = ALLOC_N(size_t, size);
size_t *index = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*size);
+ size_t *index1 = CUMO_SDX_GET_INDEX(sdx1);
CUMO_SDX_SET_INDEX(na2->stridx[j],index);
-
- CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
- cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
-
- for (k=0; k<size; k++) {
- index[k] = CUMO_SDX_GET_INDEX(sdx1)[beg+step*k];
- }
+ cumo_na_index_aref_naview_index_index_beg_step_kernel_launch(index, index1, beg, step, size);
}
else if (q[i].idx == NULL && CUMO_SDX_IS_STRIDE(sdx1)) {
// step <- step
size_t beg = q[i].beg;
ssize_t step = q[i].step;
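
cumo_na_index_aref_naview gets the same treatment: each of the four host loops (remapping through an existing index vector, scaling by a positive or negative stride, and gathering a beg/step range out of an index vector) becomes one of the kernel launches declared just before the function, which removes the remaining device synchronizations on the view-indexing path. A hedged sketch of two of those wrappers, again assuming simple one-thread-per-element kernels and an illustrative launch configuration rather than cumo's actual definitions:

#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>   // ssize_t

// Hypothetical kernel for the "index <- index" case: remap each requested
// position through the index vector already stored on the source view
// (replaces: index[k] = CUMO_SDX_GET_INDEX(sdx1)[index[k]]).
__global__ void index_index_kernel(size_t *idx, const size_t *idx1, uint64_t n)
{
    uint64_t i = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) idx[i] = idx1[idx[i]];
}

// Hypothetical kernel for the "step <- index" case: gather a beg/step range
// out of the source view's index vector
// (replaces: index[k] = CUMO_SDX_GET_INDEX(sdx1)[beg+step*k]).
__global__ void index_index_beg_step_kernel(size_t *idx, const size_t *idx1,
                                            size_t beg, ssize_t step, uint64_t n)
{
    uint64_t i = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) idx[i] = idx1[beg + step * i];
}

extern "C" void cumo_na_index_aref_naview_index_index_kernel_launch(
        size_t *idx, size_t *idx1, uint64_t n)
{
    if (n == 0) return;
    const unsigned int block = 128;   // illustrative block size
    const unsigned int grid = (unsigned int)((n + block - 1) / block);
    index_index_kernel<<<grid, block>>>(idx, idx1, n);
}

extern "C" void cumo_na_index_aref_naview_index_index_beg_step_kernel_launch(
        size_t *idx, size_t *idx1, size_t beg, ssize_t step, uint64_t n)
{
    if (n == 0) return;
    const unsigned int block = 128;   // illustrative block size
    const unsigned int grid = (unsigned int)((n + block - 1) / block);
    index_index_beg_step_kernel<<<grid, block>>>(idx, idx1, beg, step, n);
}

The two stride variants would follow the same shape: the positive-stride wrapper computes idx[i] = idx[i] * s1 per element, and the negative-stride ("last") wrapper computes idx[i] = (last - idx[i]) * s1, mirroring the host loops removed above.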