PTX and SASS (Nvidia)¶
C Kernel to Source Code¶
In [2]:
import numpy as np
import pyopencl as cl
import pyopencl.array as cla
ctx = cl.create_some_context(answers=["nvi", 0])
queue = cl.CommandQueue(ctx)
In [3]:
prg = cl.Program(ctx, """
__kernel void sum(
__global const float *a_g, __global const float *b_g, __global float *res_g)
{
int gid = get_global_id(0);
res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
print(prg.binaries[0].decode())
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35042711
// Unknown Toolkit Version
// Based on NVVM 7.0.1
//
.version 8.2
.target sm_70, texmode_independent
.address_size 64
// .globl sum
.entry sum(
.param .u64 .ptr .global .align 4 sum_param_0,
.param .u64 .ptr .global .align 4 sum_param_1,
.param .u64 .ptr .global .align 4 sum_param_2
)
{
.reg .f32 %f<4>;
.reg .b32 %r<7>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [sum_param_0];
ld.param.u64 %rd2, [sum_param_1];
ld.param.u64 %rd3, [sum_param_2];
mov.b32 %r1, %envreg3;
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %tid.x;
add.s32 %r5, %r4, %r1;
mad.lo.s32 %r6, %r2, %r3, %r5;
mul.wide.s32 %rd4, %r6, 4;
add.s64 %rd5, %rd1, %rd4;
ld.global.f32 %f1, [%rd5];
add.s64 %rd6, %rd2, %rd4;
ld.global.f32 %f2, [%rd6];
add.f32 %f3, %f1, %f2;
add.s64 %rd7, %rd3, %rd4;
st.global.f32 [%rd7], %f3;
ret;
}
Comments:
- Intel or AT&T style?
- Note: address spaces always explicit
- What is
ctaid.x?%ntid.x? - How does parameter passing work?
- Is this the lowest-level abstraction?
In [10]:
!mkdir -p tmp
binary = prg.binaries[0]
with open("tmp/binary.ptx", "wb") as outf:
outf.write(hacked_binary)
!(cd tmp; ptxas --gpu-name sm_70 --verbose binary.ptx -o binary.o)
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function 'sum' for 'sm_70'
ptxas info : Function properties for sum
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 12 registers, 376 bytes cmem[0]
In [11]:
!cuobjdump --dump-sass tmp/binary.o
code for sm_70
Function : sum
.headerflags @"EF_CUDA_TEXMODE_INDEPENDENT EF_CUDA_64BIT_ADDRESS EF_CUDA_SM70 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM70)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ; /* 0x000000fffffff389 */
/* 0x000fe200000e00ff */
/*0020*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */
/* 0x000e220000002100 */
/*0030*/ MOV R7, 0x4 ; /* 0x0000000400077802 */
/* 0x000fc60000000f00 */
/*0040*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */
/* 0x000e620000002500 */
/*0050*/ IADD3 R0, R0, c[0x0][0x94], RZ ; /* 0x0000250000007a10 */
/* 0x001fca0007ffe0ff */
/*0060*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0000000003007a24 */
/* 0x002fc800078e0200 */
/*0070*/ IMAD.WIDE R2, R0, R7, c[0x0][0x160] ; /* 0x0000580000027625 */
/* 0x000fc800078e0207 */
/*0080*/ IMAD.WIDE R4, R0, R7, c[0x0][0x168] ; /* 0x00005a0000047625 */
/* 0x000fc800078e0207 */
/*0090*/ LDG.E.SYS R2, [R2] ; /* 0x0000000002027381 */
/* 0x000ea800001ee900 */
/*00a0*/ LDG.E.SYS R5, [R4] ; /* 0x0000000004057381 */
/* 0x000ea200001ee900 */
/*00b0*/ IMAD.WIDE R6, R0, R7, c[0x0][0x170] ; /* 0x00005c0000067625 */
/* 0x000fc800078e0207 */
/*00c0*/ FADD R9, R2, R5 ; /* 0x0000000502097221 */
/* 0x004fd00000000000 */
/*00d0*/ STG.E.SYS [R6], R9 ; /* 0x0000000906007386 */
/* 0x000fe2000010e900 */
/*00e0*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*00f0*/ BRA 0xf0; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
..........
Is Division Expensive?¶
In [13]:
prg = cl.Program(ctx, """
__kernel void sum(
__global float *a_g, int n)
{
int gid = get_global_id(0);
// try dividing by n
int row = gid / 117;
int col = gid % 117;
a_g[row * 128 + col] *= 2;
// a_g[gid] *= 2;
}
""").build()
binary = prg.binaries[0]
with open("tmp/binary.ptx", "wb") as outf:
outf.write(hacked_binary)
!(cd tmp; ptxas --gpu-name sm_70 --verbose binary.ptx -o binary.o)
!cuobjdump --dump-sass tmp/binary.o | cut -c -80
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function 'sum' for 'sm_70'
ptxas info : Function properties for sum
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 12 registers, 376 bytes cmem[0]
code for sm_70
Function : sum
.headerflags @"EF_CUDA_TEXMODE_INDEPENDENT EF_CUDA_64BIT_ADDRESS EF_CUDA_SM70 E
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x0
/* 0x0
/*0010*/ @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ; /* 0x0
/* 0x0
/*0020*/ S2R R0, SR_TID.X ; /* 0x0
/* 0x0
/*0030*/ MOV R7, 0x4 ; /* 0x0
/* 0x0
/*0040*/ S2R R3, SR_CTAID.X ; /* 0x0
/* 0x0
/*0050*/ IADD3 R0, R0, c[0x0][0x94], RZ ; /* 0x0
/* 0x0
/*0060*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0
/* 0x0
/*0070*/ IMAD.WIDE R2, R0, R7, c[0x0][0x160] ; /* 0x0
/* 0x0
/*0080*/ IMAD.WIDE R4, R0, R7, c[0x0][0x168] ; /* 0x0
/* 0x0
/*0090*/ LDG.E.SYS R2, [R2] ; /* 0x0
/* 0x0
/*00a0*/ LDG.E.SYS R5, [R4] ; /* 0x0
/* 0x0
/*00b0*/ IMAD.WIDE R6, R0, R7, c[0x0][0x170] ; /* 0x0
/* 0x0
/*00c0*/ FADD R9, R2, R5 ; /* 0x0
/* 0x0
/*00d0*/ STG.E.SYS [R6], R9 ; /* 0x0
/* 0x0
/*00e0*/ EXIT ; /* 0x0
/* 0x0
/*00f0*/ BRA 0xf0; /* 0xf
/* 0x0
..........
An Example with Control Flow¶
In [18]:
prg = cl.Program(ctx, """
__kernel void sum(
__global const float *a_g, __global const float *b_g, __global float *res_g, int n)
{
int gsize = get_global_size(0);
for (int i = get_global_id(0); i < n; i += gsize)
res_g[i] = a_g[i] + b_g[i];
res_g[get_global_id(0)] = 15;
}
""").build()
binary = prg.binaries[0]
#print(binary.decode())
with open("tmp/binary.ptx", "wb") as outf:
outf.write(binary)
!(cd tmp; ptxas --gpu-name sm_70 --verbose binary.ptx -o binary.o)
!cuobjdump --dump-sass tmp/binary.o | cut -c -80
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function 'sum' for 'sm_70'
ptxas info : Function properties for sum
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 28 registers, 380 bytes cmem[0]
code for sm_70
Function : sum
.headerflags @"EF_CUDA_TEXMODE_INDEPENDENT EF_CUDA_64BIT_ADDRESS EF_CUDA_SM70 E
/*0000*/ IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;
/*0010*/ @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;
/*0020*/ S2R R0, SR_TID.X ;
/*0030*/ BMOV.32.CLEAR RZ, B0 ;
/*0040*/ BSSY B0, 0x670 ;
/*0050*/ S2R R3, SR_CTAID.X ;
/*0060*/ IADD3 R0, R0, c[0x0][0x94], RZ ;
/*0070*/ IMAD R0, R3, c[0x0][0x0], R0 ;
/*0080*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x178], PT ;
/*0090*/ SHF.R.S32.HI R3, RZ, 0x1f, R0 ;
/*00a0*/ @P0 BRA 0x660 ;
/*00b0*/ IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x0] ;
/*00c0*/ MOV R6, RZ ;
/*00d0*/ IMAD.MOV.U32 R5, RZ, RZ, R0 ;
/*00e0*/ BMOV.32.CLEAR RZ, B1 ;
/*00f0*/ IMAD R2, R2, c[0x0][0xa0], RZ ;
/*0100*/ BSSY B1, 0x440 ;
/*0110*/ LOP3.LUT R4, RZ, R5, RZ, 0x33, !PT ;
/*0120*/ IADD3 R9, RZ, -R2, RZ ;
/*0130*/ I2F.U32.RP R8, R2 ;
/*0140*/ ISETP.NE.U32.AND P2, PT, R2, RZ, PT ;
/*0150*/ MUFU.RCP R8, R8 ;
/*0160*/ IADD3 R10, R8, 0xffffffe, RZ ;
/*0170*/ F2I.FTZ.U32.TRUNC.NTZ R7, R10 ;
/*0180*/ IMAD R9, R9, R7, RZ ;
/*0190*/ IMAD.WIDE.U32 R6, R7, R9, R6 ;
/*01a0*/ IADD3 R9, R4, c[0x0][0x178], RZ ;
/*01b0*/ IMAD.WIDE.U32 R6, R7, R9, RZ ;
/*01c0*/ IMAD.MOV R4, RZ, RZ, -R7 ;
/*01d0*/ IMAD R9, R2, R4, R9 ;
/*01e0*/ ISETP.GE.U32.AND P0, PT, R9, R2, PT ;
/*01f0*/ @P0 IADD3 R9, -R2, R9, RZ ;
/*0200*/ @P0 IADD3 R7, R7, 0x1, RZ ;
/*0210*/ ISETP.GE.U32.AND P1, PT, R9, R2, PT ;
/*0220*/ @P1 IADD3 R7, R7, 0x1, RZ ;
/*0230*/ @!P2 LOP3.LUT R7, RZ, R2, RZ, 0x33, !PT ;
/*0240*/ IADD3 R4, R7.reuse, 0x1, RZ ;
/*0250*/ ISETP.GE.U32.AND P0, PT, R7, 0x3, PT ;
/*0260*/ LOP3.LUT P1, R4, R4, 0x3, RZ, 0xc0, !PT ;
/*0270*/ @!P1 BRA 0x430 ;
/*0280*/ IMAD.SHL.U32 R8, R0.reuse, 0x4, RZ ;
/*0290*/ SHF.L.U64.HI R9, R0, 0x2, R3 ;
/*02a0*/ IADD3 R18, P1, R8.reuse, c[0x0][0x170], RZ ;
/*02b0*/ IADD3 R6, P2, R8.reuse, c[0x0][0x168], RZ ;
/*02c0*/ IADD3 R8, P3, R8, c[0x0][0x160], RZ ;
/*02d0*/ IADD3.X R21, R9.reuse, c[0x0][0x174], RZ, P1,
/*02e0*/ IADD3.X R7, R9.reuse, c[0x0][0x16c], RZ, P2,
/*02f0*/ IADD3.X R9, R9, c[0x0][0x164], RZ, P3, !PT ;
/*0300*/ LDG.E.SYS R10, [R6] ;
/*0310*/ LDG.E.SYS R11, [R8] ;
/*0320*/ MOV R16, R18 ;
/*0330*/ IMAD.MOV.U32 R17, RZ, RZ, R21 ;
/*0340*/ IADD3 R4, R4, -0x1, RZ ;
/*0350*/ IMAD.WIDE R12, R2.reuse, 0x4, R6 ;
/*0360*/ IADD3 R5, R2, R5, RZ ;
/*0370*/ ISETP.NE.AND P1, PT, R4, RZ, PT ;
/*0380*/ IMAD.WIDE R14, R2, 0x4, R8 ;
/*0390*/ MOV R6, R12 ;
/*03a0*/ IMAD.MOV.U32 R7, RZ, RZ, R13 ;
/*03b0*/ MOV R8, R14 ;
/*03c0*/ IMAD.MOV.U32 R9, RZ, RZ, R15 ;
/*03d0*/ FADD R19, R10, R11 ;
/*03e0*/ IMAD.WIDE R10, R2, 0x4, R16 ;
/*03f0*/ IMAD.MOV.U32 R21, RZ, RZ, R11 ;
/*0400*/ MOV R18, R10 ;
/*0410*/ STG.E.SYS [R16], R19 ;
/*0420*/ @P1 BRA 0x300 ;
/*0430*/ BSYNC B1 ;
/*0440*/ @!P0 BRA 0x660 ;
/*0450*/ IMAD.MOV.U32 R10, RZ, RZ, 0x4 ;
/*0460*/ IMAD.WIDE R8, R5, R10, c[0x0][0x168] ;
/*0470*/ IMAD.WIDE R6, R5, R10, c[0x0][0x160] ;
/*0480*/ LDG.E.SYS R4, [R8] ;
/*0490*/ LDG.E.SYS R13, [R6] ;
/*04a0*/ IMAD.WIDE R10, R5, R10, c[0x0][0x170] ;
/*04b0*/ IMAD.WIDE R14, R2, 0x4, R8 ;
/*04c0*/ FADD R21, R4, R13 ;
/*04d0*/ IMAD.WIDE R12, R2, 0x4, R6 ;
/*04e0*/ STG.E.SYS [R10], R21 ;
/*04f0*/ LDG.E.SYS R4, [R14] ;
/*0500*/ LDG.E.SYS R19, [R12] ;
/*0510*/ IMAD.WIDE R16, R2, 0x4, R10 ;
/*0520*/ IMAD.WIDE R8, R2, 0x4, R14 ;
/*0530*/ IMAD.WIDE R6, R2, 0x4, R12 ;
/*0540*/ FADD R23, R4, R19 ;
/*0550*/ STG.E.SYS [R16], R23 ;
/*0560*/ LDG.E.SYS R4, [R8] ;
/*0570*/ LDG.E.SYS R25, [R6] ;
/*0580*/ IMAD.WIDE R18, R2, 0x4, R16 ;
/*0590*/ IMAD.WIDE R14, R2, 0x4, R8 ;
/*05a0*/ IMAD.WIDE R10, R2, 0x4, R6 ;
/*05b0*/ FADD R25, R4, R25 ;
/*05c0*/ STG.E.SYS [R18], R25 ;
/*05d0*/ LDG.E.SYS R14, [R14] ;
/*05e0*/ LDG.E.SYS R11, [R10] ;
/*05f0*/ IMAD.WIDE R12, R2.reuse, 0x4, R18 ;
/*0600*/ IADD3 R5, R2, R5, R2 ;
/*0610*/ IADD3 R5, R2, R5, R2 ;
/*0620*/ ISETP.GE.AND P0, PT, R5, c[0x0][0x178], PT ;
/*0630*/ FADD R17, R14, R11 ;
/*0640*/ STG.E.SYS [R12], R17 ;
/*0650*/ @!P0 BRA 0x450 ;
/*0660*/ BSYNC B0 ;
/*0670*/ LEA R2, P0, R0.reuse, c[0x0][0x170], 0x2 ;
/*0680*/ MOV R5, 0x41700000 ;
/*0690*/ LEA.HI.X R3, R0, c[0x0][0x174], R3, 0x2, P0 ;
/*06a0*/ STG.E.SYS [R2], R5 ;
/*06b0*/ EXIT ;
/*06c0*/ BRA 0x6c0;
/*06d0*/ NOP;
/*06e0*/ NOP;
/*06f0*/ NOP;
..........
- Spot something that doesn't quite seem to belong?
From CUDA¶
Vector add stolen from ORNL.
In [20]:
%%writefile tmp/vector-add.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
__global__ void vecAdd(double *a, double *b, double *c, int n)
{
// Get our global thread ID
int id = blockIdx.x*blockDim.x+threadIdx.x;
// Make sure we do not go out of bounds
if (id < n)
c[id] = a[id] + b[id];
}
int main( int argc, char* argv[] )
{
// Size of vectors
int n = 100000;
double *h_a;
double *h_b;
double *h_c;
double *d_a;
double *d_b;
double *d_c;
size_t bytes = n*sizeof(double);
h_a = (double*)malloc(bytes);
h_b = (double*)malloc(bytes);
h_c = (double*)malloc(bytes);
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);
int i;
for( i = 0; i < n; i++ ) {
h_a[i] = sin(i)*sin(i);
h_b[i] = cos(i)*cos(i);
}
// Copy host vectors to device
cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);
int blockSize, gridSize;
blockSize = 1024;
gridSize = (int)ceil((float)n/blockSize);
vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );
double sum = 0;
for(i=0; i<n; i++)
sum += h_c[i];
printf("final result: %f\n", sum/n);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
free(h_a);
free(h_b);
free(h_c);
return 0;
}
Writing tmp/vector-add.cu
In [21]:
!(cd tmp; nvcc -c vector-add.cu)
!cuobjdump --dump-sass tmp/vector-add.o
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit
code for sm_52
Function : _Z6vecAddPdS_S_i
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM52 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM52)"
/* 0x001cfc00e22007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ S2R R0, SR_CTAID.X ; /* 0xf0c8000002570000 */
/*0018*/ S2R R2, SR_TID.X ; /* 0xf0c8000002170002 */
/* 0x001fd842fec20ff1 */
/*0028*/ XMAD.MRG R3, R0.reuse, c[0x0] [0x8].H1, RZ ; /* 0x4f107f8000270003 */
/*0030*/ XMAD R2, R0.reuse, c[0x0] [0x8], R2 ; /* 0x4e00010000270002 */
/*0038*/ XMAD.PSL.CBCC R0, R0.H1, R3.H1, R2 ; /* 0x5b30011800370000 */
/* 0x001ff400fd4007ed */
/*0048*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x158], PT ; /* 0x4b6d038005670007 */
/*0050*/ NOP ; /* 0x50b0000000070f00 */
/*0058*/ @P0 EXIT ; /* 0xe30000000000000f */
/* 0x081fd800fea207f1 */
/*0068*/ SHL R8, R0.reuse, 0x3 ; /* 0x3848000000370008 */
/*0070*/ SHR R0, R0, 0x1d ; /* 0x3829000001d70000 */
/*0078*/ IADD R4.CC, R8.reuse, c[0x0][0x140] ; /* 0x4c10800005070804 */
/* 0x001fd800fe0207f2 */
/*0088*/ IADD.X R5, R0.reuse, c[0x0][0x144] ; /* 0x4c10080005170005 */
/*0090*/ IADD R6.CC, R8, c[0x0][0x148] ; /* 0x4c10800005270806 */
/*0098*/ LDG.E.64 R4, [R4] ; /* 0xeed5200000070404 */
/* 0x001fd800f62007e2 */
/*00a8*/ IADD.X R7, R0, c[0x0][0x14c] ; /* 0x4c10080005370007 */
/*00b0*/ LDG.E.64 R2, [R6] ; /* 0xeed5200000070602 */
/*00b8*/ IADD R8.CC, R8, c[0x0][0x150] ; /* 0x4c10800005470808 */
/* 0x003fc420e28007f7 */
/*00c8*/ IADD.X R9, R0, c[0x0][0x154] ; /* 0x4c10080005570009 */
/*00d0*/ DADD R2, R2, R4 ; /* 0x5c70000000470202 */
/*00d8*/ STG.E.64 [R8], R2 ; /* 0xeedd200000070802 */
/* 0x001ffc00ffe007e8 */
/*00e8*/ NOP ; /* 0x50b0000000070f00 */
/*00f0*/ EXIT ; /* 0xe30000000007000f */
/*00f8*/ BRA 0xf8 ; /* 0xe2400fffff87000f */
..........
Fatbin ptx code:
================
arch = sm_52
code version = [8,2]
host = linux
compile_size = 64bit
compressed
- What is
_Z6vecAddPdS_S_i?
In [36]:
!echo _Z6vecAddPdS_S_i | c++filt
vecAdd(double*, double*, double*, int)
Inline PTX¶
In [22]:
prg = cl.Program(ctx, """
__kernel void getlaneid(__global int *d_ptr, int length)
{
int elemID = get_global_id(0);
if (elemID < length)
{
unsigned int laneid;
asm("mov.u32 %0, %%laneid;" : "=r"(laneid));
d_ptr[elemID] = laneid;
}
}
""").build()
print(prg.binaries[0].decode())
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35042711
// Unknown Toolkit Version
// Based on NVVM 7.0.1
//
.version 8.2
.target sm_70, texmode_independent
.address_size 64
// .globl getlaneid
.entry getlaneid(
.param .u64 .ptr .global .align 4 getlaneid_param_0,
.param .u32 getlaneid_param_1
)
{
.reg .pred %p<2>;
.reg .b32 %r<9>;
.reg .b64 %rd<4>;
ld.param.u64 %rd1, [getlaneid_param_0];
ld.param.u32 %r2, [getlaneid_param_1];
mov.b32 %r3, %envreg3;
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
add.s32 %r7, %r6, %r3;
mad.lo.s32 %r1, %r5, %r4, %r7;
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra $L__BB0_2;
// begin inline asm
mov.u32 %r8, %laneid;
// end inline asm
mul.wide.s32 %rd2, %r1, 4;
add.s64 %rd3, %rd1, %rd2;
st.global.u32 [%rd3], %r8;
$L__BB0_2:
ret;
}
- What do the constraints mean again?
- Spot the inline assembly
- Observe how the
ifis realized - Observe the realization of
get_global_id()
In [48]:
a = cla.empty(queue, 5000, np.uint32)
prg.getlaneid(queue, lanes.shape, None, a.data, np.uint32(a.size))
Out[48]:
<pyopencl._cl.Event at 0x7fa528c8b360>
In [50]:
a[:500]
Out[50]:
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20, 21, 22, 23, 24, 25], dtype=uint32)
In [ ]: