Cuda学习入门 1.

关于俺的显卡

NVidia NVS 4200M
support opencl, directx11, DirectCompute, OpenGL 2.1
Memory Amount 1GB
CUDA compute capability 2.1

步骤

1. download cuda5 from https://developer.nvidia.com/thrust
2. install
中间有一步安装toolkit失败。。（为啥呀为啥？）

3. 编译例子程序成功，但是运行失败
F:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\CUDA Sam
ples\v5.0\bin\win32\Debug>vectorAdd.exe
[Vector addition of 50000 elements]
Failed to allocate device vector A (error code CUDA driver version is insufficient for CUDA runtime version)! （为啥呀为啥？）

以下是例子程序

/**
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/**
 * Vector addition: C = A + B.
 *
 * This sample is a very basic sample that implements element by element
 * vector addition. It is the same as the sample illustrating Chapter 2
 * of the programming guide with some additions like error checking.
 */

#include <stdio.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

/**
 * CUDA Kernel Device code
 *
 * Computes the vector addition of A and B into C. The 3 vectors have the same
 * number of elements numElements.
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}

/**
 * Host main routine
 */
int
main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host input vector A
    float *h_A = (float *)malloc(size);

    // Allocate the host input vector B
    float *h_B = (float *)malloc(size);

    // Allocate the host output vector C
    float *h_C = (float *)malloc(size);

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    // Allocate the device input vector A
    float *d_A = NULL;
    err = cudaMalloc((void **)&d_A, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device input vector B
    float *d_B = NULL;
    err = cudaMalloc((void **)&d_B, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device output vector C
    float *d_C = NULL;
    err = cudaMalloc((void **)&d_C, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host input vectors A and B in host memory to the device input vectors in
    // device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Verify that the result vector is correct
    for (int i = 0; i < numElements; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    // Free device global memory
    err = cudaFree(d_A);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaFree(d_B);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaFree(d_C);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    // Reset the device and exit
    err = cudaDeviceReset();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}

View Code

4. 安装后，有个DeviceQuery.exe，运行失败

F:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA G
PU Computing SDK 4.0\C\bin\win32\Release>deviceQuery.exe
[deviceQuery.exe] starting...
deviceQuery.exe Starting...

 CUDA Device Query (Runtime API) version (CUDART static linking)

cudaGetDeviceCount returned 35
-> CUDA driver version is insufficient for CUDA runtime version
[deviceQuery.exe] test results...
FAILED

View Code

5. 俺看到还有个oclDeviceQuery.exe，这应该是opencl版本的测试程序。点了一下这个能运行，以下是输出信息。

[oclDeviceQuery.exe] starting...
F:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA G
PU Computing SDK 4.0\OpenCL\Bin\Win32\release\oclDeviceQuery.exe Starting...

OpenCL SW Info:

 CL_PLATFORM_NAME:      NVIDIA CUDA
 CL_PLATFORM_VERSION:   OpenCL 1.0 CUDA 3.2.1
 OpenCL SDK Revision:   7027912


OpenCL Device Info:

 1 devices found supporting OpenCL:

 ---------------------------------
 Device NVS 4200M
 ---------------------------------
  CL_DEVICE_NAME:                       NVS 4200M
  CL_DEVICE_VENDOR:                     NVIDIA Corporation
  CL_DRIVER_VERSION:                    268.24
  CL_DEVICE_VERSION:                    OpenCL 1.0 CUDA
  CL_DEVICE_TYPE:                       CL_DEVICE_TYPE_GPU
  CL_DEVICE_MAX_COMPUTE_UNITS:          1
  CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:   3
  CL_DEVICE_MAX_WORK_ITEM_SIZES:        1024 / 1024 / 64
  CL_DEVICE_MAX_WORK_GROUP_SIZE:        1024
  CL_DEVICE_MAX_CLOCK_FREQUENCY:        1620 MHz
  CL_DEVICE_ADDRESS_BITS:               32
  CL_DEVICE_MAX_MEM_ALLOC_SIZE:         255 MByte
  CL_DEVICE_GLOBAL_MEM_SIZE:            1023 MByte
  CL_DEVICE_ERROR_CORRECTION_SUPPORT:   no
  CL_DEVICE_LOCAL_MEM_TYPE:             local
  CL_DEVICE_LOCAL_MEM_SIZE:             48 KByte
  CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:   64 KByte
  CL_DEVICE_QUEUE_PROPERTIES:           CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
  CL_DEVICE_QUEUE_PROPERTIES:           CL_QUEUE_PROFILING_ENABLE
  CL_DEVICE_IMAGE_SUPPORT:              1
  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
  CL_DEVICE_SINGLE_FP_CONFIG:           denorms INF-quietNaNs round-to-nearest r
ound-to-zero round-to-inf fma

  CL_DEVICE_IMAGE <dim>                 2D_MAX_WIDTH     4096
                                        2D_MAX_HEIGHT    32768
                                        3D_MAX_WIDTH     2048
                                        3D_MAX_HEIGHT    2048
                                        3D_MAX_DEPTH     2048

  CL_DEVICE_EXTENSIONS:                 cl_khr_byte_addressable_store
                                        cl_khr_icd
                                        cl_khr_gl_sharing
                                        cl_nv_d3d9_sharing
                                        cl_nv_compiler_options
                                        cl_nv_device_attribute_query
                                        cl_nv_pragma_unroll
                                        cl_khr_global_int32_base_atomics
                                        cl_khr_global_int32_extended_atomics
                                        cl_khr_local_int32_base_atomics
                                        cl_khr_local_int32_extended_atomics
                                        cl_khr_fp64


  CL_DEVICE_COMPUTE_CAPABILITY_NV:      2.1
  NUMBER OF MULTIPROCESSORS:            1
  NUMBER OF CUDA CORES:                 48
  CL_DEVICE_REGISTERS_PER_BLOCK_NV:     32768
  CL_DEVICE_WARP_SIZE_NV:               32
  CL_DEVICE_GPU_OVERLAP_NV:             CL_TRUE
  CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:     CL_TRUE
  CL_DEVICE_INTEGRATED_MEMORY_NV:       CL_FALSE
  CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>  CHAR 1, SHORT 1, INT 1, LONG 1, FLOAT 1,
 DOUBLE 1


  ---------------------------------
  2D Image Formats Supported (71)
  ---------------------------------
  #     Channel Order   Channel Type

  1     CL_R            CL_FLOAT
  2     CL_R            CL_HALF_FLOAT
  3     CL_R            CL_UNORM_INT8
  4     CL_R            CL_UNORM_INT16
  5     CL_R            CL_SNORM_INT16
  6     CL_R            CL_SIGNED_INT8
  7     CL_R            CL_SIGNED_INT16
  8     CL_R            CL_SIGNED_INT32
  9     CL_R            CL_UNSIGNED_INT8
  10    CL_R            CL_UNSIGNED_INT16
  11    CL_R            CL_UNSIGNED_INT32
  12    CL_A            CL_FLOAT
  13    CL_A            CL_HALF_FLOAT
  14    CL_A            CL_UNORM_INT8
  15    CL_A            CL_UNORM_INT16
  16    CL_A            CL_SNORM_INT16
  17    CL_A            CL_SIGNED_INT8
  18    CL_A            CL_SIGNED_INT16
  19    CL_A            CL_SIGNED_INT32
  20    CL_A            CL_UNSIGNED_INT8
  21    CL_A            CL_UNSIGNED_INT16
  22    CL_A            CL_UNSIGNED_INT32
  23    CL_RG           CL_FLOAT
  24    CL_RG           CL_HALF_FLOAT
  25    CL_RG           CL_UNORM_INT8
  26    CL_RG           CL_UNORM_INT16
  27    CL_RG           CL_SNORM_INT16
  28    CL_RG           CL_SIGNED_INT8
  29    CL_RG           CL_SIGNED_INT16
  30    CL_RG           CL_SIGNED_INT32
  31    CL_RG           CL_UNSIGNED_INT8
  32    CL_RG           CL_UNSIGNED_INT16
  33    CL_RG           CL_UNSIGNED_INT32
  34    CL_RA           CL_FLOAT
  35    CL_RA           CL_HALF_FLOAT
  36    CL_RA           CL_UNORM_INT8
  37    CL_RA           CL_UNORM_INT16
  38    CL_RA           CL_SNORM_INT16
  39    CL_RA           CL_SIGNED_INT8
  40    CL_RA           CL_SIGNED_INT16
  41    CL_RA           CL_SIGNED_INT32
  42    CL_RA           CL_UNSIGNED_INT8
  43    CL_RA           CL_UNSIGNED_INT16
  44    CL_RA           CL_UNSIGNED_INT32
  45    CL_RGBA         CL_FLOAT
  46    CL_RGBA         CL_HALF_FLOAT
  47    CL_RGBA         CL_UNORM_INT8
  48    CL_RGBA         CL_UNORM_INT16
  49    CL_RGBA         CL_SNORM_INT16
  50    CL_RGBA         CL_SIGNED_INT8
  51    CL_RGBA         CL_SIGNED_INT16
  52    CL_RGBA         CL_SIGNED_INT32
  53    CL_RGBA         CL_UNSIGNED_INT8
  54    CL_RGBA         CL_UNSIGNED_INT16
  55    CL_RGBA         CL_UNSIGNED_INT32
  56    CL_BGRA         CL_UNORM_INT8
  57    CL_BGRA         CL_SIGNED_INT8
  58    CL_BGRA         CL_UNSIGNED_INT8
  59    CL_ARGB         CL_UNORM_INT8
  60    CL_ARGB         CL_SIGNED_INT8
  61    CL_ARGB         CL_UNSIGNED_INT8
  62    CL_INTENSITY    CL_FLOAT
  63    CL_INTENSITY    CL_HALF_FLOAT
  64    CL_INTENSITY    CL_UNORM_INT8
  65    CL_INTENSITY    CL_UNORM_INT16
  66    CL_INTENSITY    CL_SNORM_INT16
  67    CL_LUMINANCE    CL_FLOAT
  68    CL_LUMINANCE    CL_HALF_FLOAT
  69    CL_LUMINANCE    CL_UNORM_INT8
  70    CL_LUMINANCE    CL_UNORM_INT16
  71    CL_LUMINANCE    CL_SNORM_INT16

  ---------------------------------
  3D Image Formats Supported (71)
  ---------------------------------
  #     Channel Order   Channel Type

  1     CL_R            CL_FLOAT
  2     CL_R            CL_HALF_FLOAT
  3     CL_R            CL_UNORM_INT8
  4     CL_R            CL_UNORM_INT16
  5     CL_R            CL_SNORM_INT16
  6     CL_R            CL_SIGNED_INT8
  7     CL_R            CL_SIGNED_INT16
  8     CL_R            CL_SIGNED_INT32
  9     CL_R            CL_UNSIGNED_INT8
  10    CL_R            CL_UNSIGNED_INT16
  11    CL_R            CL_UNSIGNED_INT32
  12    CL_A            CL_FLOAT
  13    CL_A            CL_HALF_FLOAT
  14    CL_A            CL_UNORM_INT8
  15    CL_A            CL_UNORM_INT16
  16    CL_A            CL_SNORM_INT16
  17    CL_A            CL_SIGNED_INT8
  18    CL_A            CL_SIGNED_INT16
  19    CL_A            CL_SIGNED_INT32
  20    CL_A            CL_UNSIGNED_INT8
  21    CL_A            CL_UNSIGNED_INT16
  22    CL_A            CL_UNSIGNED_INT32
  23    CL_RG           CL_FLOAT
  24    CL_RG           CL_HALF_FLOAT
  25    CL_RG           CL_UNORM_INT8
  26    CL_RG           CL_UNORM_INT16
  27    CL_RG           CL_SNORM_INT16
  28    CL_RG           CL_SIGNED_INT8
  29    CL_RG           CL_SIGNED_INT16
  30    CL_RG           CL_SIGNED_INT32
  31    CL_RG           CL_UNSIGNED_INT8
  32    CL_RG           CL_UNSIGNED_INT16
  33    CL_RG           CL_UNSIGNED_INT32
  34    CL_RA           CL_FLOAT
  35    CL_RA           CL_HALF_FLOAT
  36    CL_RA           CL_UNORM_INT8
  37    CL_RA           CL_UNORM_INT16
  38    CL_RA           CL_SNORM_INT16
  39    CL_RA           CL_SIGNED_INT8
  40    CL_RA           CL_SIGNED_INT16
  41    CL_RA           CL_SIGNED_INT32
  42    CL_RA           CL_UNSIGNED_INT8
  43    CL_RA           CL_UNSIGNED_INT16
  44    CL_RA           CL_UNSIGNED_INT32
  45    CL_RGBA         CL_FLOAT
  46    CL_RGBA         CL_HALF_FLOAT
  47    CL_RGBA         CL_UNORM_INT8
  48    CL_RGBA         CL_UNORM_INT16
  49    CL_RGBA         CL_SNORM_INT16
  50    CL_RGBA         CL_SIGNED_INT8
  51    CL_RGBA         CL_SIGNED_INT16
  52    CL_RGBA         CL_SIGNED_INT32
  53    CL_RGBA         CL_UNSIGNED_INT8
  54    CL_RGBA         CL_UNSIGNED_INT16
  55    CL_RGBA         CL_UNSIGNED_INT32
  56    CL_BGRA         CL_UNORM_INT8
  57    CL_BGRA         CL_SIGNED_INT8
  58    CL_BGRA         CL_UNSIGNED_INT8
  59    CL_ARGB         CL_UNORM_INT8
  60    CL_ARGB         CL_SIGNED_INT8
  61    CL_ARGB         CL_UNSIGNED_INT8
  62    CL_INTENSITY    CL_FLOAT
  63    CL_INTENSITY    CL_HALF_FLOAT
  64    CL_INTENSITY    CL_UNORM_INT8
  65    CL_INTENSITY    CL_UNORM_INT16
  66    CL_INTENSITY    CL_SNORM_INT16
  67    CL_LUMINANCE    CL_FLOAT
  68    CL_LUMINANCE    CL_HALF_FLOAT
  69    CL_LUMINANCE    CL_UNORM_INT8
  70    CL_LUMINANCE    CL_UNORM_INT16
  71    CL_LUMINANCE    CL_SNORM_INT16

oclDeviceQuery, Platform Name = NVIDIA CUDA, Platform Version = OpenCL 1.0 CUDA
3.2.1, SDK Revision = 7027912, NumDevs = 1, Device = NVS 4200M

System Info:

 Local Time/Date = 13:19:53, 5/25/2013
 CPU Arch: 0
 CPU Level: 6
 # of CPU processors: 4
 Windows Build: 2600
 Windows Ver: 5.1


[oclDeviceQuery.exe] test results...
PASSED

Press ENTER to exit...

View Code

6. opencl的带宽测试程序

PU Computing SDK 4.0\OpenCL\Bin\Win32\release\oclBandwidthTest.exe Starting...

Running on...

NVS 4200M

Quick Mode

Host to Device Bandwidth, 1 Device(s), Paged memory, direct access
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     4131.2

Device to Host Bandwidth, 1 Device(s), Paged memory, direct access
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     3485.6

Device to Device Bandwidth, 1 Device(s)
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     8901.4

[oclBandwidthTest.exe] test results...
PASSED

View Code

7 关于安装driver失败，在stackoverflow上有个人说

http://stackoverflow.com/questions/11913320/installing-cuda-nvidia-graphic-driver-failed
I have a VAIO too and I had the same problem. Don't download notebook version, try Desktop version of Nvidia Driver. I also had to disable my another Graphic card (Intel). It worked for me.

View Code

不过也有人说要修改inf文件才行

Unfortunately, there are many NVIDIA GPUs for which the driver from the NVIDIA website will not install (especially for GPU versions that are specifically OEM'd for Sony, Lenovo, etc and the OEM wants to control the driver experience). This is most likely the case for you.

In those cases, you can edit the .inf file to add your GPU into the list of GPUs for which the driver will install. However, it is a bit tricky and typically requires editing 3 different sections of the INF file. You can search around for details on how to mod NVIDIA inf files; there are a number of sites that do that.

Of course, you have to have the appropriate CUDA driver before you can run CUDA stuff. So first things first... you've gotta get the driver installed.

View Code

这些俺暂时没有测试过是否有效

8. 既然cuda用不了，而opencl貌似可以

那俺还是转移到opencl上吧，首先测试一个例子 http://www.kimicat.com/opencl-1/opencl-jiao-xue-yi

// OpenCL tutorial 1

#include <iostream>
#include <string>
#include <vector>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif


int main()
{

    cl_int err;
    cl_uint num;
    err = clGetPlatformIDs(0, 0, &num);
    if(err != CL_SUCCESS) {

        std::cerr << "Unable to get platforms\n";

        return 0;

    }

    std::vector<cl_platform_id> platforms(num);
    err = clGetPlatformIDs(num, &platforms[0], &num);
    if(err != CL_SUCCESS) {

        std::cerr << "Unable to get platform ID\n";

        return 0;

    }

    cl_context_properties prop[] = { CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platforms[0]), 0 };

    cl_context context = clCreateContextFromType(prop, CL_DEVICE_TYPE_DEFAULT, NULL, NULL, NULL);

    if(context == 0) {

        std::cerr << "Can't create OpenCL context\n";

        return 0;

    }


    size_t cb;

    clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb);

    std::vector<cl_device_id> devices(cb / sizeof(cl_device_id));

    clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, &devices[0], 0);


    clGetDeviceInfo(devices[0], CL_DEVICE_NAME, 0, NULL, &cb);

    std::string devname;

    devname.resize(cb);

    clGetDeviceInfo(devices[0], CL_DEVICE_NAME, cb, &devname[0], 0);

    std::cout << "Device: " << devname.c_str() << "\n";


    clReleaseContext(context);

    return 0;

}

View Code

注意设置好路径

F:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA GPU Computing SDK 4.0\OpenCL\common\inc

F:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA GPU Computing SDK 4.0\OpenCL\common\lib\Win32

正常编译运行成功！

Device: NVS 4200M
请按任意键继续. . .

View Code

9. 注：后来俺在bios里面把intergrated GPU disable后，成功安装307.83-quadro-notebook-winxp-32bit-international-whql.exe
安装cuda_5.0.35_winxp_general_32-3.msi还是有错误
但是执行cuda程序貌似都正常了

以下是bandwidthTest.exe测试结果，比opencl版本的快了很多

[CUDA Bandwidth Test] - Starting...
Running on...

 Device 0: NVS 4200M
 Quick Mode

 Host to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     6241.5

 Device to Host Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     6302.9

 Device to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     10330.3

View Code

devicequery结果

ples\v5.0\bin\win32\Release\deviceQuery.exe Starting...

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "NVS 4200M"
  CUDA Driver Version / Runtime Version          5.0 / 5.0
  CUDA Capability Major/Minor version number:    2.1
  Total amount of global memory:                 1024 MBytes (1073283072 bytes)
  ( 1) Multiprocessors x ( 48) CUDA Cores/MP:    48 CUDA Cores
  GPU Clock rate:                                1620 MHz (1.62 GHz)
  Memory Clock rate:                             800 Mhz
  Memory Bus Width:                              64-bit
  L2 Cache Size:                                 65536 bytes
  Max Texture Dimension Size (x,y,z)             1D=(65536), 2D=(65536,65535), 3
D=(2048,2048,2048)
  Max Layered Texture Size (dim) x layers        1D=(16384) x 2048, 2D=(16384,16
384) x 2048
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 32768
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1536
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     65535 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Mo
del)
  Device supports Unified Addressing (UVA):      No
  Device PCI Bus ID / PCI location ID:           1 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simu
ltaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 5.0, CUDA Runtime Versi
on = 5.0, NumDevs = 1, Device0 = NVS 4200M

View Code

Published on May 25, 2013 in categories