code/code/gpukernel.py at master · rmlopes/code

151 lines (123 loc) · 4.72 KB
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
import random
from functools import partial
def getkernel(blocksize = 16,wb = 17):
    mod = SourceModule(
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation and
 * any modifications thereto.  Any use, reproduction, disclosure, or distribution
 * of this software and related documentation without an express license
 * agreement from NVIDIA Corporation is strictly prohibited.
/* Matrix multiplication: C = A * B.
 * Device code.
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#include <stdio.h>
        // Thread block size
#define BLOCK_SIZE %i
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA BLOCK_SIZE //(3 * BLOCK_SIZE) // Matrix A width
#define HA 1 //(5 * BLOCK_SIZE) // Matrix A height
#define WB %i //(8 * BLOCK_SIZE) // Matrix B width
#define HB WA  // Matrix B height
#define WC WB  // Matrix C width
#define HC HA  // Matrix C height
#define CHECK_BANK_CONFLICTS 0
#if CHECK_BANK_CONFLICTS
#define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
#define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]
////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! wA is A's width and wB is B's width
////////////////////////////////////////////////////////////////////////////////
__global__ void
matrixMul( float* C, float* A, float* B, int wA, int wB)
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;
    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + wA - 1;
    // Step size used to iterate through the sub-matrices of A
    int aStep  = BLOCK_SIZE;
    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;
    // Step size used to iterate through the sub-matrices of B
    int bStep  = BLOCK_SIZE * wB;
    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;
    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + wA * ty + tx];
        BS(ty, tx) = B[b + wB * ty + tx];
        // Synchronize to make sure the matrices are loaded
        __syncthreads();
        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(ty, k) * BS(k, tx);
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
#endif // #ifndef _MATRIXMUL_KERNEL_H_
        """%(blocksize, wb))
    BLOCK_SIZE = (blocksize,blocksize,1)
    GRID_SIZE = (wb,1)
    return partial(mod.get_function("matrixMul"),
                   block = BLOCK_SIZE,
                   grid = GRID_SIZE)
if __name__ == '__main__':
    SIZEA = 22 #maximum by trial and error
    SIZEB = 256
    a = np.array([random.random() for i in range(SIZEA*2)], dtype = np.float32)
    b = np.array([[random.random() for i in range(SIZEB)]
                  for j in range(SIZEA*2)], dtype = np.float32)
    dest = np.zeros_like(b[0])
    gpu_a = gpuarray.to_gpu(a)
    gpu_b = gpuarray.to_gpu(b)
    gpudest = gpuarray.to_gpu(dest)
    fun = getkernel( SIZEA, SIZEB )
    fun( gpudest, gpu_a, gpu_b, np.int32(SIZEA*2), np.int32(SIZEB))
    print (gpudest.get() - np.dot(a,b))[::7]
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

gpukernel.py

Latest commit

History

gpukernel.py

File metadata and controls