==== 2013-05-14/simpson.cu ====

#include <cmath>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <sstream>

using namespace std;

typedef double Real;

/* return unique id within a block */
__device__ int get_threadid() {
   return threadIdx.z * blockDim.x * blockDim.y +
      threadIdx.y * blockDim.x + threadIdx.x;
}

/* return block id a thread is associated to */
__device__ int get_blockid() {
   return blockIdx.x + blockIdx.y * gridDim.x;
}

/* return number of threads per block */
__device__ int get_nofthreads_per_block() {
   return blockDim.x * blockDim.y * blockDim.z;
}

/* return number of blocks */
__device__ int get_nofblocks() {
   return gridDim.x * gridDim.y;
}

/* return total number of threads */
__device__ int get_nofthreads() {
   return get_nofthreads_per_block() * get_nofblocks();
}

/* return id which is unique throughout all threads */
__device__ int get_id() {
   return get_blockid() * blockDim.x * blockDim.y * blockDim.z +
      get_threadid();
}

int max_threads_per_block() {
   int device;
   cudaGetDevice(&device);
   struct cudaDeviceProp device_prop;
   cudaGetDeviceProperties(&device_prop, device);
   return device_prop.maxThreadsPerBlock;
}

// function to be integrated
__device__ Real f(Real x) {
   return 4 / (1 + x*x);
}

// numerical integration according to the Simpson rule
// for f over the interval [a,b]
__global__ void simpson(Real a, Real b, Real* sums) {
   const int N = get_nofthreads();
   const int i = get_id();
   Real xleft = a + (b - a) / N * i;
   Real xright = xleft + (b - a) / N;
   Real xmid = (xleft + xright) / 2;
   sums[i] = (xright - xleft) / 6 * (f(xleft) + 4 * f(xmid) + f(xright));
}

char* cmdname;

void usage() {
   cerr << "Usage: " << cmdname << " [# intervals]" << endl;
   exit(1);
}

int main(int argc, char** argv) {
   const Real a = 0;
   const Real b = 1;
   int N = 8192;

   cmdname = *argv++; --argc;
   if (argc > 0) {
      istringstream arg(*argv++); --argc;
      if (!(arg >> N) || N <= 0) usage();
   }
   if (argc > 0) usage();

   int blocksize = max_threads_per_block();
   if (blocksize > N) {
      blocksize = N;
   } else {
      if (N % blocksize != 0) {
         cerr << cmdname << ": please select a multiple of "
            << blocksize << endl;
         exit(1);
      }
   }
   int nof_blocks = N / blocksize;
   dim3 blockdim(blocksize, 1, 1);
   dim3 griddim(nof_blocks, 1);

   Real sums[N]; // variable-length array; relies on a g++/nvcc extension
   Real* cuda_sums;
   cudaMalloc((void**)&cuda_sums, N * sizeof(Real));
   simpson<<<griddim, blockdim>>>(a, b, cuda_sums);
   cudaMemcpy(sums, cuda_sums, N * sizeof(Real), cudaMemcpyDeviceToHost);
   cudaFree(cuda_sums);

   double sum = 0;
   for (int i = 0; i < N; ++i) {
      sum += sums[i];
   }
   cout << setprecision(14) << sum << endl;
   cout << setprecision(14) << M_PI << endl;
}
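The GPU sum can be cross-checked on the host. The following sketch is not part of the original simpson.cu: it evaluates the same composite Simpson rule sequentially for the same integrand 4/(1+x^2) over [0,1], whose exact integral is pi. The names f_host and simpson_host are mine.

// host-only cross-check for simpson.cu (assumes the same integrand and interval)
#include <cmath>
#include <iostream>

double f_host(double x) { return 4 / (1 + x*x); }

// composite Simpson rule over [a,b] with N subintervals
double simpson_host(double a, double b, int N) {
   double h = (b - a) / N;
   double sum = 0;
   for (int i = 0; i < N; ++i) {
      double xleft = a + i * h;
      double xright = xleft + h;
      double xmid = (xleft + xright) / 2;
      sum += h / 6 * (f_host(xleft) + 4 * f_host(xmid) + f_host(xright));
   }
   return sum;
}

int main() {
   std::cout.precision(14);
   std::cout << simpson_host(0, 1, 8192) << std::endl; // should be close to pi
   std::cout << M_PI << std::endl;
}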
==== 2013-05-14/jacobi.cu ====

#include <cmath>
#include <cstdlib>
#include <iostream>
#include <sstream>

using namespace std;

// M_E and M_PI are not part of ISO C++
#ifndef M_E
#define M_E 2.7182818284590452354
#endif
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

typedef double Real;

#define BLOCK_SIZE 16
typedef Real Matrix[BLOCK_SIZE+2][BLOCK_SIZE+2];

__global__ void jacobi(Matrix A, int nofiterations) {
   int i = threadIdx.x + 1;
   int j = threadIdx.y + 1;
   for (int it = 0; it < nofiterations; ++it) {
      Real Aij = 0.25 * (A[i-1][j] + A[i][j-1] + A[i][j+1] + A[i+1][j]);
      __syncthreads();
      A[i][j] = Aij;
      __syncthreads();
   }
}

void initialize_A(Matrix A) {
   const unsigned int N = BLOCK_SIZE + 1;
   const static Real E_POWER_MINUS_PI = powf(M_E, -M_PI);
   for (int i = 0; i <= N; ++i) {
      for (int j = 0; j <= N; ++j) {
         if (j == 0) {
            A[i][j] = sinf(M_PI * ((Real)i/N));
         } else if (j == N) {
            A[i][j] = sinf(M_PI * ((Real)i/N)) * E_POWER_MINUS_PI;
         } else {
            A[i][j] = 0;
         }
      }
   }
}

char* cmdname;

void usage() {
   cerr << "Usage: " << cmdname << " [# iterations]" << endl;
   exit(1);
}

int main(int argc, char** argv) {
   int nofiterations = 10000;
   cmdname = *argv++; --argc;
   if (argc > 0) {
      istringstream arg(*argv++); --argc;
      if (!(arg >> nofiterations) || nofiterations <= 0) usage();
   }
   if (argc > 0) usage();

   Matrix A;
   initialize_A(A);

   Matrix* cuda_A;
   cudaMalloc((void**)&cuda_A, sizeof(Matrix));
   cudaMemcpy(cuda_A, A, sizeof(Matrix), cudaMemcpyHostToDevice);

   dim3 block(BLOCK_SIZE, BLOCK_SIZE);
   jacobi<<<1, block>>>(*cuda_A, nofiterations);

   cudaMemcpy(A, cuda_A, sizeof(Matrix), cudaMemcpyDeviceToHost);
   cudaFree(cuda_A);

   cout << BLOCK_SIZE + 2 << endl;
   for (int i = 0; i < BLOCK_SIZE + 2; ++i) {
      for (int j = 0; j < BLOCK_SIZE + 2; ++j) {
         cout << " " << A[i][j];
      }
      cout << endl;
   }
}
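For reference, the same Jacobi sweep written sequentially (not part of the original jacobi.cu): it makes explicit what the two __syncthreads() calls in the kernel guarantee, namely that every new value is computed from the old grid before any cell is overwritten. Grid size and boundary values are assumed to match initialize_A() above; jacobi_host is my own name for this drop-in reference function.

// sequential reference for the jacobi kernel (assumes the 18x18 grid above)
#include <cstring>

#define BLOCK_SIZE 16
typedef double Real;
typedef Real Matrix[BLOCK_SIZE+2][BLOCK_SIZE+2];

void jacobi_host(Matrix A, int nofiterations) {
   Matrix old_A;
   for (int it = 0; it < nofiterations; ++it) {
      std::memcpy(old_A, A, sizeof(Matrix)); // snapshot of the "old" grid
      for (int i = 1; i <= BLOCK_SIZE; ++i) {
         for (int j = 1; j <= BLOCK_SIZE; ++j) {
            A[i][j] = 0.25 * (old_A[i-1][j] + old_A[i][j-1] +
                              old_A[i][j+1] + old_A[i+1][j]);
         }
      }
   }
}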
==== 2013-05-14/mmm.cu ====

#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sys/time.h>

using namespace std;

typedef double Real;

#define BLOCK_SIZE 16

struct Matrix {
   unsigned int N;
   Real* data;
   bool cuda_allocated;
   Real* cuda_data;

   Matrix() : N(0), data(0), cuda_allocated(false), cuda_data(0) {
   }
   ~Matrix() {
      if (data) delete[] data;
      if (cuda_allocated) release_cuda_data();
   }
   bool copy_to_gpu() {
      if (!cuda_allocated) {
         if (!allocate_cuda_data()) return false;
      }
      return cudaMemcpy(cuda_data, data, N * N * sizeof(Real),
         cudaMemcpyHostToDevice) == cudaSuccess;
   }
   bool copy_from_gpu() {
      assert(cuda_allocated);
      return cudaMemcpy(data, cuda_data, N * N * sizeof(Real),
         cudaMemcpyDeviceToHost) == cudaSuccess;
   }
   bool allocate_cuda_data() {
      if (cuda_allocated) return true;
      Real* cudap;
      if (cudaMalloc((void**)&cudap, N * N * sizeof(Real)) != cudaSuccess) {
         return false;
      }
      cuda_data = cudap;
      cuda_allocated = true;
      return true;
   }
   void release_cuda_data() {
      if (cuda_data) {
         cudaFree(cuda_data);
         cuda_data = 0;
      }
   }
   bool resize(unsigned int N_) {
      if (N == N_) return true;
      Real* rp = new Real[N_ * N_];
      if (!rp) return false;
      if (data) delete[] data;
      release_cuda_data();
      data = rp; N = N_;
      return true;
   }
   Real& operator()(unsigned int i, unsigned int j) {
      return data[i*N + j];
   }
   const Real& operator()(unsigned int i, unsigned int j) const {
      return data[i*N + j];
   }
};

// input operator for square matrices that expects
// an input sequence consisting of N, followed by N*N
// elements of the matrix in row-major order
istream& operator>>(istream& in, Matrix& matrix) {
   unsigned int N;
   if (!(in >> N)) return in;
   if (N == 0 || !matrix.resize(N)) {
      in.setstate(ios::failbit);
      return in;
   }
   matrix.N = N;
   for (int i = 0; i < N; ++i) {
      for (int j = 0; j < N; ++j) {
         if (!(in >> matrix(i,j))) return in;
      }
   }
   return in;
}

ostream& operator<<(ostream& out, Matrix& matrix) {
   unsigned int N = matrix.N;
   for (int i = 0; i < N; ++i) {
      for (int j = 0; j < N; ++j) {
         if (!(out << " " << matrix(i, j))) return out;
      }
      out << endl;
   }
   return out;
}

// fetch a matrix from an input file
bool read_matrix(char* filename, Matrix& matrix) {
   fstream in(filename);
   return in >> matrix;
}

#define ELEMENT(m,i,j) ((m)[(i) * stride + (j)])

// matrix-matrix multiplication c = a * b using shared-memory tiles
// of size BLOCK_SIZE x BLOCK_SIZE
__global__ void mmm(Real* a, Real* b, Real* c) {
   __shared__ Real ablock[BLOCK_SIZE][BLOCK_SIZE];
   __shared__ Real bblock[BLOCK_SIZE][BLOCK_SIZE];
   unsigned int stride = gridDim.y * BLOCK_SIZE;
   unsigned int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
   unsigned int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
   Real sum = 0;
   for (int round = 0; round < gridDim.y; ++round) {
      ablock[threadIdx.y][threadIdx.x] =
         ELEMENT(a, row, round*BLOCK_SIZE + threadIdx.x);
      bblock[threadIdx.y][threadIdx.x] =
         ELEMENT(b, round*BLOCK_SIZE + threadIdx.y, col);
      __syncthreads();
#pragma unroll
      for (int k = 0; k < BLOCK_SIZE; ++k) {
         sum += ablock[threadIdx.y][k] * bblock[k][threadIdx.x];
      }
      __syncthreads();
   }
   ELEMENT(c, row, col) = sum;
}

char* cmdname;

void usage() {
   cerr << "Usage: " << cmdname << " A_file B_file" << endl;
   exit(1);
}

int main(int argc, char** argv) {
   cmdname = *argv++; --argc;
   if (argc != 2) usage();
   Matrix A;
   if (!read_matrix(*argv++, A)) usage();
   --argc;
   Matrix B;
   if (!read_matrix(*argv++, B)) usage();
   --argc;

   cout << "A = " << endl << A << endl;
   cout << "B = " << endl << B << endl;

   if (A.N != B.N) {
      cerr << cmdname << ": sizes of the matrices do not match" << endl;
      exit(1);
   }
   if (A.N % BLOCK_SIZE) {
      cerr << cmdname << ": size of matrices is not a multiple of "
         << BLOCK_SIZE << endl;
      exit(1);
   }

   A.copy_to_gpu();
   B.copy_to_gpu();
   Matrix C;
   C.resize(A.N);
   C.allocate_cuda_data();

   dim3 block(BLOCK_SIZE, BLOCK_SIZE);
   dim3 grid(A.N / BLOCK_SIZE, A.N / BLOCK_SIZE);

   struct timeval tbuf1;
   gettimeofday(&tbuf1, 0);
   mmm<<<grid, block>>>(A.cuda_data, B.cuda_data, C.cuda_data);
   cudaDeviceSynchronize(); // kernel launches are asynchronous; wait for completion
   struct timeval tbuf2;
   gettimeofday(&tbuf2, 0);

   long int usec = (tbuf2.tv_sec - tbuf1.tv_sec) * 1000000 +
      (tbuf2.tv_usec - tbuf1.tv_usec);
   double timeInMillisecs = (double) usec / (double) 1000;
   cerr << "GPU time in ms: " << timeInMillisecs << endl;

   C.copy_from_gpu();
   cout << "C = " << endl << setprecision(14) << C << endl;
}
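mmm.cu (and mmm-ab.cu below) read their operands via operator>>, i.e. N followed by N*N values in row-major order, with N a multiple of BLOCK_SIZE. A small generator along the following lines produces suitable input files; it is not part of the original sources, and the invocation shown in the comment is only a suggestion.

// writes a random N x N matrix in the format expected by operator>> above
// (e.g. ./gen_matrix 64 > A_file, where gen_matrix is my own name)
#include <cstdlib>
#include <iostream>

int main(int argc, char** argv) {
   unsigned int N = 16; // should be a multiple of BLOCK_SIZE (16)
   if (argc > 1) N = std::atoi(argv[1]);
   std::cout << N << std::endl;
   for (unsigned int i = 0; i < N; ++i) {
      for (unsigned int j = 0; j < N; ++j) {
         std::cout << " " << (double) std::rand() / RAND_MAX;
      }
      std::cout << std::endl;
   }
}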
==== 2013-05-14/mmm-ab.cu ====

#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sys/time.h>

using namespace std;

typedef double Real;

#define BLOCK_SIZE 16

struct Matrix {
   unsigned int N;
   Real* data;
   bool cuda_allocated;
   Real* cuda_data;

   Matrix() : N(0), data(0), cuda_allocated(false), cuda_data(0) {
   }
   ~Matrix() {
      if (data) delete[] data;
      if (cuda_allocated) release_cuda_data();
   }
   bool copy_to_gpu() {
      if (!cuda_allocated) {
         if (!allocate_cuda_data()) return false;
      }
      return cudaMemcpy(cuda_data, data, N * N * sizeof(Real),
         cudaMemcpyHostToDevice) == cudaSuccess;
   }
   bool copy_from_gpu() {
      assert(cuda_allocated);
      return cudaMemcpy(data, cuda_data, N * N * sizeof(Real),
         cudaMemcpyDeviceToHost) == cudaSuccess;
   }
   bool allocate_cuda_data() {
      if (cuda_allocated) return true;
      Real* cudap;
      if (cudaMalloc((void**)&cudap, N * N * sizeof(Real)) != cudaSuccess) {
         return false;
      }
      cuda_data = cudap;
      cuda_allocated = true;
      return true;
   }
   void release_cuda_data() {
      if (cuda_data) {
         cudaFree(cuda_data);
         cuda_data = 0;
      }
   }
   bool resize(unsigned int N_) {
      if (N == N_) return true;
      Real* rp = new Real[N_ * N_];
      if (!rp) return false;
      if (data) delete[] data;
      release_cuda_data();
      data = rp; N = N_;
      return true;
   }
   Real& operator()(unsigned int i, unsigned int j) {
      return data[i*N + j];
   }
   const Real& operator()(unsigned int i, unsigned int j) const {
      return data[i*N + j];
   }
};

// input operator for square matrices that expects
// an input sequence consisting of N, followed by N*N
// elements of the matrix in row-major order
istream& operator>>(istream& in, Matrix& matrix) {
   unsigned int N;
   if (!(in >> N)) return in;
   if (N == 0 || !matrix.resize(N)) {
      in.setstate(ios::failbit);
      return in;
   }
   matrix.N = N;
   for (int i = 0; i < N; ++i) {
      for (int j = 0; j < N; ++j) {
         if (!(in >> matrix(i,j))) return in;
      }
   }
   return in;
}

ostream& operator<<(ostream& out, Matrix& matrix) {
   unsigned int N = matrix.N;
   for (int i = 0; i < N; ++i) {
      for (int j = 0; j < N; ++j) {
         if (!(out << " " << matrix(i, j))) return out;
      }
      out << endl;
   }
   return out;
}

// fetch a matrix from an input file
bool read_matrix(char* filename, Matrix& matrix) {
   fstream in(filename);
   return in >> matrix;
}

#define ELEMENT(m,i,j) ((m)[(i) * stride + (j)])

// naive matrix-matrix multiplication c = a * b where every operand
// element is read directly from global memory
__global__ void mmm(Real* a, Real* b, Real* c) {
   unsigned int stride = gridDim.y * BLOCK_SIZE;
   unsigned int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
   unsigned int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
   Real sum = 0;
   for (int k = 0; k < BLOCK_SIZE * gridDim.y; ++k) {
      sum += ELEMENT(a, row, k) * ELEMENT(b, k, col);
   }
   ELEMENT(c, row, col) = sum;
}

char* cmdname;

void usage() {
   cerr << "Usage: " << cmdname << " A_file B_file" << endl;
   exit(1);
}

int main(int argc, char** argv) {
   cmdname = *argv++; --argc;
   if (argc != 2) usage();
   Matrix A;
   if (!read_matrix(*argv++, A)) usage();
   --argc;
   Matrix B;
   if (!read_matrix(*argv++, B)) usage();
   --argc;

   cout << "A = " << endl << A << endl;
   cout << "B = " << endl << B << endl;

   if (A.N != B.N) {
      cerr << cmdname << ": sizes of the matrices do not match" << endl;
      exit(1);
   }
   if (A.N % BLOCK_SIZE) {
      cerr << cmdname << ": size of matrices is not a multiple of "
         << BLOCK_SIZE << endl;
      exit(1);
   }

   A.copy_to_gpu();
   B.copy_to_gpu();
   Matrix C;
   C.resize(A.N);
   C.allocate_cuda_data();

   dim3 block(BLOCK_SIZE, BLOCK_SIZE);
   dim3 grid(A.N / BLOCK_SIZE, A.N / BLOCK_SIZE);

   struct timeval tbuf1;
   gettimeofday(&tbuf1, 0);
   mmm<<<grid, block>>>(A.cuda_data, B.cuda_data, C.cuda_data);
   cudaDeviceSynchronize(); // kernel launches are asynchronous; wait for completion
   struct timeval tbuf2;
   gettimeofday(&tbuf2, 0);

   long int usec = (tbuf2.tv_sec - tbuf1.tv_sec) * 1000000 +
      (tbuf2.tv_usec - tbuf1.tv_usec);
   double timeInMillisecs = (double) usec / (double) 1000;
   cerr << "GPU time in ms: " << timeInMillisecs << endl;

   C.copy_from_gpu();
   cout << "C = " << endl << setprecision(14) << C << endl;
}

==== 2013-05-14/properties.cu ====

#include <stdio.h>

#define YESNO(option) ((option)? "yes": "no")

int main() {
   int device;
   cudaGetDevice(&device);
   int device_count;
   cudaGetDeviceCount(&device_count);
   struct cudaDeviceProp device_prop;
   cudaGetDeviceProperties(&device_prop, device);
   if (device_count > 1) {
      printf("device %d selected out of %d devices:\n", device, device_count);
   } else {
      printf("one device present:\n");
   }
   printf("name: %s\n", device_prop.name);
   printf("compute capability: %d.%d\n", device_prop.major, device_prop.minor);
   printf("total global memory: %zu\n", device_prop.totalGlobalMem);
   printf("total shared memory per block: %zu\n", device_prop.sharedMemPerBlock);
   printf("registers per block: %d\n", device_prop.regsPerBlock);
   printf("warp size: %d\n", device_prop.warpSize);
   printf("mem pitch: %zu\n", device_prop.memPitch);
   printf("max threads per block: %d\n", device_prop.maxThreadsPerBlock);
   printf("max threads dim: %d %d %d\n", device_prop.maxThreadsDim[0],
      device_prop.maxThreadsDim[1], device_prop.maxThreadsDim[2]);
   printf("max grid dim: %d %d %d\n", device_prop.maxGridSize[0],
      device_prop.maxGridSize[1], device_prop.maxGridSize[2]);
   printf("multi processor count: %d\n", device_prop.multiProcessorCount);
   printf("kernel exec timeout enabled: %s\n",
      YESNO(device_prop.kernelExecTimeoutEnabled));
   printf("integrated: %s\n", YESNO(device_prop.integrated));
   printf("can map host memory: %s\n", YESNO(device_prop.canMapHostMemory));
}
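None of the programs above checks the error codes of the CUDA runtime consistently (the Matrix struct does, the rest mostly do not). A minimal checking macro such as the following could be wrapped around each runtime call; this is only a sketch, CHECK_CUDA is my own name, while cudaError_t and cudaGetErrorString() are part of the CUDA runtime API.

// minimal CUDA runtime error checking (sketch, to be placed in a .cu file)
#include <stdio.h>
#include <stdlib.h>

#define CHECK_CUDA(call) \
   do { \
      cudaError_t error_ = (call); \
      if (error_ != cudaSuccess) { \
         fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, \
            cudaGetErrorString(error_)); \
         exit(1); \
      } \
   } while (0)

// usage example:
//    CHECK_CUDA(cudaMalloc((void**)&cuda_sums, N * sizeof(Real)));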