2013-04-23/0002775000017000003100000000000012135712520011240 5ustar borchertsai2013-04-23/pointer-chasing.cpp0000664000017000003100000000721212135712172015041 0ustar borchertsai// small utility to measure cache and memory read access times // afb 10/2008 // // Usage: // pointer-chasing [memsize in kb [count]] // where // memsize in kb is the size of the memory buffer // which is accessed in a randomized // and sequentialized way // count total number of memory accesses // from the memory buffer // The memory buffer is organized as an array of pointers where // - all pointers point into the very same buffer and where // - beginning from any pointer all other pointers are // referenced directly or indirectly, and where // - the pointer chain is randomized // // Once such a memory buffer has been set up, we measure the time of // // void** p = (void**) memory[0]; // while (count-- > 0) { // p = (void**) *p; // } // // The "p = (void**) *p" construct causes all memory accesses to // be serialized, i.e. the next access can only be started whenever // the previous is finished. #include #include #include #include #include #include using namespace std; volatile void* global; // to defeat optimizations // setup a memory buffer of the given size and access all // memory cells randomly count times; // the number of clock ticks passed for chasing the // pointers in the memory buffer is returned (this // does not include the setup); // the global pointer is updated to defeat optimizations clock_t chase_pointers(unsigned int size, unsigned int count) { unsigned int len = size / sizeof(void*); void** memory = new void*[len]; // shuffle indices int* indices = new int[len]; for (int i = 0; i < len; ++i) { indices[i] = i; } for (int i = 0; i < len-1; ++i) { int j = i + lrand48() % (len - i); if (i != j) { int tmp = indices[i]; indices[i] = indices[j]; indices[j] = tmp; } } // fill memory with pointer references for (int i = 1; i < len; ++i) { memory[indices[i-1]] = (void*) &memory[indices[i]]; } memory[indices[len-1]] = (void*) &memory[indices[0]]; delete[] indices; // sleep(3); // for cputrack struct tms timebuf1; times(&timebuf1); // chase the pointers count times void** p = (void**) memory[0]; while (count-- > 0) { p = (void**) *p; } global = *p; struct tms timebuf2; times(&timebuf2); clock_t ticks = timebuf2.tms_utime - timebuf1.tms_utime; // sleep(3); // for cputrack delete[] memory; return ticks; } char* cmdname; void usage() { cerr << "Usage: " << cmdname << " [memsize in kb [count]]" << endl; exit(1); } int main(int argc, char** argv) { unsigned int memsize = 1024; unsigned int count; cmdname = *argv++; --argc; // first optional argument is the memsize in kb if (argc > 0) { unsigned int kb; istringstream arg(*argv++); --argc; if (!(arg >> kb) || kb <= 0) usage(); cout << "memsize in kb = " << kb << endl; memsize = kb * 1024; } // second optional argument is count if (argc > 0) { istringstream arg(*argv++); --argc; if (!(arg >> count) || count <= 0) usage(); } else { // compute some reasonable default value for count count = memsize * 16; unsigned int min = 1024 * 1024 * 1024; if (count < min) { count = min; } } if (argc > 0) usage(); clock_t ticks = chase_pointers(memsize, count); #ifndef CLK_TCK int CLK_TCK = sysconf(_SC_CLK_TCK); #endif double avgTimeInNanosecs = (double) ticks / CLK_TCK * 1000000000 / (double) count; cout << "avg access time in ns: " << avgTimeInNanosecs << endl; } 2013-04-23/vectors.cpp0000664000017000003100000000453512135712301013433 0ustar borchertsai#include #include #include #include #include using namespace std; // y = y + alpha * x void axpy(int n, double alpha, const double* x, int incX, double* y, int incY) { for (int i = 0; i < n; ++i, x += incX, y += incY) { *y += alpha * *x; } } class AxpyThread { public: AxpyThread(int _n, double _alpha, double* _x, int _incX, double* _y, int _incY) : n(_n), alpha(_alpha), x(_x), incX(_incX), y(_y), incY(_incY) { } void operator()() { axpy(n, alpha, x, incX, y, incY); } private: int n; double alpha; double* x; int incX; double* y; int incY; }; void mt_axpy(int n, double alpha, double* x, int incX, double* y, int incY, int nofthreads) { assert(n > 0 && nofthreads > 0); thread axpy_thread[nofthreads]; // spawn threads and pass parameters int chunk = n / nofthreads; int remainder = n % nofthreads; int nextX = 0; int nextY = 0; for (int i = 0; i < nofthreads; ++i) { int len = chunk; if (i < remainder) ++len; axpy_thread[i] = thread(AxpyThread(len, alpha, x + nextX * incX, incX, y + nextY * incY, incY)); nextX += len; nextY += len; } // wait for all threads to finish for (int i = 0; i < nofthreads; ++i) { axpy_thread[i].join(); } } char* cmdname; void usage() { cerr << "Usage: " << cmdname << " [len [# threads]]" << endl; exit(1); } void fill_vector(double* a, int len) { static double val = 0; while (len > 0) { *a++ = val; val = val + 1; --len; } } int main(int argc, char** argv) { const double a = 0; const double b = 1; int N = 10000; int nofthreads = 10; cmdname = *argv++; --argc; if (argc > 0) { istringstream arg(*argv++); --argc; if (!(arg >> N) || N <= 0) usage(); } if (argc > 0) { istringstream arg(*argv++); --argc; if (!(arg >> nofthreads) || nofthreads <= 0) usage(); } if (argc > 0) usage(); double* x = new double[N]; fill_vector(x, N); double* y = new double[N]; fill_vector(y, N); double alpha = 2; // axpy(N, alpha, x, 1, y, 1); mt_axpy(N, alpha, x, 1, y, 1, nofthreads); #ifdef OUTPUT for (int i = 0; i < N; ++i) { if (i % 8 == 0) { cout << endl; } else { cout << " "; } cout << y[i]; } cout << endl; #endif } 2013-04-23/bad-vectors.cpp0000664000017000003100000000456112135712303014160 0ustar borchertsai#include #include #include #include #include using namespace std; // y = y + alpha * x void axpy(int n, double alpha, const double* x, int incX, double* y, int incY) { for (int i = 0; i < n; ++i, x += incX, y += incY) { *y += alpha * *x; } } class AxpyThread { public: AxpyThread(int _n, double _alpha, double* _x, int _incX, double* _y, int _incY) : n(_n), alpha(_alpha), x(_x), incX(_incX), y(_y), incY(_incY) { } void operator()() { axpy(n, alpha, x, incX, y, incY); } private: int n; double alpha; double* x; int incX; double* y; int incY; }; void mt_axpy(int n, double alpha, double* x, int incX, double* y, int incY, int nofthreads) { assert(n > 0 && nofthreads > 0); thread axpy_thread[nofthreads]; // spawn threads and pass parameters int chunk = n / nofthreads; int remainder = n % nofthreads; int nextX = 0; int nextY = 0; for (int i = 0; i < nofthreads; ++i) { int len = chunk; if (i < remainder) ++len; axpy_thread[i] = thread(AxpyThread(len, alpha, x + i * incX, incX * nofthreads, y + i * incY, incY * nofthreads)); nextX += len; nextY += len; } // wait for all threads to finish for (int i = 0; i < nofthreads; ++i) { axpy_thread[i].join(); } } char* cmdname; void usage() { cerr << "Usage: " << cmdname << " [len [# threads]]" << endl; exit(1); } void fill_vector(double* a, int len) { static double val = 0; while (len > 0) { *a++ = val; val = val + 1; --len; } } int main(int argc, char** argv) { const double a = 0; const double b = 1; int N = 10000; int nofthreads = 10; cmdname = *argv++; --argc; if (argc > 0) { istringstream arg(*argv++); --argc; if (!(arg >> N) || N <= 0) usage(); } if (argc > 0) { istringstream arg(*argv++); --argc; if (!(arg >> nofthreads) || nofthreads <= 0) usage(); } if (argc > 0) usage(); double* x = new double[N]; fill_vector(x, N); double* y = new double[N]; fill_vector(y, N); double alpha = 2; // axpy(N, alpha, x, 1, y, 1); mt_axpy(N, alpha, x, 1, y, 1, nofthreads); #ifdef OUTPUT for (int i = 0; i < N; ++i) { if (i % 8 == 0) { cout << endl; } else { cout << " "; } cout << y[i]; } cout << endl; #endif } 2013-04-23/openmp-vectors.cpp0000664000017000003100000000221212135712373014726 0ustar borchertsai#include #include #include #include #include using namespace std; // y = y + alpha * x void axpy(int n, double alpha, const double* x, int incX, double* y, int incY) { #pragma omp parallel for for (int i = 0; i < n; ++i) { y[i*incY] += alpha * x[i*incX]; } } char* cmdname; void usage() { cerr << "Usage: " << cmdname << " [len]" << endl; exit(1); } void fill_vector(double* a, int len) { static double val = 0; while (len > 0) { *a++ = val; val = val + 1; --len; } } int main(int argc, char** argv) { const double a = 0; const double b = 1; int N = 10000; cmdname = *argv++; --argc; if (argc > 0) { istringstream arg(*argv++); --argc; if (!(arg >> N) || N <= 0) usage(); } if (argc > 0) usage(); double* x = new double[N]; fill_vector(x, N); double* y = new double[N]; fill_vector(y, N); double alpha = 2; axpy(N, alpha, x, 1, y, 1); #ifdef OUTPUT for (int i = 0; i < N; ++i) { if (i % 8 == 0) { cout << endl; } else { cout << " "; } cout << y[i]; } cout << endl; #endif } 2013-04-23/omp-simpson.cpp0000664000017000003100000000245512135712505014234 0ustar borchertsai#include #include #include #include #include using namespace std; // to be integrated function double f(double x) { return 4 / (1 + x*x); } // numerical integration according to the Simpson rule // for f over the interval [a,b] using n subintervals double simpson(double (*f)(double), double a, double b, int n) { assert(n > 0 && a <= b); double value = f(a)/2 + f(b)/2; double xleft; double x = a; double sum = 0; #pragma omp parallel for \ private(xleft) \ lastprivate(x) \ reduction(+:sum) for (int i = 1; i < n; ++i) { xleft = a + (i-1) * (b - a) / n; x = a + i * (b - a) / n; sum += f(x) + 2 * f((xleft + x)/2); } value += sum; value += 2 * f((x + b)/2); value *= (b - a) / n / 3; return value; } char* cmdname; void usage() { cerr << "Usage: " << cmdname << " [# intervals]" << endl; exit(1); } int main(int argc, char** argv) { const double a = 0; const double b = 1; int N = 10000; cmdname = *argv++; --argc; if (argc > 0) { istringstream arg(*argv++); --argc; if (!(arg >> N) || N <= 0) usage(); } if (argc > 0) usage(); double sum = simpson(f, a, b, N); cout << setprecision(14) << sum << endl; cout << setprecision(14) << M_PI << endl; } 2013-04-23/omp-simpson-explicit.cpp0000664000017000003100000000356112135712520016047 0ustar borchertsai#include #include #include #include #include #include using namespace std; // to be integrated function double f(double x) { return 4 / (1 + x*x); } // numerical integration according to the Simpson rule // for f over the interval [a,b] using n subintervals double simpson(double (*f)(double), double a, double b, int n) { assert(n > 0 && a <= b); double value = f(a)/2 + f(b)/2; double xleft; double x = a; for (int i = 1; i < n; ++i) { xleft = x; x = a + i * (b - a) / n; value += f(x) + 2 * f((xleft + x)/2); } value += 2 * f((x + b)/2); value *= (b - a) / n / 3; return value; } double mt_simpson(double (*f)(double), double a, double b, int n) { assert(n > 0 && a <= b); double sum = 0; #pragma omp parallel reduction(+:sum) { int nofthreads = omp_get_num_threads(); int nofintervals = n / nofthreads; int remainder = n % nofthreads; int i = omp_get_thread_num(); int interval = nofintervals * i; int intervals = nofintervals; if (i < remainder) { ++intervals; interval += i; } else { interval += remainder; } double xleft = a + interval * (b - a) / n; double x = a + (interval + intervals) * (b - a) / n; sum += simpson(f, xleft, x, intervals); } return sum; } char* cmdname; void usage() { cerr << "Usage: " << cmdname << " [# intervals]" << endl; exit(1); } int main(int argc, char** argv) { const double a = 0; const double b = 1; int N = 10000; cmdname = *argv++; --argc; if (argc > 0) { istringstream arg(*argv++); --argc; if (!(arg >> N) || N <= 0) usage(); } if (argc > 0) usage(); // double sum = simpson(f, a, b, N); double sum = mt_simpson(f, a, b, N); cout << setprecision(14) << sum << endl; cout << setprecision(14) << M_PI << endl; }