Image by Pixabay from Pexels

Part1: Python vs C++ vs CUDA: Comparing performance speed part 1 (with code)

It’s obvious that AI needs a lot of computing power. Let’s check if we can fully leverage our PCs and MACs.

Python is a programming language considered very simple, but slow.

C++ is a programming language known to be complicated, but one of the fastest.

CUDA is a parallel computing platform for general computing on graphical processing units (GPUs). With CUDA, developers are able to dramatically speed up computing applications by harnessing the power of GPUs.

Whereas both Python and C++ are nicely optimized for single-threaded execution on the CPU (with the possibility of running in parallel), CUDA is specifically designed to run in parallel.

The GPU and the CPU exist because they are designed with different goals in mind. While the CPU is designed to excel at executing a sequence of operations, called a thread, as fast as possible and can execute a few tens of these threads in parallel, the GPU is designed to excel at executing thousands of them in parallel (amortizing the slower single-thread performance to achieve greater throughput). — here is the source.

Let’s see the code

python file:

def is_prime(n):
    """Return True if n is prime, using 6k +/- 1 trial division.

    Every prime greater than 3 has the form 6k - 1 or 6k + 1, so after
    rejecting multiples of 2 and 3 it is enough to test divisors
    5, 7, 11, 13, ... up to sqrt(n).
    """
    if n <= 1:
        return False          # 0, 1 and negatives are not prime
    if n <= 3:
        return True           # 2 and 3 are prime
    if n % 2 == 0 or n % 3 == 0:
        return False
    i = 5
    while i * i <= n:
        # i and i + 2 cover the 6k - 1 / 6k + 1 candidates in each step.
        if n % i == 0 or n % (i + 2) == 0:
            return False
        i += 6
    return True

import time

# Time how long it takes to collect every prime below 200,000 using the
# pure-Python is_prime() above, then report the elapsed seconds and the
# number of primes found (should be 17,984).
start = time.time()
prime_list = []

for candidate in range(200000):
    if is_prime(candidate):
        prime_list.append(candidate)

end = time.time()
print(end - start)
print(len(prime_list))

cpp file:

#include <iostream>
#include <list>
#include <chrono>
using namespace std::chrono;
using namespace std;
// Benchmark: collect every prime below 2,000,000 with 6k +/- 1 trial
// division (mirroring the Python version) and print the elapsed time.
int main() {
    auto start = high_resolution_clock::now();

    const int limit = 2000000;
    list<int> primes;

    for (int n = 0; n < limit; n++) {
        if (n <= 1)
            continue;                 // 0 and 1 are not prime
        if (n <= 3) {                 // 2 and 3 are prime
            primes.push_back(n);
            continue;
        }
        if (n % 2 == 0 || n % 3 == 0)
            continue;

        // Primes > 3 have the form 6k +/- 1: test divisor pairs
        // (5,7), (11,13), ... up to sqrt(n).
        bool isPrime = true;
        for (int d = 5; d * d <= n; d += 6) {
            if (n % d == 0 || n % (d + 2) == 0) {
                isPrime = false;
                break;                // composite: stop early, do NOT record
            }
        }
        if (isPrime)
            primes.push_back(n);      // only record verified primes
    }

    auto stop = high_resolution_clock::now();
    auto duration = duration_cast<seconds>(stop - start);
    std::cout << "Time taken by function: " << duration.count()
              << " seconds" << endl;
    // Mirrors the Python print(len(prime_list)) so the results can be
    // cross-checked (148,933 primes below 2,000,000).
    std::cout << primes.size() << endl;
    return 0;
}

cuda file:

Note: this file is just an automatically generated file from VS’19 with minor adjustments for a prime numbers generator. In later parts there will be more implementations.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <list>
#include <stdio.h>
#include <string>
#include <iostream>
#include <chrono>
using namespace std::chrono;
using namespace std;
#define HIGH 200000
#define MAX 18000
// Host wrapper: allocates device buffers, copies the inputs, launches the
// single-thread prime kernel, and copies the found primes back into p.
cudaError_t addWithCuda(int* c, const int* a, const int* b, unsigned int size, const int h, int* p);

// Collects every prime below *h into p using 6k +/- 1 trial division.
// Deliberately launched with <<<1, 1>>> so the benchmark measures
// single-thread GPU performance; c, a and b are unused leftovers from the
// Visual Studio "add" template.  p has capacity MAX entries.
__global__ void addKernel(int* c, const int* a, const int* b, const int* h, int* p)
{
    const int high = *h;
    int pi = 0;  // next free slot in p

    for (int n = 0; n < high; n++) {
        if (n <= 1)
            continue;                       // 0 and 1 are not prime
        if (n <= 3) {                       // 2 and 3 are prime
            if (pi < MAX) p[pi++] = n;
            continue;
        }
        if (n % 2 == 0 || n % 3 == 0)
            continue;

        // Primes > 3 have the form 6k +/- 1: test (5,7), (11,13), ...
        bool isPrime = true;
        for (int d = 5; d * d <= n; d += 6) {
            if (n % d == 0 || n % (d + 2) == 0) {
                isPrime = false;
                break;                      // composite: stop, do NOT record
            }
        }
        if (isPrime && pi < MAX)            // bounds-guard against overflow
            p[pi++] = n;
    }
}

int main()
{
    auto start = high_resolution_clock::now();

    // Leftover template vectors; the kernel ignores their contents.
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    const int high = HIGH;
    // static: MAX ints (~72 KB) would be risky on the stack.
    static int p_main[MAX] = { 0 };

    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize, high, p_main);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    // cudaDeviceReset must be called before exiting so profiling tools
    // (e.g. Nsight) see complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    auto stop = high_resolution_clock::now();
    auto duration = duration_cast<microseconds>(stop - start);
    // Label fixed: the cast above produces microseconds, not seconds.
    std::cout << "Time taken by function: " << duration.count()
              << " microseconds" << endl;
    return 0;
}

cudaError_t addWithCuda(int* c, const int* a, const int* b, unsigned int size, const int h, int* p)
{
    int* dev_a = 0;
    int* dev_b = 0;
    int* dev_c = 0;
    int* hh = 0;    // device copy of the upper bound h
    int* pp = 0;    // device output buffer for the primes
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&pp, MAX * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&hh, sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(hh, &h, sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch the kernel on a single thread (single-thread benchmark).
    addKernel<<<1, 1>>>(dev_c, dev_a, dev_b, hh, pp);

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Wait for the kernel to finish and surface any execution errors.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(p, pp, MAX * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Free everything, including hh and pp (previously leaked).
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(hh);
    cudaFree(pp);
    return cudaStatus;
}

Summary

python: 3.74 secs

c++: 0.74 secs

cuda: 0.94 secs

Comment

Python: very nice for development, but it is interpreted, which makes it slower. Actually, most high-performance computing platforms support Python interfaces, which makes them ALMOST as fast as CUDA and C++.

C++: the unquestionable leader in performance for many years. A natural choice for gaming and vision. It is a lot harder to build a robust solution due to its complex syntax.

CUDA: super efficient for parallel computations — but as we saw from the results, it adds nothing for single-threaded computations. It requires deep expertise!

Take care! Part 2 soon!

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store