I have 4 Rocket RISC-V cores and each are connected to 4 identical accelerators. What I am trying to do is to use multithreading to run all or some of the accelerators at the same time and compare the performance results among them. I need to measure the time between when the first started thread begins its job until the last started thread finishes its task using csr counters. The task for all threads is the same and they are not dependent on others.
I am struggling measuring the correct time, however, and see that threads start one after other instead of being concurrent (I don't expect them to start at exactly the same time but I need to see some overlap).
What I see, for example, for two threads on one CPU for 10 tests:
Thread 0 started at 12612628151 and ended at 12621280072
Thread 1 started at 16418324070 and ended at 16426815938
or two threads on two CPUs:
Thread 0 started at 12599644918 and ended at 12608560166
Thread 1 started at 15271446547 and ended at 15280014314
// expecting Thread 1 starts at 12599900000 for example
How can I measure the time properly? Here is my code:
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "encoding.h"
#include "rocc.h"
#include "data/tests.h"
//----------------------------------------------------------------
//------------------ UTILITY ------------------------------------
//----------------------------------------------------------------
uint64_t freq = 25000000; // 25 MHz
// Shared measurement accumulators, written by the worker threads.
// NOTE(review): `volatile` does not make concurrent increments atomic or
// thread-safe; total_tasks and total_clk_cycles are updated from multiple
// threads in run() without holding timing_mutex.
volatile uint64_t total_tasks = 0;
volatile double total_time = 0.0;
uint64_t total_clk_cycles = 0;
uint64_t total_insts = 0;
double inst_per_cycles = 0.0;
/*
 * Print the aggregate measurement summary from the shared counters.
 * total_time: wall-clock seconds measured in main() around create/join.
 * Fix: guard both throughput computations against a zero denominator
 * (total_time == 0 or total_clk_cycles == 0 would divide by zero).
 */
void print_measurments(double total_time) {
    printf("\nTotal number of instructions: %lu \n", total_insts);
    printf("Total number of cycles: %lu \n", total_clk_cycles);
    printf("Total number of tasks finished: %lu \n", total_tasks);
    if (total_time > 0.0) {
        double tasks_per_second = total_tasks / total_time;
        printf("Tasks per second with gettime(): %.5f\n", tasks_per_second);
    }
    if (total_clk_cycles > 0) {
        printf("Tasks per second with csr_cycles(): %f\n", (double)total_tasks / ((double)total_clk_cycles / (double)freq));
    }
}
// Pack two 32-bit values into one 64-bit word: x in the high half, y in the low.
uint64_t concatenate_ints(unsigned int x, unsigned int y) {
    uint64_t high_half = (uint64_t)x << 32;
    return high_half | (uint64_t)y;
}
// Pack three fields into one 64-bit word: x in bits 63..48, y in bits 47..32,
// z in bits 31..0. Oversized inputs bleed into the neighbouring field,
// exactly as in the original packing.
uint64_t concatenate_3ints(uint64_t x, uint64_t y, uint64_t z) {
    uint64_t packed = x << 48;
    packed |= y << 32;
    packed |= z;
    return packed;
}
/*
 * Pack the low 32 bits of two pointers into one 64-bit word (x high, y low).
 * BUG FIX: the original OR'd the full 64-bit value of y, so on a 64-bit
 * target any address bits of y above bit 31 corrupted x's half of the word.
 * Both pointers are now explicitly truncated to 32 bits (the shift already
 * discarded x's upper bits, so masking x is behavior-preserving).
 */
uint64_t concatenate_arrays(int64_t* x, int64_t* y) {
    uint64_t hi = ((uint64_t)(uintptr_t)x & 0xFFFFFFFFu) << 32;
    uint64_t lo = (uint64_t)(uintptr_t)y & 0xFFFFFFFFu;
    return hi | lo;
}
//################################################################
//----------------------------------------------------------------
//------------------ THREAD -------------------------------------
//----------------------------------------------------------------
// Report a pthreads-style error: pthread functions RETURN the error code
// instead of setting errno, so stash it in errno before perror().
#define handle_error_en(en, msg) \
do { errno = en; perror(msg); exit(EXIT_FAILURE); } while (0)
// Report an errno-based error and abort.
#define handle_error(msg) \
do { perror(msg); exit(EXIT_FAILURE); } while (0)
// Serializes updates to the shared measurement accumulators.
pthread_mutex_t timing_mutex = PTHREAD_MUTEX_INITIALIZER;
// Wall-clock endpoints sampled in main() around thread creation/join.
struct timespec start_time, end_time;
// Per-thread state handed to thread_start()/run(); one entry per worker,
// allocated as an array in main().
struct thread_info {
pthread_t thread_id; // handle returned by pthread_create()
int thread_num; // 0-based index of this thread
int res; // result of run(), filled in by thread_start()
int cpumask; // bitmask of CPUs this thread may run on (parsed from argv)
int accelid; // RoCC accelerator index driven by this thread
test_struct* tests; // shared array of workload descriptors
size_t num_tests; // number of entries of tests to execute
uint64_t start_time; // cycle CSR sample taken before the test loop
uint64_t end_time; // cycle CSR sample taken after the test loop
};
int run(struct thread_info *tinfo);
// pthread entry point: runs the workload described by arg and stores the
// result in tinfo->res. The returned pointer targets tinfo->res, which
// remains valid because main()'s tinfo array outlives the join.
static void *thread_start(void *arg) {
struct thread_info *tinfo = arg;
tinfo->res = run(tinfo);
return (void *) &(tinfo->res);
}
/*
 * Create `nthreads` workers, each bound to one accelerator (accelid ==
 * thread index) and given a cpumask parsed from the command line.
 *
 * BUG FIX: per the usage string, argv[1] is <num_tests> and argv[2] is
 * <num_threads>, so the per-thread cpumask arguments start at argv[3].
 * The original read argv[tnum + 2], which made thread 0 use <num_threads>
 * as its cpumask and shifted every other mask by one.
 */
void initialize_threads(struct thread_info *tinfo, int nthreads, char *argv[], test_struct* tests, size_t num_tests) {
    pthread_attr_t attr;
    int s = pthread_attr_init(&attr);
    if (s != 0) {
        handle_error_en(s, "pthread_attr_init");
    }
    // Enlarge the default stack: workers allocate test buffers and call libm.
    size_t size = PTHREAD_STACK_MIN + 0x1000000;
    s = pthread_attr_setstacksize(&attr, size);
    if (s != 0) {
        handle_error_en(s, "pthread_attr_setstacksize");
    }
    for (int tnum = 0; tnum < nthreads; tnum++) {
        tinfo[tnum].thread_num = tnum;
        tinfo[tnum].cpumask = atoi(argv[tnum + 3]); // cpumasks start at argv[3]
        tinfo[tnum].accelid = tnum; // match accelid with thread_num
        tinfo[tnum].tests = tests;
        tinfo[tnum].num_tests = num_tests;
        s = pthread_create(&tinfo[tnum].thread_id, &attr, &thread_start, &tinfo[tnum]);
        if (s != 0) {
            handle_error_en(s, "pthread_create");
        }
    }
    // The attr object is no longer needed once all threads are created.
    s = pthread_attr_destroy(&attr);
    if (s != 0) {
        handle_error_en(s, "pthread_attr_destroy");
    }
}
// Wait for every worker to finish. The per-thread result pointer returned by
// thread_start() is received but deliberately ignored; run()'s status is
// also available in tinfo[i].res.
void join_threads(struct thread_info *tinfo, int nthreads) {
    for (int tnum = 0; tnum < nthreads; tnum++) {
        void *retval;
        int rc = pthread_join(tinfo[tnum].thread_id, &retval);
        if (rc != 0) {
            handle_error_en(rc, "pthread_join");
        }
    }
}
/*
 * Pin `thread` to the CPU set encoded in the bitmask `cpumask`
 * (bit i set => CPU i allowed).
 *
 * BUG FIXES:
 *  - The original loop checked only bits 0..2 (`i < 3`), so CPU 3 could
 *    never be selected even though the target has 4 cores. The loop now
 *    walks every set bit of the mask.
 *  - pthread_setaffinity_np() RETURNS the error code rather than setting
 *    errno, so perror() printed a stale message; use strerror(result).
 */
void set_thread_affinity(pthread_t thread, int cpumask) {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    unsigned mask = (unsigned)cpumask;
    for (int cpu = 0; mask != 0; ++cpu, mask >>= 1) {
        if (mask & 1u) {
            CPU_SET(cpu, &cpuset);
        }
    }
    int result = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    if (result != 0) {
        fprintf(stderr, "pthread_setaffinity_np: %s\n", strerror(result));
    }
}
// Sample the RISC-V cycle CSR as this thread's workload start timestamp.
// NOTE(review): rdcycle() comes from encoding.h and reads the current hart's
// counter; whether counters are synchronized across harts is not visible
// here -- confirm before comparing timestamps taken on different CPUs.
void record_start_time(struct thread_info *tinfo) {
tinfo->start_time = rdcycle();
}
// Sample the cycle CSR as this thread's workload end timestamp.
// NOTE(review): the lock/unlock pair below only guards a commented-out
// accumulation, so it currently does nothing; either restore the update
// here (and remove the unsynchronized one in run()) or drop the locking.
void record_end_time(struct thread_info *tinfo) {
tinfo->end_time = rdcycle();
pthread_mutex_lock(&timing_mutex);
//total_clk_cycles += (tinfo->end_time - tinfo->start_time);
pthread_mutex_unlock(&timing_mutex);
}
// Per-accelerator start/end cycle samples, overwritten on EVERY loop
// iteration in run() (so after the loop they hold only the last task's
// timestamps). NOTE(review): these are shared, unsynchronized globals --
// each thread's min4/max4 printout reads all four pairs and therefore mixes
// in the other threads' timestamps, which explains the confusing output.
uint64_t th0_start_cycle = 0, th0_end_cycle = 0;
uint64_t th1_start_cycle = 0, th1_end_cycle = 0;
uint64_t th2_start_cycle = 0, th2_end_cycle = 0;
uint64_t th3_start_cycle = 0, th3_end_cycle = 0;
// Largest of four values, treating zero slots as "unused".
// Returns 0 when all four inputs are zero.
uint64_t max4(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
    const uint64_t vals[4] = { a, b, c, d };
    uint64_t best = 0;
    for (int i = 0; i < 4; ++i) {
        if (vals[i] != 0 && vals[i] > best) {
            best = vals[i];
        }
    }
    return best;
}
// Smallest of four values, treating zero slots as "unused".
// Returns UINT64_MAX when all four inputs are zero.
uint64_t min4(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
    const uint64_t vals[4] = { a, b, c, d };
    uint64_t best = UINT64_MAX;
    for (int i = 0; i < 4; ++i) {
        if (vals[i] != 0 && vals[i] < best) {
            best = vals[i];
        }
    }
    return best;
}
int run(struct thread_info *tinfo) {
int threadid = tinfo->thread_num;
int cpumask = tinfo->cpumask;
int accelid = tinfo->accelid;
test_struct *tests = tinfo->tests;
size_t num_tests = tinfo->num_tests;
// Get the current thread ID and set the affinity
pthread_t thread = pthread_self();
set_thread_affinity(thread, cpumask);
int64_t task_done = 0;
int num_test_passed = 0;
uint64_t *data1 = (uint64_t *)malloc(sizeof(uint64_t) * tests[0].data1_packs);
uint64_t *data2 = (uint64_t *)malloc(sizeof(uint64_t) * tests[0].data2_packs);
printf("[thread %d] Starting tests\n", threadid);
record_start_time(tinfo);
for (size_t i = 0; i < num_tests; i++) {
uint64_t local_param = ceil(((double)(tests[i].data2_len) / 11));
uint64_t data1_len_size = concatenate_ints(tests[i].data1_len, tests[i].data1_packs);
uint64_t data2_len_size_var = concatenate_3ints(tests[i].data2_len, tests[i].data2_packs, local_param);
if (tests[i].data1 != (i > 0 ? tests[i - 1].data1 : NULL)) {
for (size_t j = 0; j < tests[i].data1_packs; j++) {
data1[j] = tests[i].data1[j];
}
switch (accelid) {
case 0:
ROCC_INSTRUCTION_SS(0, data1, data1_len_size, 0);
break;
case 1:
ROCC_INSTRUCTION_SS(1, data1, data1_len_size, 0);
break;
case 2:
ROCC_INSTRUCTION_SS(2, data1, data1_len_size, 0);
break;
case 3:
ROCC_INSTRUCTION_SS(3, data1, data1_len_size, 0);
break;
default:
break;
}
}
for (size_t j = 0; j < tests[i].data2_packs; j++) {
data2[j] = tests[i].data2[j];
}
switch (accelid) {
case 0:
th0_start_cycle = rdcycle();
ROCC_INSTRUCTION_DSS(0, task_done, data2, data2_len_size_var, 1);
th0_end_cycle = rdcycle();
total_tasks++;
break;
case 1:
th1_start_cycle = rdcycle();
ROCC_INSTRUCTION_DSS(1, task_done, data2, data2_len_size_var, 1);
th1_end_cycle = rdcycle();
total_tasks++;
break;
case 2:
th2_start_cycle = rdcycle();
ROCC_INSTRUCTION_DSS(2, task_done, data2, data2_len_size_var, 1);
th2_end_cycle = rdcycle();
total_tasks++;
break;
case 3:
th3_start_cycle = rdcycle();
ROCC_INSTRUCTION_DSS(3, task_done, data2, data2_len_size_var, 1);
th3_end_cycle = rdcycle();
total_tasks++;
break;
default:
break;
}
}
uint64_t th_start_time = min4(th0_start_cycle, th1_start_cycle, th2_start_cycle, th3_start_cycle);
uint64_t th_end_time = max4(th0_end_cycle, th1_end_cycle, th2_end_cycle, th3_end_cycle);
printf( "th%d start = %ld, th1 start = %ld, th2 start = %ld, th2 start = %ld => min = %ld \n", threadid, th0_start_cycle, th1_start_cycle, th2_start_cycle, th3_start_cycle, th_start_time);
printf( "th%d end = %ld, th1 end = %ld, th2 end = %ld, th2 end = %ld => max = %ld \n", threadid, th0_end_cycle, th1_end_cycle, th2_end_cycle, th3_end_cycle, th_end_time);
total_clk_cycles += (th_end_time - th_start_time);
record_end_time(tinfo);
free(data2);
free(data1);
printf("\n[thread %d] Tests finished.\n", threadid, num_test_passed);
return 0;
}
/*---------------------------------------------------------------*/
/*---------------------MAIN FUNCTION-----------------------------*/
/*---------------------------------------------------------------*/
/*
 * Entry point. Expected argv layout (matching the usage string):
 *   argv[1] = num_tests, argv[2] = num_threads,
 *   argv[3 .. 2+num_threads] = one cpumask per thread.
 * Measures wall-clock time around thread creation/join and prints each
 * thread's first-start/last-end cycle timestamps afterwards.
 *
 * Fixes vs. the original:
 *  - argc was only checked against 2 even though two positional arguments
 *    plus num_threads cpumasks are required.
 *  - num_tests/num_threads are validated as positive.
 *  - num_tests is clamped to the number of entries in tests[]; running past
 *    the end of the array read out of bounds in run().
 */
int main(int argc, char *argv[]) {
    printf("start\n");
    if (argc < 3) {
        printf("Usage: %s <num_tests> <num_threads> <cpumask1> <cpumask2> ... <accelid1> <accelid2> ...\n", argv[0]);
        exit(EXIT_FAILURE);
    }
    int num_tests = atoi(argv[1]);
    int num_threads = atoi(argv[2]);
    if (num_tests <= 0 || num_threads <= 0) {
        fprintf(stderr, "num_tests and num_threads must be positive integers\n");
        exit(EXIT_FAILURE);
    }
    // One cpumask argument is required per thread, starting at argv[3].
    if (argc < 3 + num_threads) {
        fprintf(stderr, "expected %d cpumask arguments after <num_threads>\n", num_threads);
        exit(EXIT_FAILURE);
    }
    int avail_tests = (int)(sizeof(tests) / sizeof(tests[0]));
    if (num_tests > avail_tests) {
        fprintf(stderr, "only %d tests available; clamping num_tests from %d\n", avail_tests, num_tests);
        num_tests = avail_tests;
    }
    printf("num_tests = %d & num_threads = %d \n", num_tests, num_threads);
    struct thread_info *tinfo = calloc(num_threads, sizeof(*tinfo));
    if (tinfo == NULL) {
        handle_error("calloc");
    }
    clock_gettime(CLOCK_MONOTONIC, &start_time);
    initialize_threads(tinfo, num_threads, argv, tests, num_tests);
    join_threads(tinfo, num_threads);
    clock_gettime(CLOCK_MONOTONIC, &end_time);
    total_time = (end_time.tv_sec - start_time.tv_sec) + (end_time.tv_nsec - start_time.tv_nsec) / 1e9;
    print_measurments(total_time);
    // Print each thread's own first-start and last-end cycle timestamps.
    for (int i = 0; i < num_threads; i++) {
        printf("Thread %d started at %lu and ended at %lu\n", tinfo[i].thread_num, tinfo[i].start_time, tinfo[i].end_time);
    }
    free(tinfo);
    return EXIT_SUCCESS;
}
And the prints inside run() function shows (again for 10 tests):
[thread 0] Starting tests
[thread 1] Starting tests
th0 start = 12620313755, th1 start = 16425843909, th2 start = 0, th2 start = 0 => min = 12620313755
th0 end = 12621128480, th1 end = 16425836178, th2 end = 0, th2 end = 0 => max = 16425836178
th1 start = 12620313755, th1 start = 16425843909, th2 start = 0, th2 start = 0 => min = 12620313755
th1 end = 12621128480, th1 end = 16426650923, th2 end = 0, th2 end = 0 => max = 16426650923
[thread 0] Tests finished.
[thread 1] Tests finished.
The theoretical number of cycles for each task is about 80,000 for one test. For 10 tests it should be 800,000 cycles. So, for example, for thread 0 above, if I subtract the end cycles (12621128480) and start cycles (12620313755) I get 814725, which is close to 800,000.
Ultimately I want to run 16 threads at (almost) the same time on 4 cores for 16 accelerators and measure the time between the start time of the 1'st thread and end time of 16'th thread. Also just to note, the number of tasks being done in this case will be 16 time more (with 10 tests, tasks finished = 160).
Please let me know if you need more information.
Update (tests.h):
// Second operand payload: 32 packed 64-bit words, shared by every test entry.
uint64_t data2[] = {
0x5e54c42e48f95681,
0x6817f0ac737ce56c,
0x2c25c96e2f08878e,
0x76e63b8e74593080,
0x5495a7174a9738cb,
0x766b718558de643e,
0x112af5be56f0419d,
0x39dade335211fa50,
0x2da48dcf3d051c37,
0x0c7b1f97089158d8,
0x51c71c37647dd215,
0x193c2e614ee73ee9,
0x6369f4c63ea7bf2d,
0x1340c33d264fad65,
0x705d49c404b1a154,
0x266694b14eb20df2,
0x4daaf7d50e7e855d,
0x422ef35f79d0c143,
0x3d870ceb39152eed,
0x6e29f1c3121cb402,
0x03ac67b864956348,
0x6afb184014d75d76,
0x3b85a4e524d5f673,
0x66e957c6692a32b5,
0x61db12aa7364775d,
0x71bb8b8d33a22ee2,
0x57e249720af7b9ee,
0x02896dcb3b4c3e39,
0x499f791b15ca3108,
0x619beb9e39fcc2df,
0x1a7bd25c0802804f,
0x08aed0d26826ca32
};
// First operand payload: 33 packed 64-bit words, shared by every test entry.
// NOTE(review): tests[] below declares data1_packs = 1566, but this array
// holds only 33 elements -- run()'s copy loop would read far past the end.
// Verify whether the real data set was truncated for posting.
uint64_t data1[] =
{
0xBBE6139D23887ACF,
0xA5C3E51BDB1C7101,
0xE5E8E8377D210A02,
0xCE1D0EB443C754E8,
0x52FE52BE649D946B,
0x841A847EABE22232,
0xFB6DD72788419F4A,
0xDAB563352538E68C,
0xCD809B2A690DD264,
0x767A3E544BD27E8,
0xC43B11A46691E3F1,
0x9D709F1788087B54,
0x233AC62E6F14D8F0,
0x9B31EFD15B268F5E,
0x911AA32FFC2C5485,
0x52E9C7B33B6B1A8D,
0xE590F5241B3E896A,
0x8F15C4A2891F1BD0,
0xA1831064E81917BF,
0xA66EE7901ADE7A05,
0x499600463D2E2DBE,
0x8FBAFAC1084EA740,
0x7A3CDBD78AA1F6F8,
0x229EB0F8F889B710,
0xB4E48DBDF6F101E1,
0x1DC8989C5D226B6C,
0xBC49E8E6570BDB0D,
0x45446206D4087C,
0xA945EA2DA761AC00,
0xC544955BD883EAD7,
0x2A3772AFA29009F1,
0xA28C1A9DD2648D10,
0xDEC2D76F0EEB931C
};
// Describes one accelerator workload: pointers to the packed input words
// plus their pack counts and logical lengths.
typedef struct {
uint64_t *data1; // first operand, data1_packs x 64-bit words
uint64_t *data2; // second operand, data2_packs x 64-bit words
int data1_packs; // number of 64-bit words behind data1
int data2_packs; // number of 64-bit words behind data2
int data1_len; // logical length of data1 -- TODO confirm units (bits? elements?)
int data2_len; // logical length of data2 -- same caveat as data1_len
} test_struct;
// 6 tests
// NOTE(review): every entry claims data1_packs = 1566, but data1[] above
// holds only 33 elements, so the copy loop in run() reads out of bounds.
// Also, main() may be asked for more tests (e.g. 10) than the 6 entries
// here, which walks past the end of this array -- confirm intended sizes.
test_struct tests[] = {
{data1, data2, 1566, 32, 50000, 1000},
{data1, data2, 1566, 32, 50000, 1000},
{data1, data2, 1566, 32, 50000, 1000},
{data1, data2, 1566, 32, 50000, 1000},
{data1, data2, 1566, 32, 50000, 1000},
{data1, data2, 1566, 32, 50000, 1000}
};
Multicore Test:
I used the script below to use taskset the run the threaded program on 4 CPUs:
#!/bin/bash
# Build the benchmark and launch one taskset-pinned instance per CPU,
# prefixing each instance's output with its CPU id.
gcc -o fix1 fix1.c -pthread -lm
for i in 0 1 2 3; do
# NOTE(review): "-g 300 4" does not match the program's documented
# "<num_tests> <num_threads> <cpumask...>" usage (atoi("-g") yields 0);
# presumably fix1 here is a modified build -- confirm its CLI.
(taskset -c $i ./fix1 -g 300 4 2>&1 | sed "s/^/CPU$i: /") &
done
wait
And the output is (I am only showing the 7 rightmost digits of the start and end times; the min and max times for each CPU are the start time of the first thread started and the end time of the last thread finished, respectively):
CPU0: start
CPU0: t=1 g=1
CPU0: num_tests = 300 & num_threads = 4
CPU0: tests = 2500 elements
CPU0: d1cnt=1566 d2cnt=32
CPU0: T1 start 4403660 end 4526744 ELAPSED: 123084
CPU0: T2 start 4598096 end 4684928 ELAPSED: 86832
CPU0: T3 start 4717724 end 4804556 ELAPSED: 86832
CPU0: T0 start 4880624 end 4967708 ELAPSED: 87084
CPU0: Minimum start time: 3602931164403660
CPU0: Maximum end time: 3602931164967708
CPU0: Time difference: 564048
CPU0:
CPU0: Total number of instructions: 0
CPU0: Total number of cycles: 564048
CPU0: Total number of Tasks finished: 1204
CPU0: Tasks per second with gettime(): 4344359.00859
CPU0: Tasks per second with csr_cycles(): 53364.252688
CPU2: start
CPU2: t=1 g=1
CPU2: num_tests = 300 & num_threads = 4
CPU2: tests = 2500 elements
CPU2: d1cnt=1566 d2cnt=32
CPU2: T1 start 4659980 end 4770068 ELAPSED: 110088
CPU2: T2 start 4854488 end 4941212 ELAPSED: 86724
CPU2: T3 start 4977392 end 5063900 ELAPSED: 86508
CPU2: T0 start 5122508 end 5209736 ELAPSED: 87228
CPU2: Minimum start time: 3602931164659980
CPU2: Maximum end time: 3602931165209736
CPU2: Time difference: 549756
CPU2:
CPU2: Total number of instructions: 0
CPU2: Total number of cycles: 549756
CPU2: Total number of Tasks finished: 1204
CPU2: Tasks per second with gettime(): 4423364.47568
CPU2: Tasks per second with csr_cycles(): 54751.562511
CPU3: start
CPU3: t=1 g=1
CPU3: num_tests = 300 & num_threads = 4
CPU3: tests = 2500 elements
CPU3: d1cnt=1566 d2cnt=32
CPU3: T1 start 4951220 end 5055224 ELAPSED: 104004
CPU3: T2 start 5119988 end 5206748 ELAPSED: 86760
CPU3: T3 start 5239832 end 5326556 ELAPSED: 86724
CPU3: T0 start 5390636 end 5477720 ELAPSED: 87084
CPU3: Minimum start time: 3602931164951220
CPU3: Maximum end time: 3602931165477720
CPU3: Time difference: 526500
CPU3:
CPU3: Total number of instructions: 0
CPU3: Total number of cycles: 526500
CPU3: Total number of Tasks finished: 1204
CPU3: Tasks per second with gettime(): 4134033.32635
CPU3: Tasks per second with csr_cycles(): 57169.990503
CPU1: start
CPU1: t=1 g=1
CPU1: num_tests = 300 & num_threads = 4
CPU1: tests = 2500 elements
CPU1: d1cnt=1566 d2cnt=32
CPU1: T1 start 991928 end 1097804 ELAPSED: 105876
CPU1: T2 start 1170452 end 1257428 ELAPSED: 86976
CPU1: T3 start 1297820 end 1384400 ELAPSED: 86580
CPU1: T0 start 1448336 end 1535456 ELAPSED: 87120
CPU1: Minimum start time: 3602931190991928
CPU1: Maximum end time: 3602931191535456
CPU1: Time difference: 543528
CPU1:
CPU1: Total number of instructions: 0
CPU1: Total number of cycles: 543528
CPU1: Total number of Tasks finished: 1204
CPU1: Tasks per second with gettime(): 4486845.04733
CPU1: Tasks per second with csr_cycles(): 55378.931720
Please post a single .c file with all standard includes (e.g. #include <stdio.h>) but no project headers like #include "myprogram.h" — for those, copy and paste the .h contents into the file. Errors may be where you don't suspect them, so post the full code (including main). For runtime errors it should compile cleanly; we'd like to inspect, build, and run the code on our own systems. You might also: (1) compile with -Wall and -fsanitize=address -O0 -g, (2) run it under gdb or strace, (3) add debug printfs. Without the .h files we can't try to run your program; even commenting those includes out, the ROCC* macros are undefined, and your testing-related code (e.g. test_struct) is also undefined.