Files
2024-07-19 13:30:31 +03:00

359 lines
17 KiB
C

#ifndef CONV2D_OPT_H
#define CONV2D_OPT_H

#include <stdint.h>
/*
 * conv2_8bits - 2-D convolution issued through the custom neur_init,
 * nn_mac_8b and neur_res instructions.
 *
 * For every output element out[j][k][i] the function
 *   1. issues neur_init with bias[i] and bias_shift_mode[i];
 *   2. walks the fil_dim[1] x fil_dim[2] filter window; for each tap whose
 *      input coordinate (k1, k2) lies inside the input it reads
 *      inp[k1][k2][m] for m in [0, fil_dim[3]) and issues four nn_mac_8b
 *      instructions per input word, pairing it with the weight words
 *      fil[i][p][n][4*m] .. fil[i][p][n][4*m+3]. Taps that fall outside the
 *      input issue no instruction at all;
 *   3. issues neur_res with quantized_multiplier and out_shift_rl and stores
 *      the value that instruction returns.
 *
 * Parameters:
 *   in_dim               extents of inp; in_dim[0] / in_dim[1] bound k1 / k2.
 *   fil_dim              extents of fil; [1] and [2] are the window height
 *                        and width, [3] the number of input words per tap.
 *   out_dim              extents of out: [0] rows, [1] columns, [2] depth.
 *   inp                  input words.
 *   fil                  weight words, four per input word.
 *   bias                 one bias per output-depth index.
 *   out                  result array, written once per element.
 *   strides              step between consecutive windows, both directions.
 *   pad                  only pad[0] (row offset) and pad[2] (column offset)
 *                        are read.
 *   bias_shift_mode      per-output-depth operand passed to neur_init.
 *   quantized_multiplier operand passed to neur_res.
 *   out_shift_rl         operand passed to neur_res.
 *
 * NOTE(review): bias, bias_shift_mode and fil are indexed with
 * i < out_dim[2] although they are dimensioned by fil_dim[0]; the code does
 * not check that out_dim[2] <= fil_dim[0].
 * NOTE(review): `res` is an output-only operand of every asm statement, so
 * the running sum presumably lives in accelerator state between
 * instructions; the operand packing is likewise not visible here - confirm
 * both against the ISA extension specification.
 */
void conv2_8bits(int in_dim[3], int fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]], const int fil[fil_dim[0]][fil_dim[1]][fil_dim[2]][fil_dim[3] << 2], const int bias[fil_dim[0]], int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[4], const int bias_shift_mode[], const int quantized_multiplier, const int out_shift_rl){
int i, j, k, m, n, p, res, k1, k2, str1, str2, w, in_cnn, bias_val;
for (i = 0; i < out_dim[2]; i++) { // output depth
// Start one stride early: the row loop adds `strides` before first use.
str1 = -pad[0] - strides;
for (j = 0; j < out_dim[0]; j++) { // output height
str1 += strides;
// Same trick for the column origin.
str2 = -pad[2] - strides;
for (k = 0; k < out_dim[1]; k++) { // output width
bias_val = bias[i];
// Begin a new output element.
asm volatile("neur_init %0, %1, %2\n":"=r"(res):"r"(bias_val),"r"(bias_shift_mode[i]):);
str2 += strides;
for (p = 0; p < fil_dim[1]; p++) { // filters height
for (n = 0; n < fil_dim[2]; n++) { // filters width
// Input coordinate covered by filter tap (p, n).
k1 = str1 + p;
k2 = str2 + n;
// Skip taps that land in the padding region.
if (k1 < in_dim[0] && k1 >= 0 && k2 >= 0 && k2 < in_dim[1]) {
for (m = 0; m < fil_dim[3]; m++) { // filters depth
in_cnn = inp[k1][k2][m];
// One input word is paired with four consecutive weight words.
w = fil[i][p][n][4*m];
asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
w = fil[i][p][n][4*m+1];
asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
w = fil[i][p][n][4*m+2];
asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
w = fil[i][p][n][4*m+3];
asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
}
}
}
}
// Finish the element; only this instruction's result is stored.
asm volatile("neur_res %0, %1, %2\n":"=r"(res):"r"(quantized_multiplier),"r"(out_shift_rl):);
out[j][k][i] = res;
}
}
}
}
/*
 * conv2_8bits_1ch - variant of the nn_mac_8b convolution that reads a single
 * input word per tap.
 *
 * For every output element out[j][k][i] the function issues neur_init with
 * bias[i] and bias_shift_mode[i], then for each in-bounds tap of the
 * fil_dim[1] x fil_dim[2] window reads inp[k1][k2][0] and issues four
 * nn_mac_8b instructions pairing it with fil[i][p][n][0] .. fil[i][p][n][3],
 * and finally issues neur_res with quantized_multiplier and out_shift_rl and
 * stores the returned value. There is no loop over fil_dim[3]; only index 0
 * of the input's last dimension is ever read. Out-of-bounds taps issue no
 * instruction.
 *
 * Parameters:
 *   in_dim               extents of inp; in_dim[0] / in_dim[1] bound k1 / k2.
 *   fil_dim              extents of fil; [1] and [2] are the window size.
 *   out_dim              extents of out: [0] rows, [1] columns, [2] depth.
 *   inp, fil, bias       input words, weight words, per-depth bias.
 *   out                  result array, written once per element.
 *   strides              step between consecutive windows.
 *   pad                  only pad[0] (row offset) and pad[2] (column offset)
 *                        are read.
 *   bias_shift_mode      per-output-depth operand passed to neur_init.
 *   quantized_multiplier operand passed to neur_res.
 *   out_shift_rl         operand passed to neur_res.
 *
 * NOTE(review): `res` is output-only in every asm statement, so accumulation
 * presumably happens in accelerator state - confirm against the ISA spec.
 */
void conv2_8bits_1ch(int in_dim[3], int fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]], const int fil[fil_dim[0]][fil_dim[1]][fil_dim[2]][fil_dim[3] << 2], const int bias[fil_dim[0]], int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[4], const int bias_shift_mode[], const int quantized_multiplier, const int out_shift_rl){
int i, j, k, n, p, res, k1, k2, str1, str2, w, in_cnn, bias_val;
for (i = 0; i < out_dim[2]; i++) { // output depth
// Start one stride early: the row loop adds `strides` before first use.
str1 = -pad[0] - strides;
for (j = 0; j < out_dim[0]; j++) { // output height
str1 += strides;
str2 = -pad[2] - strides;
for (k = 0; k < out_dim[1]; k++) { // output width
bias_val = bias[i];
// Begin a new output element.
asm volatile("neur_init %0, %1, %2\n":"=r"(res):"r"(bias_val),"r"(bias_shift_mode[i]):);
str2 += strides;
for (p = 0; p < fil_dim[1]; p++) { // filters height
for (n = 0; n < fil_dim[2]; n++) { // filters width
// Input coordinate covered by filter tap (p, n).
k1 = str1 + p;
k2 = str2 + n;
// Skip taps that land in the padding region.
if (k1 < in_dim[0] && k1 >= 0 && k2 >= 0 && k2 < in_dim[1]) {
// Single input word, paired with weight words 0..3.
in_cnn = inp[k1][k2][0];
w = fil[i][p][n][0];
asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
w = fil[i][p][n][1];
asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
w = fil[i][p][n][2];
asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
w = fil[i][p][n][3];
asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
}
}
}
// Finish the element; only this instruction's result is stored.
asm volatile("neur_res %0, %1, %2\n":"=r"(res):"r"(quantized_multiplier),"r"(out_shift_rl):);
out[j][k][i] = res;
}
}
}
}
/*
 * conv2_4bits - 2-D convolution issued through the custom neur_init,
 * nn_mac_4b and neur_res instructions.
 *
 * For every output element out[j][k][i] the function issues neur_init with
 * bias[i] and bias_shift_mode[i]; then, for each in-bounds tap of the
 * fil_dim[1] x fil_dim[2] window and each m in [0, fil_dim[3]), reads
 * inp[k1][k2][m] and issues two nn_mac_4b instructions pairing it with
 * fil[i][p][n][2*m] and fil[i][p][n][2*m+1]; finally it issues neur_res with
 * quantized_multiplier and out_shift_rl and stores the returned value.
 * Out-of-bounds taps issue no instruction.
 *
 * Parameters:
 *   in_dim               extents of inp; in_dim[0] / in_dim[1] bound k1 / k2.
 *   fil_dim              extents of fil; [1] and [2] are the window size,
 *                        [3] the number of input words per tap.
 *   out_dim              extents of out: [0] rows, [1] columns, [2] depth.
 *   inp                  input words.
 *   fil                  weight words, two per input word.
 *   bias                 one bias per output-depth index.
 *   out                  result array, written once per element.
 *   strides              step between consecutive windows.
 *   pad                  only pad[0] (row offset) and pad[2] (column offset)
 *                        are read.
 *   bias_shift_mode      per-output-depth operand passed to neur_init.
 *   quantized_multiplier operand passed to neur_res.
 *   out_shift_rl         operand passed to neur_res.
 *
 * NOTE(review): `res` is output-only in every asm statement, so accumulation
 * presumably happens in accelerator state - confirm against the ISA spec.
 */
void conv2_4bits(int in_dim[3], int fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]], const int fil[fil_dim[0]][fil_dim[1]][fil_dim[2]][fil_dim[3] << 1], const int bias[fil_dim[0]], int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[4], const int bias_shift_mode[], const int quantized_multiplier, const int out_shift_rl){
int i, j, k, m, n, p, res, k1, k2, str1, str2, w, in_cnn, bias_val;
for (i = 0; i < out_dim[2]; i++) { // output depth
// Row origin of the first window; advanced by `strides` for every later row.
str1 = -pad[0];
for (j = 0; j < out_dim[0]; j++) { // output height
if (j != 0) str1 += strides;
// Column origin of the first window in this row.
str2 = -pad[2];
for (k = 0; k < out_dim[1]; k++) { // output width
bias_val = bias[i];
// Begin a new output element.
asm volatile("neur_init %0, %1, %2\n":"=r"(res):"r"(bias_val),"r"(bias_shift_mode[i]):);
if (k != 0) str2 += strides;
for (p = 0; p < fil_dim[1]; p++) { // filters height
for (n = 0; n < fil_dim[2]; n++) { // filters width
// Input coordinate covered by filter tap (p, n).
k1 = str1 + p;
k2 = str2 + n;
// Skip taps that land in the padding region.
if (k1 < in_dim[0] && k1 >= 0 && k2 >= 0 && k2 < in_dim[1]) {
for (m = 0; m < fil_dim[3]; m++) { // filters depth
in_cnn = inp[k1][k2][m];
// One input word is paired with two consecutive weight words.
w = fil[i][p][n][2*m];
asm volatile("nn_mac_4b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
w = fil[i][p][n][2*m+1];
asm volatile("nn_mac_4b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
}
}
}
}
// Finish the element; only this instruction's result is stored.
asm volatile("neur_res %0, %1, %2\n":"=r"(res):"r"(quantized_multiplier),"r"(out_shift_rl):);
out[j][k][i] = res;
}
}
}
}
/*
 * conv2_4bits_1ch - single-input-word variant declared with the 4-bit filter
 * layout (last filter dimension fil_dim[3] << 1).
 *
 * For every output element out[j][k][i] the function issues neur_init with
 * bias[i] and bias_shift_mode[i], then for each in-bounds tap of the
 * fil_dim[1] x fil_dim[2] window reads inp[k1][k2][0] and issues two MAC
 * instructions pairing it with fil[i][p][n][0] and fil[i][p][n][1], and
 * finally issues neur_res with quantized_multiplier and out_shift_rl and
 * stores the returned value. There is no loop over fil_dim[3]. Out-of-bounds
 * taps issue no instruction.
 *
 * NOTE(review): the MAC instruction issued here is nn_mac_2b, whereas the
 * multi-channel 4-bit kernel conv2_4bits issues nn_mac_4b. This looks like a
 * copy-paste slip from the 2-bit kernels, but the instruction semantics are
 * not visible in this file - verify against the ISA spec before changing.
 *
 * Parameters:
 *   in_dim               extents of inp; in_dim[0] / in_dim[1] bound k1 / k2.
 *   fil_dim              extents of fil; [1] and [2] are the window size.
 *   out_dim              extents of out: [0] rows, [1] columns, [2] depth.
 *   inp, fil, bias       input words, weight words, per-depth bias.
 *   out                  result array, written once per element.
 *   strides              step between consecutive windows.
 *   pad                  only pad[0] (row offset) and pad[2] (column offset)
 *                        are read.
 *   bias_shift_mode      per-output-depth operand passed to neur_init.
 *   quantized_multiplier operand passed to neur_res.
 *   out_shift_rl         operand passed to neur_res.
 */
void conv2_4bits_1ch(int in_dim[3], int fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]], const int fil[fil_dim[0]][fil_dim[1]][fil_dim[2]][fil_dim[3] << 1], const int bias[fil_dim[0]], int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[4], const int bias_shift_mode[], const int quantized_multiplier, const int out_shift_rl){
int i, j, k, n, p, res, k1, k2, str1, str2, w, in_cnn, bias_val;
for (i = 0; i < out_dim[2]; i++) { // output depth
// Start one stride early: the row loop adds `strides` before first use.
str1 = -pad[0] - strides;
for (j = 0; j < out_dim[0]; j++) { // output height
str1 += strides;
str2 = -pad[2] - strides;
for (k = 0; k < out_dim[1]; k++) { // output width
bias_val = bias[i];
// Begin a new output element.
asm volatile("neur_init %0, %1, %2\n":"=r"(res):"r"(bias_val),"r"(bias_shift_mode[i]):);
str2 += strides;
for (p = 0; p < fil_dim[1]; p++) { // filters height
for (n = 0; n < fil_dim[2]; n++) { // filters width
// Input coordinate covered by filter tap (p, n).
k1 = str1 + p;
k2 = str2 + n;
// Skip taps that land in the padding region.
if (k1 < in_dim[0] && k1 >= 0 && k2 >= 0 && k2 < in_dim[1]) {
// Single input word, paired with weight words 0 and 1.
in_cnn = inp[k1][k2][0];
w = fil[i][p][n][0];
// NOTE(review): nn_mac_2b in a 4-bit kernel - see the header comment.
asm volatile("nn_mac_2b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
w = fil[i][p][n][1];
asm volatile("nn_mac_2b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
}
}
}
// Finish the element; only this instruction's result is stored.
asm volatile("neur_res %0, %1, %2\n":"=r"(res):"r"(quantized_multiplier),"r"(out_shift_rl):);
out[j][k][i] = res;
}
}
}
}
/*
 * conv2_2bits - 2-D convolution issued through the custom neur_init,
 * nn_mac_2b and neur_res instructions.
 *
 * For every output element out[j][k][i] the function issues neur_init with
 * bias[i] and bias_shift_mode[i]; then, for each in-bounds tap of the
 * fil_dim[1] x fil_dim[2] window and each m in [0, fil_dim[3]), issues one
 * nn_mac_2b pairing inp[k1][k2][m] with fil[i][p][n][m]; finally it issues
 * neur_res with quantized_multiplier and out_shift_rl and stores the returned
 * value. Out-of-bounds taps issue no instruction.
 *
 * Parameters:
 *   in_dim               extents of inp; in_dim[0] / in_dim[1] bound k1 / k2.
 *   fil_dim              extents of fil; [1] and [2] are the window size,
 *                        [3] the number of input words per tap.
 *   out_dim              extents of out: [0] rows, [1] columns, [2] depth.
 *   inp                  input words.
 *   fil                  weight words, one per input word.
 *   bias                 one bias per output-depth index.
 *   out                  result array, written once per element.
 *   strides              step between consecutive windows.
 *   pad                  only pad[0] (row offset) and pad[2] (column offset)
 *                        are read.
 *   bias_shift_mode      per-output-depth operand passed to neur_init.
 *   quantized_multiplier operand passed to neur_res.
 *   out_shift_rl         operand passed to neur_res.
 *
 * NOTE(review): `res` is output-only in every asm statement, so accumulation
 * presumably happens in accelerator state - confirm against the ISA spec.
 */
void conv2_2bits(int in_dim[3], int fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]], const int fil[fil_dim[0]][fil_dim[1]][fil_dim[2]][fil_dim[3]], const int bias[fil_dim[0]], int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[4], const int bias_shift_mode[], const int quantized_multiplier, const int out_shift_rl){
int i, j, k, m, n, p, res, k1, k2, str1, str2, w, in_cnn, bias_val;
for (i = 0; i < out_dim[2]; i++) { // output depth
// Start one stride early: the row loop adds `strides` before first use.
str1 = -pad[0] - strides;
for (j = 0; j < out_dim[0]; j++) { // output height
str1 += strides;
str2 = -pad[2] - strides;
for (k = 0; k < out_dim[1]; k++) { // output width
bias_val = bias[i];
// Begin a new output element.
asm volatile("neur_init %0, %1, %2\n":"=r"(res):"r"(bias_val),"r"(bias_shift_mode[i]):);
str2 += strides;
for (p = 0; p < fil_dim[1]; p++) { // filters height
for (n = 0; n < fil_dim[2]; n++) { // filters width
// Input coordinate covered by filter tap (p, n).
k1 = str1 + p;
k2 = str2 + n;
// Skip taps that land in the padding region.
if (k1 < in_dim[0] && k1 >= 0 && k2 >= 0 && k2 < in_dim[1]) {
for (m = 0; m < fil_dim[3]; m++) { // filters depth
// One weight word per input word.
in_cnn = inp[k1][k2][m];
w = fil[i][p][n][m];
asm volatile("nn_mac_2b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
}
}
}
}
// Finish the element; only this instruction's result is stored.
asm volatile("neur_res %0, %1, %2\n":"=r"(res):"r"(quantized_multiplier),"r"(out_shift_rl):);
out[j][k][i] = res;
}
}
}
}
/*
 * conv2_2bits_1ch - nn_mac_2b convolution for inputs and filters whose last
 * dimension is declared with extent 1.
 *
 * For every output element out[j][k][i] the function issues neur_init with
 * bias[i] and bias_shift_mode[i], then for each in-bounds tap of the
 * fil_dim[1] x fil_dim[2] window issues one nn_mac_2b pairing inp[k1][k2][0]
 * with fil[i][p][n][0], and finally issues neur_res with
 * quantized_multiplier and out_shift_rl and stores the returned value.
 * Out-of-bounds taps issue no instruction.
 *
 * Parameters:
 *   in_dim               in_dim[0] / in_dim[1] bound k1 / k2; in_dim[2] is
 *                        not read.
 *   fil_dim              [1] and [2] are the window size; [3] is not read.
 *   out_dim              extents of out: [0] rows, [1] columns, [2] depth.
 *   inp, fil, bias       input words, weight words, per-depth bias.
 *   out                  result array, written once per element.
 *   strides              step between consecutive windows.
 *   pad                  only pad[0] (row offset) and pad[2] (column offset)
 *                        are read.
 *   bias_shift_mode      per-output-depth operand passed to neur_init.
 *   quantized_multiplier operand passed to neur_res.
 *   out_shift_rl         operand passed to neur_res.
 *
 * NOTE(review): `res` is output-only in every asm statement, so accumulation
 * presumably happens in accelerator state - confirm against the ISA spec.
 */
void conv2_2bits_1ch(int in_dim[3], int fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][1], const int fil[fil_dim[0]][fil_dim[1]][fil_dim[2]][1], const int bias[fil_dim[0]], int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[4], const int bias_shift_mode[], const int quantized_multiplier, const int out_shift_rl){
int i, j, k, n, p, res, k1, k2, str1, str2, w, in_cnn, bias_val;
for (i = 0; i < out_dim[2]; i++) { // output depth
// Start one stride early: the row loop adds `strides` before first use.
str1 = -pad[0] -strides;
for (j = 0; j < out_dim[0]; j++) { // output height
str1 += strides;
str2 = -pad[2] - strides;
for (k = 0; k < out_dim[1]; k++) { // output width
bias_val = bias[i];
// Begin a new output element.
asm volatile("neur_init %0, %1, %2\n":"=r"(res):"r"(bias_val),"r"(bias_shift_mode[i]):);
str2 += strides;
for (p = 0; p < fil_dim[1]; p++) { // filters height
for (n = 0; n < fil_dim[2]; n++) { // filters width
// Input coordinate covered by filter tap (p, n).
k1 = str1 + p;
k2 = str2 + n;
// Skip taps that land in the padding region.
if (k1 < in_dim[0] && k1 >= 0 && k2 >= 0 && k2 < in_dim[1]) {
// Exactly one input word and one weight word per tap.
in_cnn = inp[k1][k2][0];
w = fil[i][p][n][0];
asm volatile("nn_mac_2b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
}
}
}
// Finish the element; only this instruction's result is stored.
asm volatile("neur_res %0, %1, %2\n":"=r"(res):"r"(quantized_multiplier),"r"(out_shift_rl):);
out[j][k][i] = res;
}
}
}
}
/*
 * maxpool2_compressed - max pooling over words that each carry four 8-bit
 * lanes.
 *
 * Every 32-bit input word is treated as four independent byte lanes
 * (bits 31-24, 23-16, 15-8 and 7-0). For each output position the function
 * scans a pool_size x pool_size window whose top-left corner is
 * (row * strides, col * strides), keeps the unsigned maximum of every lane
 * separately, and stores the four maxima recombined into one word. Window
 * positions outside the input are ignored; a window with no valid position
 * yields 0.
 *
 * Parameters:
 *   in_dim    extents of inp (rows, columns, words per pixel).
 *   out_dim   extents of out (rows, columns, words per pixel).
 *   inp       packed input words.
 *   out       packed output words, written once per element.
 *   pool_size side length of the square pooling window.
 *   strides   step between consecutive windows in both directions.
 */
void maxpool2_compressed(int in_dim[3], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]], int out[out_dim[0]][out_dim[1]][out_dim[2]], int pool_size, int strides) {
    /* Masks stay in place, so lanes are compared without shifting. */
    static const uint32_t lane_mask[4] = {
        0xFF000000u, 0x00FF0000u, 0x0000FF00u, 0x000000FFu
    };

    for (int word = 0; word < out_dim[2]; word++) {
        for (int out_row = 0; out_row < out_dim[0]; out_row++) {
            const int row0 = out_row * strides;
            for (int out_col = 0; out_col < out_dim[1]; out_col++) {
                const int col0 = out_col * strides;
                uint32_t best[4] = {0u, 0u, 0u, 0u};

                for (int dy = 0; dy < pool_size; dy++) {
                    const int y = row0 + dy;
                    if (y < 0 || y >= in_dim[0]) {
                        continue;
                    }
                    for (int dx = 0; dx < pool_size; dx++) {
                        const int x = col0 + dx;
                        if (x < 0 || x >= in_dim[1]) {
                            continue;
                        }
                        const uint32_t packed = (uint32_t)inp[y][x][word];
                        for (int lane = 0; lane < 4; lane++) {
                            const uint32_t candidate = packed & lane_mask[lane];
                            if (candidate > best[lane]) {
                                best[lane] = candidate;
                            }
                        }
                    }
                }

                out[out_row][out_col][word] = best[0] | best[1] | best[2] | best[3];
            }
        }
    }
}
/*
 * avgpool2_compressed - average pooling over words that each carry four
 * 8-bit lanes.
 *
 * Every 32-bit input word is treated as four independent unsigned byte lanes
 * (bits 31-24, 23-16, 15-8 and 7-0). For each output position the function
 * sums every lane over a pool_size x pool_size window whose top-left corner
 * is (row * strides, col * strides), divides each sum by
 * pool_size * pool_size (integer division, truncating) and stores the four
 * averages recombined into one word.
 *
 * Window positions outside the input are skipped but the divisor is always
 * the full window area, so border windows are averaged as if the missing
 * cells were zero.
 *
 * Lane arithmetic and packing are done in uint32_t: shifting a signed int
 * holding 128..255 left by 24 is undefined behaviour in C. A pool_size of 0
 * produces an all-zero output instead of dividing by zero.
 *
 * Parameters:
 *   in_dim    extents of inp (rows, columns, words per pixel).
 *   out_dim   extents of out (rows, columns, words per pixel).
 *   inp       packed input words.
 *   out       packed output words, written once per element.
 *   pool_size side length of the square pooling window.
 *   strides   step between consecutive windows in both directions.
 */
void avgpool2_compressed(int in_dim[3], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]], int out[out_dim[0]][out_dim[1]][out_dim[2]], int pool_size, int strides) {
    const int window_area = pool_size * pool_size;

    for (int d = 0; d < out_dim[2]; d++) {
        for (int i = 0; i < out_dim[0]; i++) {
            const int row0 = i * strides;
            for (int j = 0; j < out_dim[1]; j++) {
                const int col0 = j * strides;
                uint32_t sum[4] = {0u, 0u, 0u, 0u};

                for (int m = 0; m < pool_size; m++) {
                    const int k1 = row0 + m;
                    if (k1 < 0 || k1 >= in_dim[0]) {
                        continue;
                    }
                    for (int n = 0; n < pool_size; n++) {
                        const int k2 = col0 + n;
                        if (k2 < 0 || k2 >= in_dim[1]) {
                            continue;
                        }
                        const uint32_t word = (uint32_t)inp[k1][k2][d];
                        sum[0] += word >> 24;
                        sum[1] += (word >> 16) & 0xFFu;
                        sum[2] += (word >> 8) & 0xFFu;
                        sum[3] += word & 0xFFu;
                    }
                }

                uint32_t packed = 0u;
                if (window_area != 0) {
                    const uint32_t divisor = (uint32_t)window_area;
                    /* Each average is at most 255, so lanes cannot overlap. */
                    packed = ((sum[0] / divisor) << 24)
                           | ((sum[1] / divisor) << 16)
                           | ((sum[2] / divisor) << 8)
                           | (sum[3] / divisor);
                }
                out[i][j][d] = (int)packed;
            }
        }
    }
}
/*
 * flatten - reorder a packed 3-D tensor into a packed 1-D vector.
 *
 * Every input word holds four 8-bit lanes (bits 31-24 first), so the tensor
 * has 4 * in_dim[2] byte channels. The bytes are emitted channel by channel;
 * within a channel they follow row-major order (in_dim[0] outer, in_dim[1]
 * inner). Every four consecutive bytes of that stream are packed, first byte
 * in the top lane, into one word of `out`.
 *
 * The stream is produced in a single pass, so no temporary arrays are placed
 * on the stack, and packing is done in uint32_t because left-shifting a
 * signed int holding 128..255 by 24 is undefined behaviour in C.
 *
 * Parameters:
 *   in_dim  extents of inp (rows, columns, words per pixel).
 *   inp     packed input words.
 *   out     destination; must have room for
 *           in_dim[0] * in_dim[1] * in_dim[2] words.
 */
void flatten(int in_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]], int out[]){
    const int rows = in_dim[0];
    const int cols = in_dim[1];
    const int byte_channels = 4 * in_dim[2];

    int out_index = 0;
    int lanes_filled = 0;
    uint32_t packed = 0u;

    for (int channel = 0; channel < byte_channels; channel++) {
        const int word = channel >> 2;              /* which input word   */
        const int shift = 8 * (3 - (channel & 3));  /* which lane inside  */

        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < cols; c++) {
                const uint32_t byte = ((uint32_t)inp[r][c][word] >> shift) & 0xFFu;

                packed = (packed << 8) | byte;
                if (++lanes_filled == 4) {
                    out[out_index++] = (int)packed;
                    packed = 0u;
                    lanes_filled = 0;
                }
            }
        }
    }
}
#endif /* CONV2D_OPT_H */