Adding new features

2024-07-23 13:00:49 +03:00
parent 9e044fd7fc
commit 745cc4ed6d
28 changed files with 33632 additions and 106 deletions
@@ -0,0 +1,15 @@
 # Copyright lowRISC contributors.
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Generate a baremetal application
 # Name of the program $(PROGRAM).c will be added as a source file
 PROGRAM = cifar10_dws_cnn
 PROGRAM_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
 # Any extra source files to include in the build. Use the upper case .S
 # extension for assembly files
 EXTRA_SRCS :=
 include ${PROGRAM_DIR}/../../common/common.mk
@@ -0,0 +1,298 @@
 #include "simple_system_common.h"
 #include "cnn_weights.h"
 #include "fully_connected_opt.h"
 #include "ibex_cnn_params.h"
 #include "ibex_inputs.h"
 #include "conv2d_opt.h"
 #include "dws_conv_opt.h"
 /*
  * Compile-time configuration of the optimized CIFAR-10 depthwise-separable
  * network driven by cifar10_dws_cnn().
  * NOTE(review): depths here are smaller than in the non-optimized variant
  * (e.g. 16 vs 64); presumably several 8-bit values are packed per int --
  * TODO confirm against the *_opt kernels.
  */
 /* Input is IMG_SZ x IMG_SZ with NUM_FIL0 ints per pixel. */
 #define IMG_SZ 32
 #define NUM_FIL0 1
 /* Kernel edge length of the n-th convolution (3 and 1 alternate). */
 #define FILTER1 3
 #define FILTER2 1
 #define FILTER3 3
 #define FILTER4 1
 #define FILTER5 3
 #define FILTER6 1
 #define FILTER7 3
 #define FILTER8 1
 #define FILTER9 3
 #define FILTER10 1
 #define FILTER11 3
 #define FILTER12 1
 /* Output depth of the n-th convolution (used as doutN and in f_dimN). */
 #define NUM_FIL1 1
 #define NUM_FIL2 16
 #define NUM_FIL3 16
 #define NUM_FIL4 16
 #define NUM_FIL5 16
 #define NUM_FIL6 32
 #define NUM_FIL7 32
 #define NUM_FIL8 32
 #define NUM_FIL9 32
 #define NUM_FIL10 64
 #define NUM_FIL11 64
 #define NUM_FIL12 64
 /* Stride of the n-th convolution. */
 #define STRIDE1 1
 #define STRIDE2 1
 #define STRIDE3 1
 #define STRIDE4 1
 #define STRIDE5 1
 #define STRIDE6 1
 #define STRIDE7 1
 #define STRIDE8 1
 #define STRIDE9 1
 #define STRIDE10 1
 #define STRIDE11 1
 #define STRIDE12 1
 /*
  * Padding of the n-th convolution: PAD_TBn enters the output-height
  * computation, PAD_LRn the output-width computation.
  */
 #define PAD_TB1 1
 #define PAD_LR1 1
 #define PAD_TB2 0
 #define PAD_LR2 0
 #define PAD_TB3 1
 #define PAD_LR3 1
 #define PAD_TB4 0
 #define PAD_LR4 0
 #define PAD_TB5 1
 #define PAD_LR5 1
 #define PAD_TB6 0
 #define PAD_LR6 0
 #define PAD_TB7 1
 #define PAD_LR7 1
 #define PAD_TB8 0
 #define PAD_LR8 0
 #define PAD_TB9 1
 #define PAD_LR9 1
 #define PAD_TB10 0
 #define PAD_LR10 0
 #define PAD_TB11 1
 #define PAD_LR11 1
 #define PAD_TB12 0
 #define PAD_LR12 0
 /* Window size and stride handed to the three max-pooling calls. */
 #define POOL_STRIDE1 2
 #define POOL_SIZE1 2
 #define POOL_STRIDE2 2
 #define POOL_SIZE2 2
 #define POOL_STRIDE3 2
 #define POOL_SIZE3 2
 /* Number of ints in the final output vector. */
 #define OUT_DIM 3
 /* Number of input samples processed by cifar10_dws_cnn(). */
 #define SAMPLES 1
 /* NOTE(review): not referenced by the code visible in this file. */
 int outs[SAMPLES][OUT_DIM];
 void cifar10_dws_cnn() {
 	int dout1 = NUM_FIL1;
 	int hout1 = ((IMG_SZ - FILTER1 + 2 * PAD_TB1)/STRIDE1) + 1;
 	int wout1 = ((IMG_SZ - FILTER1 + 2 * PAD_LR1)/STRIDE1) + 1;
 	int dout2 = NUM_FIL2;
 	int hout2 = ((hout1 - FILTER2+ 2 * PAD_TB2)/STRIDE2)+1;
 	int wout2 = ((wout1 - FILTER2+ 2 * PAD_LR2)/STRIDE2)+1;
 	int dout3 = NUM_FIL3;
 	int hout3 = ((hout2 - FILTER3+ 2 * PAD_TB3)/STRIDE3)+1;
 	int wout3 = ((wout2 - FILTER3+ 2 * PAD_LR3)/STRIDE3)+1;
 	int dout4 = NUM_FIL4;
 	int hout4 = ((hout3 - FILTER4+ 2 * PAD_TB4)/STRIDE4)+1;
 	int wout4 = ((wout3 - FILTER4+ 2 * PAD_LR4)/STRIDE4)+1;
 	int dout5 = dout4;
 	int hout5 = hout4/POOL_STRIDE1;
 	int wout5 = wout4/POOL_STRIDE1;
 	int dout6 = NUM_FIL5;
 	int hout6 = ((hout5 - FILTER5+ 2 * PAD_TB5)/STRIDE5)+1;
 	int wout6 = ((wout5 - FILTER5+ 2 * PAD_LR5)/STRIDE5)+1;
 	int dout7 = NUM_FIL6;
 	int hout7 = ((hout6 - FILTER6+ 2 * PAD_TB6)/STRIDE6)+1;
 	int wout7 = ((wout6 - FILTER6+ 2 * PAD_LR6)/STRIDE6)+1;
 	int dout8 = NUM_FIL7;
 	int hout8 = ((hout7 - FILTER7+ 2 * PAD_TB7)/STRIDE7)+1;
 	int wout8 = ((wout7 - FILTER7+ 2 * PAD_LR7)/STRIDE7)+1;
 	int dout9 = NUM_FIL8;
 	int hout9 = ((hout8 - FILTER8+ 2 * PAD_TB8)/STRIDE8)+1;
 	int wout9 = ((wout8 - FILTER8+ 2 * PAD_LR8)/STRIDE8)+1;
 	int dout10 = dout9;
 	int hout10 = hout9/POOL_STRIDE2;
 	int wout10 = wout9/POOL_STRIDE2;
 	int dout11 = NUM_FIL9;
 	int hout11 = ((hout10 - FILTER9+ 2 * PAD_TB9)/STRIDE9)+1;
 	int wout11 = ((wout10 - FILTER9+ 2 * PAD_LR9)/STRIDE9)+1;
 	int dout12 = NUM_FIL10;
 	int hout12 = ((hout11 - FILTER10+ 2 * PAD_TB10)/STRIDE10)+1;
 	int wout12 = ((wout11 - FILTER10+ 2 * PAD_LR10)/STRIDE10)+1;
 	int dout13 = NUM_FIL11;
 	int hout13 = ((hout12 - FILTER11+ 2 * PAD_TB11)/STRIDE11)+1;
 	int wout13 = ((wout12 - FILTER11+ 2 * PAD_LR11)/STRIDE11)+1;
 	int dout14 = NUM_FIL12;
 	int hout14 = ((hout13 - FILTER12+ 2 * PAD_TB12)/STRIDE12)+1;
 	int wout14 = ((wout13 - FILTER12+ 2 * PAD_LR12)/STRIDE12)+1;
 	int dout15 = dout14;
 	int hout15 = hout14/POOL_STRIDE3;
 	int wout15 = wout14/POOL_STRIDE3;
 	int flatten_dim = dout15 * hout15 * wout15;
 	int in[IMG_SZ][IMG_SZ][NUM_FIL0];
 	int inp_dim[3] = {IMG_SZ, IMG_SZ, NUM_FIL0};
 	int out1[hout1][wout1][dout1];
 	int pad_1[4] = {PAD_TB1, PAD_TB1, PAD_LR1, PAD_LR1};
 	int outp_dim1[3] = {hout1, wout1, dout1};
 	int f_dim1[4] = {NUM_FIL1, FILTER1, FILTER1, NUM_FIL0};
 	int out2[hout2][wout2][dout2];
 	int pad_2[4] = {PAD_TB2, PAD_TB2, PAD_LR2, PAD_LR2};
 	int outp_dim2[3] = {hout2, wout2, dout2};
 	int f_dim2[4] = {NUM_FIL2, FILTER2, FILTER2, NUM_FIL1};
 	int out3[hout3][wout3][dout3];
 	int pad_3[4] = {PAD_TB3, PAD_TB3, PAD_LR3, PAD_LR3};
 	int outp_dim3[3] = {hout3, wout3, dout3};
 	int f_dim3[4] = {NUM_FIL3, FILTER3, FILTER3, NUM_FIL2};
 	int out4[hout4][wout4][dout4];
 	int pad_4[4] = {PAD_TB4, PAD_TB4, PAD_LR4, PAD_LR4};
 	int outp_dim4[3] = {hout4, wout4, dout4};
 	int f_dim4[4] = {NUM_FIL4, FILTER4, FILTER4, NUM_FIL3};
 	int out5[hout5][wout5][dout5];
 	int outp_dim5[3] = {hout5, wout5, dout5};
 	int out6[hout6][wout6][dout6];
 	int pad_6[4] = {PAD_TB5, PAD_TB5, PAD_LR5, PAD_LR5};
 	int outp_dim6[3] = {hout6, wout6, dout6};
 	int f_dim6[4] = {NUM_FIL5, FILTER5, FILTER5, NUM_FIL4};
 	int out7[hout7][wout7][dout7];
 	int pad_7[4] = {PAD_TB6, PAD_TB6, PAD_LR6, PAD_LR6};
 	int outp_dim7[3] = {hout7, wout7, dout7};
 	int f_dim7[4] = {NUM_FIL6, FILTER6, FILTER6, NUM_FIL5};
 	int out8[hout8][wout8][dout8];
 	int pad_8[4] = {PAD_TB7, PAD_TB7, PAD_LR7, PAD_LR7};
 	int outp_dim8[3] = {hout8, wout8, dout8};
 	int f_dim8[4] = {NUM_FIL7, FILTER7, FILTER7, NUM_FIL6};
 	int out9[hout9][wout9][dout9];
 	int pad_9[4] = {PAD_TB8, PAD_TB8, PAD_LR8, PAD_LR8};
 	int outp_dim9[3] = {hout9, wout9, dout9};
 	int f_dim9[4] = {NUM_FIL8, FILTER8, FILTER8, NUM_FIL7};
 	int out10[hout10][wout10][dout10];
 	int outp_dim10[3] = {hout10, wout10, dout10};
 	int out11[hout11][wout11][dout11];
 	int pad_11[4] = {PAD_TB9, PAD_TB9, PAD_LR9, PAD_LR9};
 	int outp_dim11[3] = {hout11, wout11, dout11};
 	int f_dim11[4] = {NUM_FIL9, FILTER9, FILTER9, NUM_FIL8};
 	int out12[hout12][wout12][dout12];
 	int pad_12[4] = {PAD_TB10, PAD_TB10, PAD_LR10, PAD_LR10};
 	int outp_dim12[3] = {hout12, wout12, dout12};
 	int f_dim12[4] = {NUM_FIL10, FILTER10, FILTER10, NUM_FIL9};
 	int out13[hout13][wout13][dout13];
 	int pad_13[4] = {PAD_TB11, PAD_TB11, PAD_LR11, PAD_LR11};
 	int outp_dim13[3] = {hout13, wout13, dout13};
 	int f_dim13[4] = {NUM_FIL11, FILTER11, FILTER11, NUM_FIL10};
 	int out14[hout14][wout14][dout14];
 	int pad_14[4] = {PAD_TB12, PAD_TB12, PAD_LR12, PAD_LR12};
 	int outp_dim14[3] = {hout14, wout14, dout14};
 	int f_dim14[4] = {NUM_FIL12, FILTER12, FILTER12, NUM_FIL11};
 	int out15[hout15][wout15][dout15];
 	int outp_dim15[3] = {hout15, wout15, dout15};
 	int out16[flatten_dim];
 	int out[OUT_DIM];
 	for (int iter = 0; iter < SAMPLES; iter++){
 		for(int i = 0; i < IMG_SZ; i++){
 			for(int j = 0; j < IMG_SZ; j++){
 				for(int k = 0; k < NUM_FIL0; k++){
 					in[i][j][k] = input[i][j][k][iter];
 				}
 			}
 		}
 		pcount_enable(1);
 		dw_conv_opt_1ch(inp_dim, f_dim1, outp_dim1, in, F1, B1, out1, STRIDE1, pad_1, SB1, MV1, SV1);
 		pw_conv_2bits(outp_dim1, f_dim2, outp_dim2, out1, F2, B2, out2, STRIDE2, pad_2, SB2, MV2, SV2);
 		dw_conv_opt(outp_dim2, f_dim3, outp_dim3, out2, F3, B3, out3, STRIDE3, pad_3, SB3, MV3, SV3);
 		pw_conv_8bits(outp_dim3, f_dim4, outp_dim4, out3, F4, B4, out4, STRIDE4, pad_4, SB4, MV4, SV4);
 		maxpool2_compressed(outp_dim4, outp_dim5, out4, out5, POOL_SIZE1, POOL_STRIDE1);
 		dw_conv_opt(outp_dim5, f_dim6, outp_dim6, out5, F5, B5, out6, STRIDE5, pad_6, SB5, MV5, SV5);
 		pw_conv_2bits(outp_dim6, f_dim7, outp_dim7, out6, F6, B6, out7, STRIDE6, pad_7, SB6, MV6, SV6);
 		dw_conv_opt(outp_dim7, f_dim8, outp_dim8, out7, F7, B7, out8, STRIDE7, pad_8, SB7, MV7, SV7);
 		pw_conv_8bits(outp_dim8, f_dim9, outp_dim9, out8, F8, B8, out9, STRIDE8, pad_9, SB8, MV8, SV8);
 		maxpool2_compressed(outp_dim9, outp_dim10, out9, out10, POOL_SIZE2, POOL_STRIDE2);
 		dw_conv_opt(outp_dim10, f_dim11, outp_dim11, out10, F9, B9, out11, STRIDE9, pad_11, SB9, MV9, SV9);
 		pw_conv_8bits(outp_dim11, f_dim12, outp_dim12, out11, F10, B10, out12, STRIDE10, pad_12, SB10, MV10, SV10);
 		dw_conv_opt(outp_dim12, f_dim13, outp_dim13, out12, F11, B11, out13, STRIDE11, pad_13, SB11, MV11, SV11);
 		pw_conv_8bits(outp_dim13, f_dim14, outp_dim14, out13, F12, B12, out14, STRIDE12, pad_14, SB12, MV12, SV12);
 		maxpool2_compressed(outp_dim14, outp_dim15, out14, out15, POOL_SIZE3, POOL_STRIDE3);
 		flatten(outp_dim15, out15, out16);
 		mlp_layer_8bits(out16, out, flatten_dim, OUT_DIM, W1, B13, SB13, MV13, SV13);
 		pcount_enable(0);
 		puts("Output Layer Values:\n");
 		for(int i = 0; i < OUT_DIM; i++) {
 			puthex((out[i] & 0xFF000000) >> 24);
 			puts(" ");
 			puthex((out[i] & 0xFF0000) >> 16);
 			puts(" ");
 			puthex((out[i] & 0xFF00) >> 8);
 			puts(" ");
 			puthex(out[i] & 0xFF);
 			puts("\n");
 		}
 	}
 }
 /*
  * Program entry point: make sure the performance counters are stopped,
  * run the network (which enables them around the layer calls itself) and
  * return 0.
  */
 int main(void)
 {
 	pcount_enable(0);
 	cifar10_dws_cnn();
 	return 0;
 }
@@ -0,0 +1,84 @@
 #ifndef IBEX_CNN_PARAMS_H
 #define IBEX_CNN_PARAMS_H
 /*
  * MVn / SVn are passed as the last two arguments of the n-th layer call in
  * cifar10_dws_cnn() (n = 13 is the mlp_layer_8bits call).
  *
  * Each MVn is a single byte repeated in all four bytes of the word, e.g.
  * MV1 = 0x4B4B4B4B (75 in every byte).
  */
 #define MV1 1263225675
 #define MV2 1886417008
 #define MV3 1381126738
 #define MV4 1263225675
 #define MV5 1465341783
 #define MV6 1280068684
 #define MV7 1869573999
 #define MV8 1600085855
 #define MV9 1600085855
 #define MV10 1970632053
 #define MV11 1145324612
 #define MV12 1532713819
 #define MV13 1296911693
 /*
  * Each SVn equals s * 0x08102040 + 1 for a small integer s, e.g.
  * SV1 = 15 * 0x08102040 + 1 and SV2 = 7 * 0x08102040 + 1.
  * NOTE(review): SV10 and SV13 are larger than INT_MAX (2147483647), so they
  * do not fit in a 32-bit signed int -- confirm the parameter type of the
  * kernels that receive them.
  */
 #define SV1 2029118401
 #define SV2 946921921
 #define SV3 2029118401
 #define SV4 1893843841
 #define SV5 1893843841
 #define SV6 1082196481
 #define SV7 2029118401
 #define SV8 2029118401
 #define SV9 2029118401
 #define SV10 2164392961
 #define SV11 2029118401
 #define SV12 2029118401
 #define SV13 2840765761
 static const int SB1[1] = {
 	1
 };
 static const int SB2[16] = {
 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
 };
 static const int SB3[16] = {
 	135266305, 1048577, 1, 8257, 8193, 135274497, 135266369, 8193, 1, 8193, 65, 1, 134217729, 1, 1, 134225921
 };
 static const int SB4[16] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1048576, 0, 0, 0, 0
 };
 static const int SB5[16] = {
 	134234177, 136323073, 135282689, 136331393, 270549121, 136331329, 136331329, 136323201, 270540929, 270549121, 270540801, 270532737, 2105473, 8321, 2105345, 2113601
 };
 static const int SB6[32] = {
 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
 };
 static const int SB7[32] = {
 	402669825, 540041217, 537927937, 4194561, 272638209, 537952513, 540049665, 541098049, 404783361, 405831873, 2113793, 536879361, 403726593, 540049665, 271614209, 541089921, 272662721, 271614209, 406880513, 541081793, 272662785, 538992897, 272662785, 403726593, 540033281, 540049601, 1081537, 403726337, 269517057, 272646401, 3178625, 539001089
 };
 static const int SB8[32] = {
 	270565504, 536895744, 406880512, 541090048, 268435712, 406864128, 540049408, 541065216, 406872320, 541090048, 540049600, 405823552, 540041216, 4227264, 540049664, 271589632, 537952320, 4219008, 540033216, 540041408, 541090048, 540049600, 405823552, 405823680, 405823744, 541081856, 406880448, 402677888, 271606016, 138445056, 403726400, 405831680
 };
 static const int SB9[32] = {
 	677380417, 542146817, 806404417, 677421249, 677429569, 810598721, 677421185, 677429441, 408977665, 675332353, 536903937, 675283329, 675307905, 677429505, 811639105, 811639169, 809549953, 407945601, 676380929, 676380993, 810582273, 810598721, 677429569, 675299649, 541106433, 811630785, 675316097, 405848449, 811630913, 811630977, 806404225, 677421441
 };
 static const int SB10[64] = {
 	139501824, 676364608, 673227072, 810582336, 405840256, 408969536, 541114624, 810590528, 810557760, 675307776, 676331840, 811630848, 408994112, 676381056, 810598720, 537952576, 541114432, 674267392, 542162944, 677429568, 408985920, 677429504, 542155136, 676372864, 811639104, 407937344, 542146880, 811630976, 406896832, 675332416, 675316096, 674275712, 677421120, 810590528, 540066112, 408969536, 811647232, 407920960, 273727616, 677421312, 810582272, 676340096, 6332736, 671138176, 677421376, 677429568, 676372544, 540066176, 676372800, 536912192, 406872384, 676372800, 805347712, 810590464, 5284160, 274776448, 677413248, 541089984, 674283520, 541106560, 810598720, 137412992, 810598528, 811639168
 };
 static const int SB11[64] = {
 	810607041, 678486337, 945865089, 810607041, 673227201, 939565505, 946913729, 943767937, 946913729, 811647425, 944816449, 678478273, 811647361, 678478209, 812695937, 678453697, 943776065, 810598849, 944800129, 677437825, 678486401, 946913665, 946921793, 541114753, 945873345, 542163265, 544260417, 544244033, 939548801, 945865025, 678478145, 944824577, 812704129, 5300673, 946889089, 676389057, 941679041, 675340609, 809558465, 273735937, 678461889, 678478145, 812695873, 676381121, 678486465, 671138113, 810557825, 945856961, 944775489, 946921665, 946897345, 809533889, 812695937, 812687809, 812696001, 945865089, 676389249, 677413249, 945840449, 946913473, 943767937, 675332353, 676381121, 811647425
 };
 static const int SB12[64] = {
 	810598784, 811647296, 677404992, 809550144, 677429632, 811647296, 810582400, 675332480, 676381056, 810598720, 542163200, 543211840, 809533440, 673235328, 807444672, 675316096, 810582016, 541114560, 677396800, 810590528, 676381056, 138453376, 809550208, 810598784, 676372800, 810598784, 675332352, 542163328, 674242944, 677421440, 404799808, 542163328, 809542016, 809542016, 810598784, 139501952, 674283712, 541114752, 811622784, 676372672, 542155136, 543211904, 811639168, 811630912, 809542016, 676356480, 673218944, 811630976, 810598720, 810582208, 675307584, 810598784, 543203648, 542163264, 677404672, 811630784, 810590592, 810582400, 674275712, 810590528, 541098304, 675332416, 539001088, 811622784
 };
 static const int SB13[3] = {
 	273736128, 946913728, 675282944
 };
 #endif /* IBEX_CNN_PARAMS_H */
@@ -0,0 +1,15 @@
 # Copyright lowRISC contributors.
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Generate a baremetal application
 # Name of the program $(PROGRAM).c will be added as a source file
 PROGRAM = cifar10_dws_cnn
 PROGRAM_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
 # Any extra source files to include in the build. Use the upper case .S
 # extension for assembly files
 EXTRA_SRCS :=
 include ${PROGRAM_DIR}/../../common/common.mk
@@ -0,0 +1,292 @@
 #include "simple_system_common.h"
 #include "cnn_weights.h"
 #include "fully_connected.h"
 #include "ibex_cnn_params.h"
 #include "ibex_inputs.h"
 #include "conv2d.h"
 #include "dws_conv.h"
 /*
  * Compile-time configuration of the CIFAR-10 depthwise-separable network
  * driven by cifar10_dws_cnn().
  */
 /* Input is IMG_SZ x IMG_SZ with NUM_FIL0 ints per pixel. */
 #define IMG_SZ 32
 #define NUM_FIL0 3
 /* Kernel edge length of the n-th convolution (3 and 1 alternate). */
 #define FILTER1 3
 #define FILTER2 1
 #define FILTER3 3
 #define FILTER4 1
 #define FILTER5 3
 #define FILTER6 1
 #define FILTER7 3
 #define FILTER8 1
 #define FILTER9 3
 #define FILTER10 1
 #define FILTER11 3
 #define FILTER12 1
 /* Output depth of the n-th convolution (used as doutN and in f_dimN). */
 #define NUM_FIL1 3
 #define NUM_FIL2 64
 #define NUM_FIL3 64
 #define NUM_FIL4 64
 #define NUM_FIL5 64
 #define NUM_FIL6 128
 #define NUM_FIL7 128
 #define NUM_FIL8 128
 #define NUM_FIL9 128
 #define NUM_FIL10 256
 #define NUM_FIL11 256
 #define NUM_FIL12 256
 /* Stride of the n-th convolution. */
 #define STRIDE1 1
 #define STRIDE2 1
 #define STRIDE3 1
 #define STRIDE4 1
 #define STRIDE5 1
 #define STRIDE6 1
 #define STRIDE7 1
 #define STRIDE8 1
 #define STRIDE9 1
 #define STRIDE10 1
 #define STRIDE11 1
 #define STRIDE12 1
 /*
  * Padding of the n-th convolution: PAD_TBn enters the output-height
  * computation, PAD_LRn the output-width computation.
  */
 #define PAD_TB1 1
 #define PAD_LR1 1
 #define PAD_TB2 0
 #define PAD_LR2 0
 #define PAD_TB3 1
 #define PAD_LR3 1
 #define PAD_TB4 0
 #define PAD_LR4 0
 #define PAD_TB5 1
 #define PAD_LR5 1
 #define PAD_TB6 0
 #define PAD_LR6 0
 #define PAD_TB7 1
 #define PAD_LR7 1
 #define PAD_TB8 0
 #define PAD_LR8 0
 #define PAD_TB9 1
 #define PAD_LR9 1
 #define PAD_TB10 0
 #define PAD_LR10 0
 #define PAD_TB11 1
 #define PAD_LR11 1
 #define PAD_TB12 0
 #define PAD_LR12 0
 /* Window size and stride handed to the three maxpool2 calls. */
 #define POOL_STRIDE1 2
 #define POOL_SIZE1 2
 #define POOL_STRIDE2 2
 #define POOL_SIZE2 2
 #define POOL_STRIDE3 2
 #define POOL_SIZE3 2
 /* Number of ints in the final output vector. */
 #define OUT_DIM 10
 /* Number of input samples processed by cifar10_dws_cnn(). */
 #define SAMPLES 1
 /* NOTE(review): not referenced by the code visible in this file. */
 int outs[SAMPLES][OUT_DIM];
 void cifar10_dws_cnn() {
 	int dout1 = NUM_FIL1;
 	int hout1 = ((IMG_SZ - FILTER1 + 2 * PAD_TB1)/STRIDE1) + 1;
 	int wout1 = ((IMG_SZ - FILTER1 + 2 * PAD_LR1)/STRIDE1) + 1;
 	int dout2 = NUM_FIL2;
 	int hout2 = ((hout1 - FILTER2+ 2 * PAD_TB2)/STRIDE2)+1;
 	int wout2 = ((wout1 - FILTER2+ 2 * PAD_LR2)/STRIDE2)+1;
 	int dout3 = NUM_FIL3;
 	int hout3 = ((hout2 - FILTER3+ 2 * PAD_TB3)/STRIDE3)+1;
 	int wout3 = ((wout2 - FILTER3+ 2 * PAD_LR3)/STRIDE3)+1;
 	int dout4 = NUM_FIL4;
 	int hout4 = ((hout3 - FILTER4+ 2 * PAD_TB4)/STRIDE4)+1;
 	int wout4 = ((wout3 - FILTER4+ 2 * PAD_LR4)/STRIDE4)+1;
 	int dout5 = dout4;
 	int hout5 = hout4/POOL_STRIDE1;
 	int wout5 = wout4/POOL_STRIDE1;
 	int dout6 = NUM_FIL5;
 	int hout6 = ((hout5 - FILTER5+ 2 * PAD_TB5)/STRIDE5)+1;
 	int wout6 = ((wout5 - FILTER5+ 2 * PAD_LR5)/STRIDE5)+1;
 	int dout7 = NUM_FIL6;
 	int hout7 = ((hout6 - FILTER6+ 2 * PAD_TB6)/STRIDE6)+1;
 	int wout7 = ((wout6 - FILTER6+ 2 * PAD_LR6)/STRIDE6)+1;
 	int dout8 = NUM_FIL7;
 	int hout8 = ((hout7 - FILTER7+ 2 * PAD_TB7)/STRIDE7)+1;
 	int wout8 = ((wout7 - FILTER7+ 2 * PAD_LR7)/STRIDE7)+1;
 	int dout9 = NUM_FIL8;
 	int hout9 = ((hout8 - FILTER8+ 2 * PAD_TB8)/STRIDE8)+1;
 	int wout9 = ((wout8 - FILTER8+ 2 * PAD_LR8)/STRIDE8)+1;
 	int dout10 = dout9;
 	int hout10 = hout9/POOL_STRIDE2;
 	int wout10 = wout9/POOL_STRIDE2;
 	int dout11 = NUM_FIL9;
 	int hout11 = ((hout10 - FILTER9+ 2 * PAD_TB9)/STRIDE9)+1;
 	int wout11 = ((wout10 - FILTER9+ 2 * PAD_LR9)/STRIDE9)+1;
 	int dout12 = NUM_FIL10;
 	int hout12 = ((hout11 - FILTER10+ 2 * PAD_TB10)/STRIDE10)+1;
 	int wout12 = ((wout11 - FILTER10+ 2 * PAD_LR10)/STRIDE10)+1;
 	int dout13 = NUM_FIL11;
 	int hout13 = ((hout12 - FILTER11+ 2 * PAD_TB11)/STRIDE11)+1;
 	int wout13 = ((wout12 - FILTER11+ 2 * PAD_LR11)/STRIDE11)+1;
 	int dout14 = NUM_FIL12;
 	int hout14 = ((hout13 - FILTER12+ 2 * PAD_TB12)/STRIDE12)+1;
 	int wout14 = ((wout13 - FILTER12+ 2 * PAD_LR12)/STRIDE12)+1;
 	int dout15 = dout14;
 	int hout15 = hout14/POOL_STRIDE3;
 	int wout15 = wout14/POOL_STRIDE3;
 	int flatten_dim = dout15 * hout15 * wout15;
 	int in[IMG_SZ][IMG_SZ][NUM_FIL0];
 	int inp_dim[3] = {IMG_SZ, IMG_SZ, NUM_FIL0};
 	int out1[hout1][wout1][dout1];
 	int pad_1[4] = {PAD_TB1, PAD_TB1, PAD_LR1, PAD_LR1};
 	int outp_dim1[3] = {hout1, wout1, dout1};
 	int f_dim1[4] = {NUM_FIL1, FILTER1, FILTER1, NUM_FIL0};
 	int out2[hout2][wout2][dout2];
 	int pad_2[4] = {PAD_TB2, PAD_TB2, PAD_LR2, PAD_LR2};
 	int outp_dim2[3] = {hout2, wout2, dout2};
 	int f_dim2[4] = {NUM_FIL2, FILTER2, FILTER2, NUM_FIL1};
 	int out3[hout3][wout3][dout3];
 	int pad_3[4] = {PAD_TB3, PAD_TB3, PAD_LR3, PAD_LR3};
 	int outp_dim3[3] = {hout3, wout3, dout3};
 	int f_dim3[4] = {NUM_FIL3, FILTER3, FILTER3, NUM_FIL2};
 	int out4[hout4][wout4][dout4];
 	int pad_4[4] = {PAD_TB4, PAD_TB4, PAD_LR4, PAD_LR4};
 	int outp_dim4[3] = {hout4, wout4, dout4};
 	int f_dim4[4] = {NUM_FIL4, FILTER4, FILTER4, NUM_FIL3};
 	int out5[hout5][wout5][dout5];
 	int outp_dim5[3] = {hout5, wout5, dout5};
 	int out6[hout6][wout6][dout6];
 	int pad_6[4] = {PAD_TB5, PAD_TB5, PAD_LR5, PAD_LR5};
 	int outp_dim6[3] = {hout6, wout6, dout6};
 	int f_dim6[4] = {NUM_FIL5, FILTER5, FILTER5, NUM_FIL4};
 	int out7[hout7][wout7][dout7];
 	int pad_7[4] = {PAD_TB6, PAD_TB6, PAD_LR6, PAD_LR6};
 	int outp_dim7[3] = {hout7, wout7, dout7};
 	int f_dim7[4] = {NUM_FIL6, FILTER6, FILTER6, NUM_FIL5};
 	int out8[hout8][wout8][dout8];
 	int pad_8[4] = {PAD_TB7, PAD_TB7, PAD_LR7, PAD_LR7};
 	int outp_dim8[3] = {hout8, wout8, dout8};
 	int f_dim8[4] = {NUM_FIL7, FILTER7, FILTER7, NUM_FIL6};
 	int out9[hout9][wout9][dout9];
 	int pad_9[4] = {PAD_TB8, PAD_TB8, PAD_LR8, PAD_LR8};
 	int outp_dim9[3] = {hout9, wout9, dout9};
 	int f_dim9[4] = {NUM_FIL8, FILTER8, FILTER8, NUM_FIL7};
 	int out10[hout10][wout10][dout10];
 	int outp_dim10[3] = {hout10, wout10, dout10};
 	int out11[hout11][wout11][dout11];
 	int pad_11[4] = {PAD_TB9, PAD_TB9, PAD_LR9, PAD_LR9};
 	int outp_dim11[3] = {hout11, wout11, dout11};
 	int f_dim11[4] = {NUM_FIL9, FILTER9, FILTER9, NUM_FIL8};
 	int out12[hout12][wout12][dout12];
 	int pad_12[4] = {PAD_TB10, PAD_TB10, PAD_LR10, PAD_LR10};
 	int outp_dim12[3] = {hout12, wout12, dout12};
 	int f_dim12[4] = {NUM_FIL10, FILTER10, FILTER10, NUM_FIL9};
 	int out13[hout13][wout13][dout13];
 	int pad_13[4] = {PAD_TB11, PAD_TB11, PAD_LR11, PAD_LR11};
 	int outp_dim13[3] = {hout13, wout13, dout13};
 	int f_dim13[4] = {NUM_FIL11, FILTER11, FILTER11, NUM_FIL10};
 	int out14[hout14][wout14][dout14];
 	int pad_14[4] = {PAD_TB12, PAD_TB12, PAD_LR12, PAD_LR12};
 	int outp_dim14[3] = {hout14, wout14, dout14};
 	int f_dim14[4] = {NUM_FIL12, FILTER12, FILTER12, NUM_FIL11};
 	int out15[hout15][wout15][dout15];
 	int outp_dim15[3] = {hout15, wout15, dout15};
 	int out16[flatten_dim];
 	int out[OUT_DIM];
 	for (int iter = 0; iter < SAMPLES; iter++){
 		for(int i = 0; i < IMG_SZ; i++){
 			for(int j = 0; j < IMG_SZ; j++){
 				for(int k = 0; k < NUM_FIL0; k++){
 					in[i][j][k] = input[i][j][k][iter];
 				}
 			}
 		}
 		pcount_enable(1);
 		dw_conv(inp_dim, f_dim1, outp_dim1, in, F1, B1, out1, STRIDE1, pad_1, SB1, MV1, SV1);
 		pw_conv(outp_dim1, f_dim2, outp_dim2, out1, F2, B2, out2, STRIDE2, pad_2, SB2, MV2, SV2);
 		dw_conv(outp_dim2, f_dim3, outp_dim3, out2, F3, B3, out3, STRIDE3, pad_3, SB3, MV3, SV3);
 		pw_conv(outp_dim3, f_dim4, outp_dim4, out3, F4, B4, out4, STRIDE4, pad_4, SB4, MV4, SV4);
 		maxpool2(outp_dim4, outp_dim5, out4, out5, POOL_SIZE1, POOL_STRIDE1);
 		dw_conv(outp_dim5, f_dim6, outp_dim6, out5, F5, B5, out6, STRIDE5, pad_6, SB5, MV5, SV5);
 		pw_conv(outp_dim6, f_dim7, outp_dim7, out6, F6, B6, out7, STRIDE6, pad_7, SB6, MV6, SV6);
 		dw_conv(outp_dim7, f_dim8, outp_dim8, out7, F7, B7, out8, STRIDE7, pad_8, SB7, MV7, SV7);
 		pw_conv(outp_dim8, f_dim9, outp_dim9, out8, F8, B8, out9, STRIDE8, pad_9, SB8, MV8, SV8);
 		maxpool2(outp_dim9, outp_dim10, out9, out10, POOL_SIZE2, POOL_STRIDE2);
 		dw_conv(outp_dim10, f_dim11, outp_dim11, out10, F9, B9, out11, STRIDE9, pad_11, SB9, MV9, SV9);
 		pw_conv(outp_dim11, f_dim12, outp_dim12, out11, F10, B10, out12, STRIDE10, pad_12, SB10, MV10, SV10);
 		dw_conv(outp_dim12, f_dim13, outp_dim13, out12, F11, B11, out13, STRIDE11, pad_13, SB11, MV11, SV11);
 		pw_conv(outp_dim13, f_dim14, outp_dim14, out13, F12, B12, out14, STRIDE12, pad_14, SB12, MV12, SV12);
 		maxpool2(outp_dim14, outp_dim15, out14, out15, POOL_SIZE3, POOL_STRIDE3);
 		flatten(outp_dim15, out15, out16);
 		mlp_layer(out16, out, flatten_dim, OUT_DIM, W1, B13, SB13, MV13, SV13);
 		pcount_enable(0);
 		puts("Output Layer Values:\n");
 		for(int i = 0; i < OUT_DIM; i++) {
 			puthex(out[i]);
 			puts("\n");
 		}
 	}
 }
 /*
  * Program entry point: make sure the performance counters are stopped,
  * run the network (which enables them around the layer calls itself) and
  * return 0.
  */
 int main(void)
 {
 	pcount_enable(0);
 	cifar10_dws_cnn();
 	return 0;
 }
@@ -0,0 +1,46 @@
 #ifndef IBEX_CNN_PARAMS_H
 #define IBEX_CNN_PARAMS_H
 /*
  * Per-layer requantization constants; index n matches the n-th layer call
  * in cifar10_dws_cnn() (n = 13 is the mlp_layer call).
  *
  * MVn is passed as the multiplier argument. In pw_conv() (dws_conv.h) this
  * is `quantized_multiplier`; the other kernels are not visible here --
  * presumably they use it the same way.
  */
 #define MV1 75
 #define MV2 112
 #define MV3 82
 #define MV4 75
 #define MV5 87
 #define MV6 76
 #define MV7 111
 #define MV8 95
 #define MV9 95
 #define MV10 117
 #define MV11 68
 #define MV12 91
 #define MV13 77
 /* SVn is passed as the shift argument (`out_shift_rl` in pw_conv()). */
 #define SV1 15
 #define SV2 7
 #define SV3 15
 #define SV4 14
 #define SV5 14
 #define SV6 8
 #define SV7 15
 #define SV8 15
 #define SV9 15
 #define SV10 16
 #define SV11 15
 #define SV12 15
 #define SV13 21
 /*
  * SBn is passed as the `bias_shift_mode` argument; pw_conv() accepts it but
  * never reads it.
  */
 #define SB1 0
 #define SB2 0
 #define SB3 0
 #define SB4 0
 #define SB5 0
 #define SB6 0
 #define SB7 0
 #define SB8 0
 #define SB9 0
 #define SB10 0
 #define SB11 0
 #define SB12 0
 #define SB13 0
 #endif /* IBEX_CNN_PARAMS_H */
@@ -0,0 +1,15 @@
 # Copyright lowRISC contributors.
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Generate a baremetal application
 # Name of the program $(PROGRAM).c will be added as a source file
 PROGRAM = cmsis_cnn
 PROGRAM_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
 # Any extra source files to include in the build. Use the upper case .S
 # extension for assembly files
 EXTRA_SRCS :=
 include ${PROGRAM_DIR}/../../common/common.mk
@@ -0,0 +1,153 @@
 #include "simple_system_common.h"
 #include "cnn_weights.h"
 #include "fully_connected_opt.h"
 #include "ibex_cnn_params.h"
 #include "ibex_inputs.h"
 #include "conv2d_opt.h"
 /*
  * Compile-time configuration of the optimized three-convolution network
  * driven by cmsis_cnn().
  * NOTE(review): depths here are smaller than in the non-optimized variant
  * (e.g. 8 vs 32); presumably several 8-bit values are packed per int --
  * TODO confirm against the *_opt kernels.
  */
 /* Input is IMG_SZ x IMG_SZ with NUM_FIL0 ints per pixel. */
 #define IMG_SZ 32
 #define NUM_FIL0 1
 /* Kernel edge length of the n-th convolution. */
 #define FILTER1 5
 #define FILTER2 5
 #define FILTER3 5
 /* Output depth of the n-th convolution. */
 #define NUM_FIL1 8
 #define NUM_FIL2 8
 #define NUM_FIL3 16
 /* Stride of the n-th convolution. */
 #define STRIDE1 1
 #define STRIDE2 1
 #define STRIDE3 1
 /*
  * Padding of the n-th convolution: PAD_TBn enters the output-height
  * computation, PAD_LRn the output-width computation.
  */
 #define PAD_TB1 2
 #define PAD_LR1 2
 #define PAD_TB2 2
 #define PAD_LR2 2
 #define PAD_TB3 2
 #define PAD_LR3 2
 /* Window size and stride handed to the three max-pooling calls. */
 #define POOL_STRIDE1 2
 #define POOL_SIZE1 2
 #define POOL_STRIDE2 2
 #define POOL_SIZE2 2
 #define POOL_STRIDE3 2
 #define POOL_SIZE3 2
 /* Number of ints in the final output vector. */
 #define OUT_DIM 3
 /* Number of input samples processed by cmsis_cnn(). */
 #define SAMPLES 1
 /* NOTE(review): not referenced by the code visible in this file. */
 int outs[SAMPLES][OUT_DIM];
 /*
  * Run the optimized three-convolution network on each of the SAMPLES
  * entries of `input` and print the OUT_DIM output words with
  * puts()/puthex(), one word per line as four bytes, most significant first.
  *
  * Stages alternate convolution (odd) and max-pooling (even). Every stage N
  * owns an output buffer outN and a {height, width, depth} descriptor
  * outp_dimN; convolution stages also own pad_N and f_dimN. Only the layer
  * calls run between pcount_enable(1) and pcount_enable(0).
  */
 void cmsis_cnn() {
 	/* Network input. */
 	int in[IMG_SZ][IMG_SZ][NUM_FIL0];
 	int inp_dim[3] = {IMG_SZ, IMG_SZ, NUM_FIL0};

 	/* Stage 1: conv2_8bits_1ch, filter set 1. */
 	int hout1 = (IMG_SZ + 2 * PAD_TB1 - FILTER1) / STRIDE1 + 1;
 	int wout1 = (IMG_SZ + 2 * PAD_LR1 - FILTER1) / STRIDE1 + 1;
 	int dout1 = NUM_FIL1;
 	int out1[hout1][wout1][dout1];
 	int pad_1[4] = {PAD_TB1, PAD_TB1, PAD_LR1, PAD_LR1};
 	int outp_dim1[3] = {hout1, wout1, dout1};
 	int f_dim1[4] = {NUM_FIL1, FILTER1, FILTER1, NUM_FIL0};

 	/* Stage 2: first max-pool; depth is unchanged. */
 	int hout2 = hout1 / POOL_STRIDE1;
 	int wout2 = wout1 / POOL_STRIDE1;
 	int dout2 = dout1;
 	int out2[hout2][wout2][dout2];
 	int outp_dim2[3] = {hout2, wout2, dout2};

 	/* Stage 3: conv2_8bits, filter set 2. */
 	int hout3 = (hout2 + 2 * PAD_TB2 - FILTER2) / STRIDE2 + 1;
 	int wout3 = (wout2 + 2 * PAD_LR2 - FILTER2) / STRIDE2 + 1;
 	int dout3 = NUM_FIL2;
 	int out3[hout3][wout3][dout3];
 	int pad_3[4] = {PAD_TB2, PAD_TB2, PAD_LR2, PAD_LR2};
 	int outp_dim3[3] = {hout3, wout3, dout3};
 	int f_dim3[4] = {NUM_FIL2, FILTER2, FILTER2, NUM_FIL1};

 	/* Stage 4: second max-pool; depth is unchanged. */
 	int hout4 = hout3 / POOL_STRIDE2;
 	int wout4 = wout3 / POOL_STRIDE2;
 	int dout4 = dout3;
 	int out4[hout4][wout4][dout4];
 	int outp_dim4[3] = {hout4, wout4, dout4};

 	/* Stage 5: conv2_2bits, filter set 3. */
 	int hout5 = (hout4 + 2 * PAD_TB3 - FILTER3) / STRIDE3 + 1;
 	int wout5 = (wout4 + 2 * PAD_LR3 - FILTER3) / STRIDE3 + 1;
 	int dout5 = NUM_FIL3;
 	int out5[hout5][wout5][dout5];
 	int pad_5[4] = {PAD_TB3, PAD_TB3, PAD_LR3, PAD_LR3};
 	int outp_dim5[3] = {hout5, wout5, dout5};
 	int f_dim5[4] = {NUM_FIL3, FILTER3, FILTER3, NUM_FIL2};

 	/* Stage 6: third max-pool; depth is unchanged. */
 	int hout6 = hout5 / POOL_STRIDE3;
 	int wout6 = wout5 / POOL_STRIDE3;
 	int dout6 = dout5;
 	int out6[hout6][wout6][dout6];
 	int outp_dim6[3] = {hout6, wout6, dout6};

 	/* Flattened features and the final output vector. */
 	int flatten_dim = dout6 * hout6 * wout6;
 	int out7[flatten_dim];
 	int out[OUT_DIM];

 	for (int sample = 0; sample < SAMPLES; sample++) {
 		/* Pull the current sample out of `input` into the local buffer. */
 		for (int row = 0; row < IMG_SZ; row++)
 			for (int col = 0; col < IMG_SZ; col++)
 				for (int ch = 0; ch < NUM_FIL0; ch++)
 					in[row][col][ch] = input[row][col][ch][sample];

 		/* Measured region: the layer calls only. */
 		pcount_enable(1);
 		conv2_8bits_1ch(inp_dim, f_dim1, outp_dim1, in, F1, B1, out1, STRIDE1, pad_1, SB1, MV1, SV1);
 		maxpool2_compressed(outp_dim1, outp_dim2, out1, out2, POOL_SIZE1, POOL_STRIDE1);
 		conv2_8bits(outp_dim2, f_dim3, outp_dim3, out2, F2, B2, out3, STRIDE2, pad_3, SB2, MV2, SV2);
 		maxpool2_compressed(outp_dim3, outp_dim4, out3, out4, POOL_SIZE2, POOL_STRIDE2);
 		conv2_2bits(outp_dim4, f_dim5, outp_dim5, out4, F3, B3, out5, STRIDE3, pad_5, SB3, MV3, SV3);
 		maxpool2_compressed(outp_dim5, outp_dim6, out5, out6, POOL_SIZE3, POOL_STRIDE3);
 		flatten(outp_dim6, out6, out7);
 		mlp_layer_2bits(out7, out, flatten_dim, OUT_DIM, W1, B4, SB4, MV4, SV4);
 		pcount_enable(0);

 		puts("Output Layer Values:\n");
 		for (int idx = 0; idx < OUT_DIM; idx++) {
 			/* Four bytes per word, space separated, newline after the last. */
 			for (int shift = 24; shift >= 0; shift -= 8) {
 				puthex(((unsigned int)out[idx] >> shift) & 0xFF);
 				puts(shift > 0 ? " " : "\n");
 			}
 		}
 	}
 }
 /*
  * Program entry point: make sure the performance counters are stopped,
  * run the network (which enables them around the layer calls itself) and
  * return 0.
  */
 int main(void)
 {
 	pcount_enable(0);
 	cmsis_cnn();
 	return 0;
 }
@@ -0,0 +1,30 @@
 #ifndef IBEX_CNN_PARAMS_H
 #define IBEX_CNN_PARAMS_H
 /*
  * MVn / SVn are passed as the last two arguments of the n-th weighted layer
  * call in cmsis_cnn() (n = 4 is the mlp_layer_2bits call).
  *
  * Each MVn is a single byte repeated in all four bytes of the word, e.g.
  * MV1 = 0x74747474 (116 in every byte).
  */
 #define MV1 1953789044
 #define MV2 1229539657
 #define MV3 1212696648
 #define MV4 1330597711
 /*
  * Each SVn equals s * 0x08102040 + 1 for a small integer s, e.g.
  * SV1 = 16 * 0x08102040 + 1.
  * NOTE(review): SV1 and SV2 are larger than INT_MAX (2147483647), so they
  * do not fit in a 32-bit signed int -- confirm the parameter type of the
  * kernels that receive them.
  */
 #define SV1 2164392961
 #define SV2 2299667521
 #define SV3 1488020161
 #define SV4 1623294721
 static const int SB1[8] = {
 	812696004, 946880900, 1079034308, 946913796, 945865156, 1081139524, 946930052, 545309060
 };
 static const int SB2[8] = {
 	945873216, 945832320, 945865152, 944816576, 674283904, 543211776, 945873280, 944824704
 };
 static const int SB3[16] = {
 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
 };
 static const int SB4[3] = {
 	3, 3, 3
 };
 #endif /* IBEX_CNN_PARAMS_H */
@@ -0,0 +1,15 @@
 # Copyright lowRISC contributors.
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Generate a baremetal application
 # Name of the program $(PROGRAM).c will be added as a source file
 PROGRAM = cmsis_cnn
 PROGRAM_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
 # Any extra source files to include in the build. Use the upper case .S
 # extension for assembly files
 EXTRA_SRCS :=
 include ${PROGRAM_DIR}/../../common/common.mk
@@ -0,0 +1,147 @@
 #include "simple_system_common.h"
 #include "cnn_weights.h"
 #include "fully_connected.h"
 #include "ibex_cnn_params.h"
 #include "ibex_inputs.h"
 #include "conv2d.h"
 /*
  * Compile-time configuration of the three-convolution network driven by
  * cmsis_cnn().
  */
 /* Input is IMG_SZ x IMG_SZ with NUM_FIL0 ints per pixel. */
 #define IMG_SZ 32
 #define NUM_FIL0 3
 /* Kernel edge length of the n-th convolution. */
 #define FILTER1 5
 #define FILTER2 5
 #define FILTER3 5
 /* Output depth of the n-th convolution. */
 #define NUM_FIL1 32
 #define NUM_FIL2 32
 #define NUM_FIL3 64
 /* Stride of the n-th convolution. */
 #define STRIDE1 1
 #define STRIDE2 1
 #define STRIDE3 1
 /*
  * Padding of the n-th convolution: PAD_TBn enters the output-height
  * computation, PAD_LRn the output-width computation.
  */
 #define PAD_TB1 2
 #define PAD_LR1 2
 #define PAD_TB2 2
 #define PAD_LR2 2
 #define PAD_TB3 2
 #define PAD_LR3 2
 /* Window size and stride handed to the three maxpool2 calls. */
 #define POOL_STRIDE1 2
 #define POOL_SIZE1 2
 #define POOL_STRIDE2 2
 #define POOL_SIZE2 2
 #define POOL_STRIDE3 2
 #define POOL_SIZE3 2
 /* Number of ints in the final output vector. */
 #define OUT_DIM 10
 /* Number of input samples processed by cmsis_cnn(). */
 #define SAMPLES 1
 /* NOTE(review): not referenced by the code visible in this file. */
 int outs[SAMPLES][OUT_DIM];
 /*
  * Run the three-convolution network on each of the SAMPLES entries of
  * `input` and print the OUT_DIM output values with puts()/puthex(), one
  * per line.
  *
  * Stages alternate convolution (odd) and max-pooling (even). Every stage N
  * owns an output buffer outN and a {height, width, depth} descriptor
  * outp_dimN; convolution stages also own pad_N and f_dimN. Only the layer
  * calls run between pcount_enable(1) and pcount_enable(0).
  */
 void cmsis_cnn() {
 	/* Network input. */
 	int in[IMG_SZ][IMG_SZ][NUM_FIL0];
 	int inp_dim[3] = {IMG_SZ, IMG_SZ, NUM_FIL0};

 	/* Stage 1: conv2, filter set 1. */
 	int hout1 = (IMG_SZ + 2 * PAD_TB1 - FILTER1) / STRIDE1 + 1;
 	int wout1 = (IMG_SZ + 2 * PAD_LR1 - FILTER1) / STRIDE1 + 1;
 	int dout1 = NUM_FIL1;
 	int out1[hout1][wout1][dout1];
 	int pad_1[4] = {PAD_TB1, PAD_TB1, PAD_LR1, PAD_LR1};
 	int outp_dim1[3] = {hout1, wout1, dout1};
 	int f_dim1[4] = {NUM_FIL1, FILTER1, FILTER1, NUM_FIL0};

 	/* Stage 2: first max-pool; depth is unchanged. */
 	int hout2 = hout1 / POOL_STRIDE1;
 	int wout2 = wout1 / POOL_STRIDE1;
 	int dout2 = dout1;
 	int out2[hout2][wout2][dout2];
 	int outp_dim2[3] = {hout2, wout2, dout2};

 	/* Stage 3: conv2, filter set 2. */
 	int hout3 = (hout2 + 2 * PAD_TB2 - FILTER2) / STRIDE2 + 1;
 	int wout3 = (wout2 + 2 * PAD_LR2 - FILTER2) / STRIDE2 + 1;
 	int dout3 = NUM_FIL2;
 	int out3[hout3][wout3][dout3];
 	int pad_3[4] = {PAD_TB2, PAD_TB2, PAD_LR2, PAD_LR2};
 	int outp_dim3[3] = {hout3, wout3, dout3};
 	int f_dim3[4] = {NUM_FIL2, FILTER2, FILTER2, NUM_FIL1};

 	/* Stage 4: second max-pool; depth is unchanged. */
 	int hout4 = hout3 / POOL_STRIDE2;
 	int wout4 = wout3 / POOL_STRIDE2;
 	int dout4 = dout3;
 	int out4[hout4][wout4][dout4];
 	int outp_dim4[3] = {hout4, wout4, dout4};

 	/* Stage 5: conv2, filter set 3. */
 	int hout5 = (hout4 + 2 * PAD_TB3 - FILTER3) / STRIDE3 + 1;
 	int wout5 = (wout4 + 2 * PAD_LR3 - FILTER3) / STRIDE3 + 1;
 	int dout5 = NUM_FIL3;
 	int out5[hout5][wout5][dout5];
 	int pad_5[4] = {PAD_TB3, PAD_TB3, PAD_LR3, PAD_LR3};
 	int outp_dim5[3] = {hout5, wout5, dout5};
 	int f_dim5[4] = {NUM_FIL3, FILTER3, FILTER3, NUM_FIL2};

 	/* Stage 6: third max-pool; depth is unchanged. */
 	int hout6 = hout5 / POOL_STRIDE3;
 	int wout6 = wout5 / POOL_STRIDE3;
 	int dout6 = dout5;
 	int out6[hout6][wout6][dout6];
 	int outp_dim6[3] = {hout6, wout6, dout6};

 	/* Flattened features and the final output vector. */
 	int flatten_dim = dout6 * hout6 * wout6;
 	int out7[flatten_dim];
 	int out[OUT_DIM];

 	for (int sample = 0; sample < SAMPLES; sample++) {
 		/* Pull the current sample out of `input` into the local buffer. */
 		for (int row = 0; row < IMG_SZ; row++)
 			for (int col = 0; col < IMG_SZ; col++)
 				for (int ch = 0; ch < NUM_FIL0; ch++)
 					in[row][col][ch] = input[row][col][ch][sample];

 		/* Measured region: the layer calls only. */
 		pcount_enable(1);
 		conv2(inp_dim, f_dim1, outp_dim1, in, F1, B1, out1, STRIDE1, pad_1, SB1, MV1, SV1);
 		maxpool2(outp_dim1, outp_dim2, out1, out2, POOL_SIZE1, POOL_STRIDE1);
 		conv2(outp_dim2, f_dim3, outp_dim3, out2, F2, B2, out3, STRIDE2, pad_3, SB2, MV2, SV2);
 		maxpool2(outp_dim3, outp_dim4, out3, out4, POOL_SIZE2, POOL_STRIDE2);
 		conv2(outp_dim4, f_dim5, outp_dim5, out4, F3, B3, out5, STRIDE3, pad_5, SB3, MV3, SV3);
 		maxpool2(outp_dim5, outp_dim6, out5, out6, POOL_SIZE3, POOL_STRIDE3);
 		flatten(outp_dim6, out6, out7);
 		mlp_layer(out7, out, flatten_dim, OUT_DIM, W1, B4, SB4, MV4, SV4);
 		pcount_enable(0);

 		puts("Output Layer Values:\n");
 		for (int idx = 0; idx < OUT_DIM; idx++) {
 			puthex(out[idx]);
 			puts("\n");
 		}
 	}
 }
 /*
  * Program entry point: make sure the performance counters are stopped,
  * run the network (which enables them around the layer calls itself) and
  * return 0.
  */
 int main(void)
 {
 	pcount_enable(0);
 	cmsis_cnn();
 	return 0;
 }
@@ -0,0 +1,19 @@
 #ifndef IBEX_CNN_PARAMS_H
 #define IBEX_CNN_PARAMS_H
 /*
  * Per-layer integer constants for the network; the numeric suffix is the
  * layer index (1..4). The application passes them as the trailing
  * (SBn, MVn, SVn) triple of each layer call, e.g.
  * conv2(..., SB1, MV1, SV1) and mlp_layer(..., SB4, MV4, SV4).
  * NOTE(review): by analogy with the pw_conv/dw_conv parameter order
  * (bias_shift_mode, quantized_multiplier, out_shift_rl) these are presumably
  * the bias-shift mode, the requantization multiplier and the right-shift
  * amount -- confirm against conv2/mlp_layer.
  */
 /* MVn: presumably the per-layer quantized multiplier. */
 #define MV1 116
 #define MV2 73
 #define MV3 72
 #define MV4 79
 /* SVn: presumably the per-layer right-shift applied after the multiply. */
 #define SV1 16
 #define SV2 17
 #define SV3 11
 #define SV4 12
 /* SBn: presumably the per-layer bias-shift mode (all zero here). */
 #define SB1 0
 #define SB2 0
 #define SB3 0
 #define SB4 0
 #endif /* IBEX_CNN_PARAMS_H */
@@ -0,0 +1,70 @@
 #ifndef DWS_CONV_H
 #define DWS_CONV_H
 /*
  * Reference (plain C) pointwise convolution.
  *
  * For every output element out[y][x][ch] the accumulator starts at bias[ch];
  * if the corresponding input pixel (y*strides - pad[0], x*strides - pad[2])
  * lies inside the input, the dot product of that pixel's fil_dim[3] channels
  * with fil[ch][*] is added. The sum is then scaled by quantized_multiplier,
  * rounded (adding 1 << (out_shift_rl - 1)), shifted right by out_shift_rl and
  * clamped to [0, 255].
  *
  * in_dim / out_dim are indexed as {height, width, depth}. Only pad[0] and
  * pad[2] are read. bias_shift_mode is accepted but not used here.
  */
 void pw_conv(int in_dim[3], int fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]],
 	      const int fil[fil_dim[0]][fil_dim[3]], const int bias[],
 	      int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[],
 	      const int bias_shift_mode, const int quantized_multiplier, const int out_shift_rl){
 	for (int ch = 0; ch < out_dim[2]; ch++) {            // output depth
 		for (int oy = 0; oy < out_dim[0]; oy++) {        // output height
 			const int iy = oy * strides - pad[0];
 			for (int ox = 0; ox < out_dim[1]; ox++) {    // output width
 				const int ix = ox * strides - pad[2];
 				int acc = bias[ch];

 				// Positions that fall into the padding contribute nothing but the bias.
 				const int inside = (iy >= 0) && (iy < in_dim[0]) && (ix >= 0) && (ix < in_dim[1]);
 				if (inside) {
 					for (int c = 0; c < fil_dim[3]; c++) {   // filter depth
 						acc += inp[iy][ix][c] * fil[ch][c];
 					}
 				}

 				// Requantize: multiply, round, shift, then saturate to 0..255.
 				int scaled = quantized_multiplier * acc + (1 << (out_shift_rl - 1));
 				scaled = scaled >> out_shift_rl;
 				if (scaled < 0) {
 					scaled = 0;
 				}
 				if (scaled > 255) {
 					scaled = 255;
 				}
 				out[oy][ox][ch] = scaled;
 			}
 		}
 	}
 }
 /*
  * Reference (plain C) depthwise convolution.
  *
  * Channel ch of the output is produced from channel ch of the input only,
  * using the depthwise_fil_dim[1] x depthwise_fil_dim[2] kernel
  * depthwise_fil[ch][*][*][0]. The window for output (y, x) starts at input
  * position (y*strides - pad[0], x*strides - pad[2]); taps that fall outside
  * the input are skipped. The accumulator starts at bias[ch] and the result is
  * scaled by depthwise_multiplier, rounded, shifted right by
  * depthwise_out_shift_rl and clamped to [0, 255].
  *
  * Only pad[0] and pad[2] are read. bias_shift_mode is accepted but not used.
  */
 void dw_conv(int in_dim[3], int depthwise_fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]],
            const int depthwise_fil[depthwise_fil_dim[0]][depthwise_fil_dim[1]][depthwise_fil_dim[2]][1], const int bias[],
 			int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[],
            const int bias_shift_mode, const int depthwise_multiplier, const int depthwise_out_shift_rl){
 	for (int ch = 0; ch < out_dim[2]; ch++) {            // output depth
 		for (int oy = 0; oy < out_dim[0]; oy++) {        // output height
 			const int top = oy * strides - pad[0];
 			for (int ox = 0; ox < out_dim[1]; ox++) {    // output width
 				const int left = ox * strides - pad[2];
 				int acc = bias[ch];

 				for (int fy = 0; fy < depthwise_fil_dim[1]; fy++) {      // kernel height
 					const int iy = top + fy;
 					if (iy < 0 || iy >= in_dim[0]) {
 						continue;   // whole kernel row lies in the padding
 					}
 					for (int fx = 0; fx < depthwise_fil_dim[2]; fx++) {  // kernel width
 						const int ix = left + fx;
 						if (ix < 0 || ix >= in_dim[1]) {
 							continue;   // tap lies in the padding
 						}
 						acc += inp[iy][ix][ch] * depthwise_fil[ch][fy][fx][0];
 					}
 				}

 				// Requantize: multiply, round, shift, then saturate to 0..255.
 				int scaled = depthwise_multiplier * acc + (1 << (depthwise_out_shift_rl - 1));
 				scaled = scaled >> depthwise_out_shift_rl;
 				if (scaled < 0) {
 					scaled = 0;
 				}
 				if (scaled > 255) {
 					scaled = 255;
 				}
 				out[oy][ox][ch] = scaled;
 			}
 		}
 	}
 }
 #endif  /* DWS_CONV_H */
@@ -0,0 +1,171 @@
 #ifndef DWS_CONV_OPT_H
 #define DWS_CONV_OPT_H
 /*
  * Pointwise convolution using the custom neural instructions, 8-bit weight mode.
  *
  * For each output element out[j][k][i] the kernel issues neur_init with
  * bias[i] and bias_shift_mode[i], then for every input word
  * inp[row][col][m] (m < fil_dim[3]) issues four nn_mac_8b instructions with
  * the consecutive weight words fil[i][4*m .. 4*m+3], and finally neur_res
  * with quantized_multiplier and out_shift_rl; the value returned by neur_res
  * is stored.
  * NOTE(review): the exact semantics of neur_init / nn_mac_8b / neur_res
  * (accumulator handling, operand packing) are defined by the core extension
  * and are not visible here -- confirm against the ISA description.
  *
  * The input position is (j*strides - pad[0], k*strides - pad[2]). Positions
  * outside the input are skipped (no MAC is issued), matching the reference
  * pw_conv. Only pad[0] and pad[2] are read.
  */
 void pw_conv_8bits(int in_dim[3], int fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]], 
 	const int fil[fil_dim[0]][fil_dim[3] << 2], const int bias[fil_dim[0]], 
 	int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[], const int bias_shift_mode[],
 	const int quantized_multiplier, const int out_shift_rl){
 	int i, j, k, m, res, str1, str2, bias_val, w, in_cnn;
 	for (i = 0; i < out_dim[2]; i++) {   // output depth
 		str1 = -pad[0] - strides;
 		for (j = 0; j < out_dim[0]; j++) {  // output height
 			// Advance by the stride (was a hard-coded 1, wrong for strides != 1).
 			str1 += strides;
 			str2 = -pad[2] - strides;
 			for (k = 0; k < out_dim[1]; k++) {  // output width
 				bias_val = bias[i];
 				str2 += strides;
 				asm volatile("neur_init %0, %1, %2\n":"=r"(res):"r"(bias_val),"r"(bias_shift_mode[i]):);
 				// Never read outside the input: padded positions contribute no MACs.
 				if (str1 < in_dim[0] && str1 >= 0 && str2 >= 0 && str2 < in_dim[1]) {
 					for (m = 0; m < fil_dim[3]; m++) {   // filters depth
 						in_cnn = inp[str1][str2][m];
 						w = fil[i][4*m];
 						asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
 						w = fil[i][4*m+1];
 						asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
 						w = fil[i][4*m+2];
 						asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
 						w = fil[i][4*m+3];
 						asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
 					}
 				}
 				asm volatile("neur_res %0, %1, %2\n":"=r"(res):"r"(quantized_multiplier),"r"(out_shift_rl):);
 				out[j][k][i] = res;
 			}
 		}
 	}
 }
 /*
  * Pointwise convolution using the custom neural instructions, 4-bit weight mode.
  *
  * For each output element out[j][k][i] the kernel issues neur_init with
  * bias[i] and bias_shift_mode[i], then for every input word
  * inp[row][col][m] (m < fil_dim[3]) issues two nn_mac_4b instructions with
  * the weight words fil[i][2*m] and fil[i][2*m+1], and finally neur_res with
  * quantized_multiplier and out_shift_rl; the value returned by neur_res is
  * stored.
  * NOTE(review): the exact semantics of neur_init / nn_mac_4b / neur_res are
  * defined by the core extension and are not visible here -- confirm against
  * the ISA description.
  *
  * The input position is (j*strides - pad[0], k*strides - pad[2]). Positions
  * outside the input are skipped (no MAC is issued), matching the reference
  * pw_conv. Only pad[0] and pad[2] are read.
  */
 void pw_conv_4bits(int in_dim[3], int fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]], 
 	const int fil[fil_dim[0]][fil_dim[3] << 1], const int bias[fil_dim[0]], 
 	int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[], const int bias_shift_mode[],
 	const int quantized_multiplier, const int out_shift_rl){
 	int i, j, k, m, res, str1, str2, bias_val, w, in_cnn;
 	for (i = 0; i < out_dim[2]; i++) {   // output depth
 		str1 = -pad[0] - strides;
 		for (j = 0; j < out_dim[0]; j++) {  // output height
 			// Advance by the stride (was a hard-coded 1, wrong for strides != 1).
 			str1 += strides;
 			str2 = -pad[2] - strides;
 			for (k = 0; k < out_dim[1]; k++) {  // output width
 				bias_val = bias[i];
 				str2 += strides;
 				asm volatile("neur_init %0, %1, %2\n":"=r"(res):"r"(bias_val),"r"(bias_shift_mode[i]):);
 				// Never read outside the input: padded positions contribute no MACs.
 				if (str1 < in_dim[0] && str1 >= 0 && str2 >= 0 && str2 < in_dim[1]) {
 					for (m = 0; m < fil_dim[3]; m++) {   // filters depth
 						in_cnn = inp[str1][str2][m];
 						w = fil[i][2*m];
 						asm volatile("nn_mac_4b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
 						w = fil[i][2*m+1];
 						asm volatile("nn_mac_4b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
 					}
 				}
 				asm volatile("neur_res %0, %1, %2\n":"=r"(res):"r"(quantized_multiplier),"r"(out_shift_rl):);
 				out[j][k][i] = res;
 			}
 		}
 	}
 }
 /*
  * Pointwise convolution using the custom neural instructions, 2-bit weight mode.
  *
  * For each output element out[j][k][i] the kernel issues neur_init with
  * bias[i] and bias_shift_mode[i], then one nn_mac_2b per input word
  * inp[row][col][m] (m < fil_dim[3]) with the weight word fil[i][m], and
  * finally neur_res with quantized_multiplier and out_shift_rl; the value
  * returned by neur_res is stored.
  * NOTE(review): the exact semantics of neur_init / nn_mac_2b / neur_res are
  * defined by the core extension and are not visible here -- confirm against
  * the ISA description.
  *
  * The input position is (j*strides - pad[0], k*strides - pad[2]). Positions
  * outside the input are skipped (no MAC is issued), matching the reference
  * pw_conv. Only pad[0] and pad[2] are read.
  */
 void pw_conv_2bits(int in_dim[3], int fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]], 
 	const int fil[fil_dim[0]][fil_dim[3]], const int bias[fil_dim[0]], 
 	int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[], const int bias_shift_mode[],
 	const int quantized_multiplier, const int out_shift_rl){
 	int i, j, k, m, res, str1, str2, bias_val, w, in_cnn;
 	for (i = 0; i < out_dim[2]; i++) {   // output depth
 		str1 = -pad[0] - strides;
 		for (j = 0; j < out_dim[0]; j++) {  // output height
 			// Advance by the stride (was a hard-coded 1, wrong for strides != 1).
 			str1 += strides;
 			str2 = -pad[2] - strides;
 			for (k = 0; k < out_dim[1]; k++) {  // output width
 				bias_val = bias[i];
 				str2 += strides;
 				asm volatile("neur_init %0, %1, %2\n":"=r"(res):"r"(bias_val),"r"(bias_shift_mode[i]):);
 				// Never read outside the input: padded positions contribute no MACs.
 				if (str1 < in_dim[0] && str1 >= 0 && str2 >= 0 && str2 < in_dim[1]) {
 					for (m = 0; m < fil_dim[3]; m++) {   // filters depth
 						in_cnn = inp[str1][str2][m];
 						w = fil[i][m];
 						asm volatile("nn_mac_2b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
 					}
 				}
 				asm volatile("neur_res %0, %1, %2\n":"=r"(res):"r"(quantized_multiplier),"r"(out_shift_rl):);
 				out[j][k][i] = res;
 			}
 		}
 	}
 }
 /*
  * Depthwise convolution using the custom neural instructions.
  *
  * For each output element out[y][x][ch]: neur_init is issued with bias[ch]
  * and bias_shift_mode[ch]; one nn_mac_8b is issued for every kernel tap
  * depthwise_fil[ch][fy][fx] whose input position lies inside the input
  * (taps falling into the padding are skipped); neur_res is issued with
  * quantized_multiplier and out_shift_rl and its result is stored.
  * NOTE(review): the semantics of the three custom instructions are defined by
  * the core extension and are not visible here -- confirm against the ISA
  * description.
  *
  * The window origin is (y*strides - pad[0], x*strides - pad[2]); only pad[0]
  * and pad[2] are read.
  */
 void dw_conv_opt(int in_dim[3], int depthwise_fil_dim[4], int out_dim[3],
 	int inp[in_dim[0]][in_dim[1]][in_dim[2]], const int depthwise_fil[depthwise_fil_dim[0]][depthwise_fil_dim[1]][depthwise_fil_dim[2]],
 	const int bias[depthwise_fil_dim[0]], int out[out_dim[0]][out_dim[1]][out_dim[2]],
 	int strides, int pad[], const int bias_shift_mode[], const int quantized_multiplier, const int out_shift_rl){
 	int acc, sample, weight, bias_val;
 	for (int ch = 0; ch < out_dim[2]; ch++) {            // output depth
 		for (int oy = 0; oy < out_dim[0]; oy++) {        // output height
 			const int top = oy * strides - pad[0];
 			for (int ox = 0; ox < out_dim[1]; ox++) {    // output width
 				const int left = ox * strides - pad[2];
 				bias_val = bias[ch];
 				asm volatile("neur_init %0, %1, %2\n":"=r"(acc):"r"(bias_val),"r"(bias_shift_mode[ch]):);
 				for (int fy = 0; fy < depthwise_fil_dim[1]; fy++) {      // kernel height
 					const int iy = top + fy;
 					if (iy < 0 || iy >= in_dim[0]) {
 						continue;   // whole kernel row lies in the padding
 					}
 					for (int fx = 0; fx < depthwise_fil_dim[2]; fx++) {  // kernel width
 						const int ix = left + fx;
 						if (ix < 0 || ix >= in_dim[1]) {
 							continue;   // tap lies in the padding
 						}
 						sample = inp[iy][ix][ch];
 						weight = depthwise_fil[ch][fy][fx];
 						asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(acc):"r"(weight),"r"(sample):);
 					}
 				}
 				asm volatile("neur_res %0, %1, %2\n":"=r"(acc):"r"(quantized_multiplier),"r"(out_shift_rl):);
 				out[oy][ox][ch] = acc;
 			}
 		}
 	}
 }
 /*
  * Single-channel variant of dw_conv_opt: only channel 0 is processed
  * (bias[0], bias_shift_mode[0], depthwise_fil[0][*][*], inp[*][*][0],
  * out[*][*][0]); there is no loop over the output depth.
  *
  * For each output element: neur_init, then one nn_mac_8b per kernel tap whose
  * input position lies inside the input, then neur_res, whose result is stored.
  * NOTE(review): the semantics of the three custom instructions are defined by
  * the core extension and are not visible here -- confirm against the ISA
  * description.
  *
  * The window origin is (y*strides - pad[0], x*strides - pad[2]); only pad[0]
  * and pad[2] are read.
  */
 void dw_conv_opt_1ch(int in_dim[3], int depthwise_fil_dim[4], int out_dim[3],
 	int inp[in_dim[0]][in_dim[1]][in_dim[2]], const int depthwise_fil[depthwise_fil_dim[0]][depthwise_fil_dim[1]][depthwise_fil_dim[2]],
 	const int bias[depthwise_fil_dim[0]], int out[out_dim[0]][out_dim[1]][out_dim[2]],
 	int strides, int pad[], const int bias_shift_mode[], const int quantized_multiplier, const int out_shift_rl){
 	int acc, sample, weight, bias_val;
 	for (int oy = 0; oy < out_dim[0]; oy++) {        // output height
 		const int top = oy * strides - pad[0];
 		for (int ox = 0; ox < out_dim[1]; ox++) {    // output width
 			const int left = ox * strides - pad[2];
 			bias_val = bias[0];
 			asm volatile("neur_init %0, %1, %2\n":"=r"(acc):"r"(bias_val),"r"(bias_shift_mode[0]):);
 			for (int fy = 0; fy < depthwise_fil_dim[1]; fy++) {      // kernel height
 				const int iy = top + fy;
 				if (iy < 0 || iy >= in_dim[0]) {
 					continue;   // whole kernel row lies in the padding
 				}
 				for (int fx = 0; fx < depthwise_fil_dim[2]; fx++) {  // kernel width
 					const int ix = left + fx;
 					if (ix < 0 || ix >= in_dim[1]) {
 						continue;   // tap lies in the padding
 					}
 					sample = inp[iy][ix][0];
 					weight = depthwise_fil[0][fy][fx];
 					asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(acc):"r"(weight),"r"(sample):);
 				}
 			}
 			asm volatile("neur_res %0, %1, %2\n":"=r"(acc):"r"(quantized_multiplier),"r"(out_shift_rl):);
 			out[oy][ox][0] = acc;
 		}
 	}
 }
 #endif  /* DWS_CONV_OPT_H */
@@ -0,0 +1,89 @@
 # Script prologue: set up the environment, read the command-line options,
 # load CIFAR-10 and define the training hyper-parameters.
 import init_utils
 import common
 # Initialize the environment and get the model name derived from this file.
 # NOTE(review): this runs before the heavy imports below, presumably on
 # purpose -- keep the ordering unless init_utils is checked.
 name = init_utils.initialize_environment(__file__)
 args = init_utils.get_args()
 # Set arguments from command line
 max_acc_drop = args.max_acc_drop
 device = args.device
 from sklearn.model_selection import train_test_split
 import torch.nn as nn
 import torch.nn.functional as F
 import tensorflow as tf
 import numpy as np
 # Load the CIFAR-10 dataset through Keras.
 (X_train, y_train), (X_test, y_test) = tf.keras.datasets.cifar10.load_data()
 # Drop the singleton axis 1 of the label arrays.
 y_train = np.squeeze(y_train, axis = 1)
 y_test = np.squeeze(y_test, axis = 1)
 # Hold out 15% of the training set for validation (split is not seeded).
 X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.15)
 # Move the last axis to position 1 (channels-first, as nn.Conv2d expects).
 # Pixel values are left unscaled here.
 X_train = (np.transpose(X_train, (0,3,1,2)))
 X_test = (np.transpose(X_test, (0,3,1,2)))
 X_val = (np.transpose(X_val, (0,3,1,2)))
 # Training hyper-parameters forwarded to common.create_ibex_qnn.
 BATCH_SIZE = 128
 epochs = 1
 lr = 0.0001
 class DepthwiseBlock(nn.Module):
    """Depthwise-separable convolution block.
    A 3x3 depthwise convolution (groups == in_channels, padding 1) followed by
    ReLU, then a 1x1 pointwise convolution to out_channels followed by ReLU.
    The spatial size of the input is preserved.
    """
    def __init__(self, in_channels, out_channels):
        super().__init__()
        # Created in the same order as they appear in the Sequential so that
        # parameter initialisation order is unchanged.
        depthwise = nn.Conv2d(in_channels, in_channels, kernel_size = 3,
                              padding = 1, groups = in_channels)
        pointwise = nn.Conv2d(in_channels, out_channels, kernel_size = 1, padding = 0)
        self.block = nn.Sequential(
            depthwise,
            nn.ReLU(inplace = True),
            pointwise,
            nn.ReLU(inplace = True),
        )
    def forward(self, x):
        """Apply the depthwise and pointwise stages to x."""
        return self.block(x)
 class Cifar10_Dws_CNN(nn.Module):
    """Depthwise-separable CNN classifier with 10 outputs.
    Three stages of two DepthwiseBlocks each (3->64->64, 64->128->128,
    128->256->256), every stage followed by a 2x2 max-pool with stride 2,
    then flatten and a single linear layer. The linear layer expects
    256 * 4 * 4 features, i.e. a 32x32 input halved three times.
    """
    def __init__(self):
        super().__init__()
        # (in_channels, out_channels) of the two blocks in each stage.
        stage_channels = (
            ((3, 64), (64, 64)),
            ((64, 128), (128, 128)),
            ((128, 256), (256, 256)),
        )
        layers = []
        for stage in stage_channels:
            for c_in, c_out in stage:
                layers.append(DepthwiseBlock(in_channels = c_in, out_channels = c_out))
            layers.append(nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.features = nn.Sequential(*layers)
        self.flatten = nn.Flatten()
        self.classifier = nn.Sequential(nn.Linear(256 * 4 * 4, 10))
    def forward(self, x):
        """Return per-class log-probabilities for the batch x."""
        flat = self.flatten(self.features(x))
        logits = self.classifier(flat)
        return F.log_softmax(logits, dim = 1)
 # Instantiate the float model and hand it, together with the data splits and
 # hyper-parameters, to the project helper common.create_ibex_qnn. `name` is
 # forwarded as well; the helper appears to branch on it for the
 # model-specific simulation step.
 net = Cifar10_Dws_CNN()
 common.create_ibex_qnn(net, name, device, X_train, y_train, X_test, y_test, 
                X_val = X_val, y_val = y_val, BATCH_SIZE = BATCH_SIZE, 
                epochs = epochs, lr = lr, max_acc_drop = max_acc_drop)
@@ -0,0 +1,77 @@
 # Script prologue: set up the environment, read the command-line options,
 # load and normalise CIFAR-10 and define the training hyper-parameters.
 import init_utils
 import common
 # Initialize the environment and get the model name derived from this file.
 # NOTE(review): this runs before the heavy imports below, presumably on
 # purpose -- keep the ordering unless init_utils is checked.
 name = init_utils.initialize_environment(__file__)
 args = init_utils.get_args()
 # Set arguments from command line
 max_acc_drop = args.max_acc_drop
 device = args.device
 from sklearn.model_selection import train_test_split
 import torch.nn as nn
 import torch.nn.functional as F
 import tensorflow as tf
 import numpy as np
 # Load the CIFAR-10 dataset through Keras.
 (X_train, y_train), (X_test, y_test) = tf.keras.datasets.cifar10.load_data()
 # Drop the singleton axis 1 of the label arrays.
 y_train = np.squeeze(y_train, axis = 1)
 y_test = np.squeeze(y_test, axis = 1)
 # Hold out 15% of the training set for validation (split is not seeded).
 X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.15)
 # Move the last axis to position 1 (channels-first, as nn.Conv2d expects),
 # then shift by 128 and scale by 1/255.
 X_train = (np.transpose(X_train, (0,3,1,2)) - 128.0)/255.0
 X_test = (np.transpose(X_test, (0,3,1,2)) - 128.0)/255.0
 X_val = (np.transpose(X_val, (0,3,1,2)) - 128.0)/255.0
 # Training hyper-parameters forwarded to common.create_ibex_qnn.
 BATCH_SIZE = 32
 epochs = 1
 lr = 0.0001
 class CMSIS_CNN(nn.Module):
    """Three-stage 5x5 convolutional classifier with 10 outputs.
    Each stage is Conv2d(kernel 5, padding 2) -> ReLU -> 2x2 max-pool ->
    Dropout. Channel widths are 3->32->32->64 and the dropout rates are
    0.25, 0.25 and 0.4. The flattened features (1024 values) feed a single
    linear layer.
    """
    def __init__(self):
        super().__init__()
        # Sub-modules are registered in the same order as before so that
        # named_children() and parameter initialisation order are unchanged.
        # Stage 1
        self.conv1 = nn.Conv2d(3, 32, kernel_size = 5, padding = 2)
        self.relu1 = nn.ReLU()
        self.max1 = nn.MaxPool2d(kernel_size = 2, stride = 2)
        self.d1 = nn.Dropout(p = 0.25)
        # Stage 2
        self.conv2 = nn.Conv2d(32, 32, kernel_size = 5, padding = 2)
        self.relu2 = nn.ReLU()
        self.max2 = nn.MaxPool2d(kernel_size = 2, stride = 2)
        self.d2 = nn.Dropout(p = 0.25)
        # Stage 3
        self.conv3 = nn.Conv2d(32, 64, kernel_size = 5, padding = 2)
        self.relu3 = nn.ReLU()
        self.max3 = nn.MaxPool2d(kernel_size = 2, stride = 2)
        self.d3 = nn.Dropout(p = 0.4)
        # Classifier head
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(1024, 10)
    def forward(self,X):
        """Return per-class log-probabilities for the batch X."""
        stages = (
            (self.conv1, self.relu1, self.max1, self.d1),
            (self.conv2, self.relu2, self.max2, self.d2),
            (self.conv3, self.relu3, self.max3, self.d3),
        )
        for conv, act, pool, drop in stages:
            X = drop(pool(act(conv(X))))
        logits = self.linear1(self.flatten(X))
        return F.log_softmax(logits, dim = 1)
 # Instantiate the float model and hand it, together with the data splits and
 # hyper-parameters, to the project helper common.create_ibex_qnn. `name` is
 # forwarded as well; the helper appears to branch on it for the
 # model-specific simulation step.
 net = CMSIS_CNN()
 common.create_ibex_qnn(net, name, device, X_train, y_train, X_test, y_test, 
                X_val = X_val, y_val = y_val, BATCH_SIZE = BATCH_SIZE, 
                epochs = epochs, lr = lr, max_acc_drop = max_acc_drop)
@@ -125,3 +125,13 @@ def create_ibex_qnn(net, name, device, X_train, y_train, X_test, y_test, X_val =
        print('\nSIMULATING MODEL ON IBEX CORE\nUSE THE OUTPUTS TO VERIFY THAT THE RESULTS ARE CORRECT !!')
        ibex_model = simulate_ibex.create_lenet_model(int_weights, int_og_bias, mul_vals, shift_vals)
        simulate_ibex.eval_sim_model(quant_net, ibex_model, test_loader)
    elif(name == 'cmsis_cnn'):
        print('\nSIMULATING MODEL ON IBEX CORE\nUSE THE OUTPUTS TO VERIFY THAT THE RESULTS ARE CORRECT !!')
        ibex_model = simulate_ibex.create_cmsis_cnn_model(int_weights, int_og_bias, mul_vals, shift_vals)
        simulate_ibex.eval_sim_model(quant_net, ibex_model, test_loader)
    elif(name == 'cifar10_dws_cnn'):
        print('\nSIMULATING MODEL ON IBEX CORE\nUSE THE OUTPUTS TO VERIFY THAT THE RESULTS ARE CORRECT !!')
        ibex_model = simulate_ibex.create_ibex_dws_model(int_weights, int_og_bias, mul_vals, shift_vals)
        simulate_ibex.eval_sim_model(quant_net, ibex_model, test_loader)
@@ -27,26 +27,34 @@ def quantize_multiplier(real_multiplier):
    return quantized_multiplier, right_shift
 def get_int_params(quant_net):
    int_weights = []
    int_bias = []
    in_scales = []
    act_scales = []
-    for _, module in quant_net.sequential.named_children():
+    def extract_quant_params(module):
-        if hasattr(module, 'weight') and module.weight is not None:
+        for name, submodule in module.named_children():
-            int_weights.append(module.int_weight().cpu().numpy())
+            # Check if the submodule has weights and append them if present
-            int_bias.append(module.int_bias().cpu().numpy())
+            if hasattr(submodule, 'weight') and submodule.weight is not None:
-            in_scales.append(module.quant_bias_scale().cpu().detach().numpy())
+                int_weights.append(submodule.int_weight().cpu().detach().numpy())
                int_bias.append(submodule.int_bias().cpu().detach().numpy())
                in_scales.append(submodule.quant_bias_scale().cpu().detach().numpy())
-        if hasattr(module, 'quant_act_scale') and module.quant_act_scale is not None:
+            # Check if the submodule has activation scale and append it if present
-            act_scales.append(module.quant_act_scale().cpu().detach().numpy())
+            if hasattr(submodule, 'quant_act_scale') and submodule.quant_act_scale is not None:
                act_scales.append(submodule.quant_act_scale().cpu().detach().numpy())
-    act_scales.append(quant_net.o_quant.quant_act_scale().cpu().detach().numpy())
+            # Recursively extract parameters from the children modules
            extract_quant_params(submodule)
    # Start extraction from the top-level module
    extract_quant_params(quant_net)
    mul_vals, shift_vals = [], []
-    for i in range(len(act_scales)):
+    for i in range(len(act_scales)-1):
-        M = in_scales[i]/act_scales[i]
+        M = in_scales[i]/act_scales[i+1]
        mul, shift = quantize_multiplier(M[0])
        mul_vals.append(mul)
        shift_vals.append(shift)
@@ -87,7 +95,12 @@ def decide_mode(network, weight_bit_width, input_uint8 = True):
    for name, module in network.named_modules():
        if isinstance(module, layer_types_py):
            layer_type_name = module.__class__.__name__
-            if(layer_type_name == 'Conv2d' or layer_type_name == 'Linear' or layer_type_name == 'DepthwiseConv2d'):
+            if(layer_type_name == 'Linear'):
                layer_type.append(layer_type_name)
            if(layer_type_name == 'Conv2d'):
                if(module.groups == module.in_channels):
                    layer_type.append('DepthwiseConv2d')
                else:
                    layer_type.append(layer_type_name)
            else:
                if(layer_type_name == 'ReLU' or layer_type_name == 'Sigmoid'):
@@ -96,13 +109,13 @@ def decide_mode(network, weight_bit_width, input_uint8 = True):
    for i in range(len(weight_bit_width)):
        signed_input = 4 * input_sign[i]
        if(layer_type[i] == 'DepthwiseConv2d'):
                mode_per_layer.append(signed_input + 1)
        else:
            if(weight_bit_width[i] == 2):
                mode_per_layer.append(signed_input + 3)
            elif(weight_bit_width[i] == 4):
                mode_per_layer.append(signed_input + 2)
        else:
            if(layer_type[i] == 'DepthwiseConv2d'):
                mode_per_layer.append(signed_input + 1)
            else:
                mode_per_layer.append(signed_input)
@@ -161,6 +174,7 @@ def pad_inputs_weights(quant_net, test_loader, mode_per_layer,
            else:
                new_size_0 = a * 4
            if((mode_per_layer[i] != 1) and (mode_per_layer[i] != 5)):
                b = w.shape[1] // 4
                if(w.shape[1] % 4 != 0):
                    new_size_1 = (b + 1) * 4
@@ -170,6 +184,12 @@ def pad_inputs_weights(quant_net, test_loader, mode_per_layer,
                new_w = np.zeros((new_size_0, new_size_1, w.shape[2], w.shape[3])).astype(np.int8)
                new_w[:w.shape[0], :w.shape[1], :, :] = w
            else:
                new_size_1 = 1
                new_w = np.zeros((new_size_0, new_size_1, w.shape[2], w.shape[3])).astype(np.int8)
                new_w[:w.shape[0], :w.shape[1], :, :] = w
                new_w = np.squeeze(new_w, axis = 1)
        padded_int_weights.append(new_w)
    padded_int_biases = []
@@ -325,6 +345,15 @@ def concat_inputs_weights(mode_per_layer, padded_input, padded_int_weights, padd
                        comb = combine_values(vector)
                        new_mat[i][j] = comb
        elif(len(dims) == 3):
            new_mat = np.zeros((int(dims[0]//4), dims[1], dims[2]), dtype = np.int64)
            for i in range(int(dims[0]//4)):
                    for j in range(dims[1]):
                        for k in range(dims[2]):
                            vector = layer_weight[4*i : 4*(i+1), j, k]
                            comb = combine_values(vector)
                            new_mat[i][j][k] = comb
        elif(len(dims) == 4):
            if((mode_per_layer[iter] == 0) | (mode_per_layer[iter] == 4)):
                new_mat = np.zeros((int(dims[0]//4), dims[1], dims[2], dims[3]), dtype = np.int64)
@@ -602,9 +631,17 @@ def save_cnn_net_params(path, int_weights, int_biases, mul_vals, shift_vals, shi
            dims = np.shape(int_weights[k])
            mat = int_weights[k]   
            if(len(dims) == 2 or ((len(dims) == 4) and dims[2] == dims[3] == 1)):
                f.write('static const int ')
                if(len(dims) == 2):
                    wi += 1
-                st = 'static const int W' + str(wi) + '[' + str(dims[0]) + ']' + '[' + str(dims[1]) + '] = {\n'
+                    f.write('W' + str(wi))                
                else:
                    mat = np.squeeze(mat, axis = (2,3))
                    fi += 1
                    f.write('F' + str(fi))
                st = '[' + str(dims[0]) + ']' + '[' + str(dims[1]) + '] = {\n'
                f.write(st)
                for n in range(dims[0]):
                    f.write('\t{')
@@ -619,6 +656,32 @@ def save_cnn_net_params(path, int_weights, int_biases, mul_vals, shift_vals, shi
                    f.write('\n')
                f.write('};\n\n')
            elif (len(dims) == 3):
                dims = np.shape(mat)
                fi += 1
                st = 'static const int F' + str(fi) + '[' + str(dims[0]) + '][' + str(dims[1])
                st += '][' + str(dims[2]) + '] = {\n'
                f.write(st)
                for n in range(dims[0]):
                    f.write('\t{\n')
                    for l in range(dims[1]):
                        f.write('\t\t{')
                        for h in range(dims[2] - 1):
                            f.write(str(mat[n][l][h]) + ', ')
                        if dims[2] != 1:
                            f.write(str(mat[n][l][dims[2] - 1]) + '}')
                        else:
                            f.write(str(mat[n][l][0]) + '}')
                        if (l != dims[1] - 1):
                            f.write(',')
                        f.write('\n')
                    f.write('\t}')
                    if n != dims[0] - 1:
                        f.write(',')
                    f.write('\n')
                f.write('};\n\n')
            elif(len(dims) == 4):
                mat = np.transpose(mat, (0, 2, 3, 1))
                dims = np.shape(mat)
@@ -856,9 +919,11 @@ def generate_opt_c_code_mlp(path, name, int_weights, optimal_config, type_of_lay
        f.write('\t' + name + '();\n\n')
        f.write('\treturn 0;\n}')
-def get_cnn_details(model):
+def get_cnn_details(module, details = None):
    if details is None:
        details = []
-    for layer in model.children():
+
    for layer in module.children():
        if isinstance(layer, nn.Conv2d):
            details.append({
                "layer_type": "Conv2d",
@@ -866,10 +931,11 @@ def get_cnn_details(model):
                "out_channels": layer.out_channels,
                "kernel_size": layer.kernel_size,
                "stride": layer.stride,
-                "padding": layer.padding
+                "padding": layer.padding,
                "groups": layer.groups
            })
-        elif (isinstance(layer, nn.MaxPool2d)):
+        elif isinstance(layer, nn.MaxPool2d):
            details.append({
                "layer_type": "MaxPool2d",
                "kernel_size": layer.kernel_size,
@@ -877,7 +943,7 @@ def get_cnn_details(model):
                "padding": layer.padding
            })
-        elif (isinstance(layer, nn.AvgPool2d)):
+        elif isinstance(layer, nn.AvgPool2d):
            details.append({
                "layer_type": "AvgPool2d",
                "kernel_size": layer.kernel_size,
@@ -891,6 +957,10 @@ def get_cnn_details(model):
                "in_features": layer.in_features,
                "out_features": layer.out_features
            })
        # Recursively apply to children modules
        get_cnn_details(layer, details)
    return details
 def generate_og_c_code_cnn(path, name, input, cnn_details, int_weights):
@@ -900,10 +970,17 @@ def generate_og_c_code_cnn(path, name, input, cnn_details, int_weights):
        f.write('#include "fully_connected.h"\n')
        f.write('#include "ibex_cnn_params.h"\n')
        f.write('#include "ibex_inputs.h"\n')
-        f.write('#include "conv2d.h"\n\n')
+        f.write('#include "conv2d.h"\n')
-        f.write('#define IMG_SZ ' + str(input.shape[2]) + '\n')
+        for detail in cnn_details[:-1]:
-        f.write('#define NUM_FIL0 ' + str(int_weights[0].shape[1]) + '\n\n')
+            if detail["layer_type"] == "Conv2d":
                if(detail["in_channels"] == detail["out_channels"] == detail["groups"] != 1):
                    f.write('#include "dws_conv.h"\n')
                    break
        f.write('\n')
        f.write('#define IMG_SZ ' + str(np.shape(input)[2]) + '\n')
        f.write('#define NUM_FIL0 ' + str(np.shape(input)[1]) + '\n\n')
        i = 1
        for w in int_weights:
            if(len(np.shape(w)) == 4):
@@ -1050,11 +1127,17 @@ def generate_og_c_code_cnn(path, name, input, cnn_details, int_weights):
        for detail in cnn_details[:-1]:
            if detail["layer_type"] == "Conv2d":
                if(detail["in_channels"] == detail["out_channels"] == detail["groups"] != 1):
                    conv_type = 'dw_conv'
                elif(detail["kernel_size"][0] == 1):
                    conv_type = 'pw_conv'
                else:
                    conv_type = "conv2"
                if(i == 1):
-                    f.write('\t\tconv2(inp_dim, f_dim1, outp_dim1, in, F1, B1, ')
+                    f.write('\t\t' + conv_type + '(inp_dim, f_dim1, outp_dim1, in, F1, B1, ')
                    f.write('out1, STRIDE1, pad_1, SB1, MV1, SV1);')
                else:
-                    f.write('\t\tconv2(outp_dim' + str(i-1) + ', f_dim' + str(i) + ', outp_dim' + str(i))
+                    f.write('\t\t' + conv_type + '(outp_dim' + str(i-1) + ', f_dim' + str(i) + ', outp_dim' + str(i))
                    f.write(', out' + str(i-1) + ', F' + str(fi) + ', B' + str(fi) + ', out' + str(i))
                    f.write(', STRIDE' + str(fi) + ', pad_' + str(i) + ', SB' + str(fi))
                    f.write(', MV' + str(fi) + ', SV' + str(fi) + ');')
@@ -1091,6 +1174,13 @@ def generate_og_c_code_cnn(path, name, input, cnn_details, int_weights):
            f.write('\n')
            i += 1
        if flatten == 0:
            f.write('\t\tflatten(outp_dim' + str(i-1) + ', out' + str(i-1) + ', out' + str(i) + ');\n\n')
            i += 1
            f.write('\t\tmlp_layer(out' + str(i-1) + ', out, flatten_dim, OUT_DIM, ')
            f.write('W1, B' + str(fi + dn - 1) +  ', SB' + str(fi + dn - 1) + ', MV' + str(fi + dn - 1))
            f.write(', SV' + str(fi + dn - 1) + ');')
        else:
            f.write('\t\tmlp_layer(out' + str(i-1) + ', out, DENSE_DIM' + str(dn-1))
            f.write(', OUT_DIM, W' + str(dn) + ', B' + str(fi + dn - 1))
            f.write(', SB' + str(fi + dn - 1) + ', MV' + str(fi + dn - 1))
@@ -1119,13 +1209,21 @@ def generate_opt_c_code_cnn(path, name, input, cnn_details, int_weights, optimal
        f.write('#include "fully_connected_opt.h"\n')
        f.write('#include "ibex_cnn_params.h"\n')
        f.write('#include "ibex_inputs.h"\n')
-        f.write('#include "conv2d_opt.h"\n\n')
+        f.write('#include "conv2d_opt.h"\n')
        for detail in cnn_details[:-1]:
            if detail["layer_type"] == "Conv2d":
                if(detail["in_channels"] == detail["out_channels"] == detail["groups"] != 1):
                    f.write('#include "dws_conv_opt.h"\n')
                    break
        f.write('\n')
        f.write('#define IMG_SZ ' + str(np.shape(input)[2]) + '\n')
-        f.write('#define NUM_FIL0 ' + str(np.shape(input)[0]) + '\n\n')
+        f.write('#define NUM_FIL0 ' + str(np.shape(input)[1]) + '\n\n')
        i = 1
        for w in int_weights:
-            if(len(np.shape(w)) == 4):
+            if(len(np.shape(w)) == 4 or len(np.shape(w)) == 3):
                f.write('#define FILTER' + str(i) + ' ' + str(w.shape[2]) + '\n')
                i += 1
@@ -1133,7 +1231,7 @@ def generate_opt_c_code_cnn(path, name, input, cnn_details, int_weights, optimal
        i = 1
        for w in int_weights:
-            if(len(np.shape(w)) == 4):
+            if(len(np.shape(w)) == 4 or len(np.shape(w)) == 3):
                f.write('#define NUM_FIL' + str(i) + ' ' + str(w.shape[0]) + '\n')
                i += 1
@@ -1270,14 +1368,21 @@ def generate_opt_c_code_cnn(path, name, input, cnn_details, int_weights, optimal
        for detail in cnn_details[:-1]:
            if detail["layer_type"] == "Conv2d":
                if(detail["in_channels"] == detail["out_channels"] == detail["groups"] != 1):
                    conv_type = 'dw_conv_opt'
                elif(detail["kernel_size"][0] == 1):
                    conv_type = 'pw_conv_' + str(optimal_config[j]) + 'bits'
                else:
                    conv_type = 'conv2_' + str(optimal_config[j]) + 'bits'
                if(i == 1):
-                    f.write('\t\tconv2_' + str(optimal_config[j]) + 'bits')
+                    f.write('\t\t' + conv_type)
-                    if(np.shape(input)[0] == 1):
+                    if(np.shape(input)[1] == 1):
                        f.write('_1ch')
                    f.write('(inp_dim, f_dim1, outp_dim1, in, F1, B1, ')
                    f.write('out1, STRIDE1, pad_1, SB1, MV1, SV1);')
                else:
-                    f.write('\t\tconv2_' + str(optimal_config[j]) + 'bits(outp_dim' + str(i-1) + ', f_dim' + str(i))
+                    f.write('\t\t' + conv_type + '(outp_dim' + str(i-1) + ', f_dim' + str(i))
                    f.write(', outp_dim' + str(i) + ', out' + str(i-1) + ', F' + str(fi) + ', B' + str(fi) + ', out')
                    f.write(str(i) + ', STRIDE' + str(fi) + ', pad_' + str(i) + ', SB' + str(fi))
                    f.write(', MV' + str(fi) + ', SV' + str(fi) + ');')
@@ -1314,11 +1419,19 @@ def generate_opt_c_code_cnn(path, name, input, cnn_details, int_weights, optimal
            f.write('\n')
            i += 1
        if flatten == 0:
            f.write('\t\tflatten(outp_dim' + str(i-1) + ', out' + str(i-1) + ', out' + str(i) + ');\n\n')
            i += 1
            f.write('\t\tmlp_layer_' + str(optimal_config[j]) + 'bits(out' + str(i-1) + ', out, ')
            f.write('flatten_dim, OUT_DIM, W1, B' + str(fi + dn - 1) +  ', SB' + str(fi + dn - 1) + ', MV')
            f.write(str(fi + dn - 1) + ', SV' + str(fi + dn - 1) + ');\n')
        else:
            f.write('\t\tmlp_layer_' + str(optimal_config[-1]) + 'bits(out' + str(i-1) + ', out, DENSE_DIM' + str(dn-1))
            f.write(', OUT_DIM, W' + str(dn) + ', B' + str(fi + dn - 1))
            f.write(', SB' + str(fi + dn - 1) + ', MV' + str(fi + dn - 1))
            f.write(', SV' + str(fi + dn - 1) + ');\n')
        f.write('\n\t\tpcount_enable(0);\n\n')
        f.write('\t\tputs("Output Layer Values:\\n");\n')
        f.write('\t\tfor(int i = 0; i < OUT_DIM; i++) {\n')
        f.write('\t\t\tputhex((out[i] & 0xFF000000) >> 24);\n')
@@ -11,6 +11,9 @@ from torch import nn, optim
 import brevitas.nn as qnn
 from brevitas.quant import *
 from brevitas.core.restrict_val import RestrictValueType
 from collections import defaultdict
 from torchinfo import summary
 def net_input_size(X_train):
@@ -202,7 +205,21 @@ def generate_sequences(length, values = [2, 4, 8]):
 def create_weight_confs(macc_per_layer):
    total_macc_opt = []
-    weights_per_layer = generate_sequences(len(macc_per_layer))
+    
    cc = 0 
    idx = []
    if(len(macc_per_layer) >= 6):
        for i, mpl in enumerate(macc_per_layer):
            if(mpl/max(macc_per_layer) < 0.05):
                cc += 1
                idx.append(i)
    weights_per_layer = generate_sequences(len(macc_per_layer) - cc)
    for w in weights_per_layer:
        for i in idx:
            w.insert(i, 8)
    for w_conf in weights_per_layer:
        macc = 0
@@ -234,24 +251,47 @@ def create_weight_confs(macc_per_layer):
 # Define a mapping from PyTorch layers to Brevitas layers
 def create_layer_mapping(bit_width):
    mapping = {
-        nn.Conv2d: lambda layer, bw: qnn.QuantConv2d(in_channels = layer.in_channels, 
+        nn.Conv2d: lambda layer, bw: (qnn.QuantConv2d(in_channels=layer.in_channels, 
-                                                    out_channels = layer.out_channels, 
+                                                        out_channels=layer.out_channels, 
-                                                    kernel_size = layer.kernel_size, 
+                                                        kernel_size=layer.kernel_size, 
-                                                    stride = layer.stride[0], 
+                                                        stride=layer.stride[0], 
-                                                    padding = layer.padding,
+                                                        padding=layer.padding,
-                                                    bias = True,
+                                                        groups=layer.groups,
-                                                    cache_inference_bias = True,
+                                                        bias=True,
-                                                    bias_quant = Int32Bias,
+                                                        cache_inference_bias=True,
-                                                    weight_bit_width = bw,
+                                                        bias_quant=Int32Bias,
-                                                    weight_quant = Int8WeightPerTensorFloat),
+                                                        weight_bit_width=bw,
                                                        weight_quant=Int8WeightPerTensorFloat,
                                                        weight_scaling_min_val=2e-16,
                                                        restrict_scaling_type=RestrictValueType.LOG_FP,
                                                        return_quant_tensor=True
                                                        ) if layer.groups != layer.in_channels else (
                                                            # Special case for depthwise convolutions
                                        qnn.QuantConv2d(in_channels=layer.in_channels, 
                                                                out_channels=layer.out_channels, 
                                                                kernel_size=layer.kernel_size, 
                                                                stride=layer.stride[0], 
                                                                padding=layer.padding,
                                                                groups=layer.groups,
                                                                bias=True,
                                                                cache_inference_bias=True,
                                                                bias_quant=Int32Bias,
                                                                weight_bit_width=8,  # Fixed bit width for depthwise convolutions
                                                                weight_quant=Int8WeightPerTensorFloat,
                                                                weight_scaling_min_val=2e-16,
                                                                restrict_scaling_type=RestrictValueType.LOG_FP,
                                                                return_quant_tensor=True))),
        nn.Linear: lambda layer, bw: qnn.QuantLinear(in_features = layer.in_features, 
                                                    out_features = layer.out_features, 
                                                    cache_inference_bias = True,
                                                    weight_quant = Int8WeightPerTensorFloat,
                                                    bias_quant = Int32Bias,
                                                    bias = True,
-                                                    weight_bit_width = bw),
+                                                    
                                                    weight_quant = Int8WeightPerTensorFloat, 
                                                    weight_bit_width = bw,
                                                    return_quant_tensor=True),
        nn.ReLU: lambda _, bw: qnn.QuantReLU(bit_width = bw, 
                                            return_quant_tensor = True),
@@ -278,13 +318,11 @@ def convert_layer(layer, bit_width, layer_mapping):
        return layer
 # Function to convert a PyTorch model to a Brevitas model
-def convert_model(module, bit_widths, layer_mapping):
+def convert_model(module, bit_widths, layer_mapping, layer_idx = [0]):
    layer_idx = [0]
    brevitas_module = nn.Sequential()
    for name, layer in module.named_children():
        if list(layer.children()):  # If the layer has children, recurse
-            brevitas_module.add_module(name, convert_model(layer, bit_widths, layer_mapping))
+            brevitas_module.add_module(name, convert_model(layer, bit_widths, layer_mapping, layer_idx))
        else:
            layer_type = type(layer)
            if layer_type in [nn.Conv2d, nn.Linear]:
@@ -293,6 +331,7 @@ def convert_model(module, bit_widths, layer_mapping):
            else:
                bit_width = 8
            brevitas_module.add_module(name, convert_layer(layer, bit_width, layer_mapping))
    return brevitas_module
 class Quant_Model(nn.Module):
@@ -300,13 +339,15 @@ class Quant_Model(nn.Module):
        super(Quant_Model, self).__init__()
        if(input_sign):
            self.quant_inp = qnn.QuantIdentity(bit_width = 8, return_quant_tensor = True,
-                         act_quant = Uint8ActPerTensorFloat)
+                         act_quant = Uint8ActPerTensorFloat, scaling_min_val = 2e-16, 
                                        restrict_scaling_type = RestrictValueType.LOG_FP)
        else:
            self.quant_inp = qnn.QuantIdentity(bit_width = 8, return_quant_tensor = True,
-                         act_quant = Int8ActPerTensorFloat)
+                         act_quant = Int8ActPerTensorFloat, scaling_min_val = 2e-16, 
                                        restrict_scaling_type = RestrictValueType.LOG_FP)
-        self.sequential = convert_model(og_model, w, layer_mapping)
+        self.sequential = convert_model(og_model, w, layer_mapping, [0])
        self.o_quant =  qnn.QuantIdentity(bit_width = 8, return_quant_tensor = True)
    def forward(self, X):
@@ -315,6 +356,35 @@ class Quant_Model(nn.Module):
        X = self.o_quant(X)
        return F.log_softmax(X, dim = 1)
 def count_layers_in_sequential(module):
    """Tally the Conv2d and Linear layers held directly by each nn.Sequential.

    The module tree is walked depth-first in pre-order. Every nn.Sequential
    that is reached contributes one ``(conv_count, linear_count)`` tuple
    covering only its immediate children; nested containers add their own
    tuples later in the list.

    Returns:
        The list of tuples with its first entry removed, i.e. the first
        nn.Sequential reached by the walk is not reported.
    """
    per_sequential = []

    def _visit(node):
        members = list(node.children())
        if isinstance(node, nn.Sequential):
            n_conv = sum(1 for m in members if isinstance(m, nn.Conv2d))
            # A layer counted as Conv2d is never counted as Linear as well.
            n_linear = sum(1 for m in members
                           if isinstance(m, nn.Linear) and not isinstance(m, nn.Conv2d))
            per_sequential.append((n_conv, n_linear))
        # Every kind of module is descended into, Sequential or not.
        for member in members:
            _visit(member)

    _visit(module)
    return per_sequential[1:]
 def train_quant_model(quant_net, train_loader, val_loader = None, device = 'cpu',
                      epochs = 20, lr = 0.0001):
@@ -392,6 +462,7 @@ def dse(og_model, max_acc_drop, weights_per_layer, fp_accuracy, train_loader, te
        device = 'cpu', epochs = 5, lr = 0.0001):
    sign = calculate_minimum(train_loader) >= 0
    seq_counts = count_layers_in_sequential(og_model)
    if max_acc_drop is not None:
        print('\nDSE STARTING ... BINARY SEARCH')
@@ -402,6 +473,16 @@ def dse(og_model, max_acc_drop, weights_per_layer, fp_accuracy, train_loader, te
            mid = (low + high) // 2
            w = weights_per_layer[mid]
            f_w = []
            for i in range(len(seq_counts)):
                t_w = w[i]
                c,l = seq_counts[i]
                for j in range(c+l):
                    f_w.append(t_w)
            if(len(seq_counts) > 0):
                w = f_w
            # Create and train the quantized network
            layer_mapping = create_layer_mapping(w)
            quant_net = Quant_Model(og_model, w, layer_mapping, sign)
@@ -436,6 +517,16 @@ def dse(og_model, max_acc_drop, weights_per_layer, fp_accuracy, train_loader, te
        print('\nDSE STARTING ... EXHAUSTIVE SEARCH')
        test_accuracy = []
        for i, w in enumerate(weights_per_layer):
            f_w = []
            for i in range(len(seq_counts)):
                t_w = w[i]
                c,l = seq_counts[i]
                for j in range(c+l):
                    f_w.append(t_w)
            if(len(seq_counts) > 0):
                w = f_w
            layer_mapping = create_layer_mapping(w)
            quant_net = Quant_Model(og_model, w, layer_mapping, sign)
            quant_net = quant_net.to(device)
@@ -151,15 +151,154 @@ class Ibex_Lenet5(nn.Module):
        return X
 class Ibex_CMSIS_CNN(nn.Module):
    """Three 5x5 conv + max-pool stages followed by one linear layer.

    After every conv/linear layer the activations are rescaled with integer
    arithmetic: multiply by ``m``, add ``2**(s - 1)``, shift right by ``s``
    bits and clamp to ``[0, 255]``.

    Args:
        mul_vals: per-layer multipliers; indices 0..3 are used.
        shift_vals: per-layer shift amounts; indices 0..3 are used and each
            one is stored increased by 7.
    """

    def __init__(self, mul_vals, shift_vals):
        super(Ibex_CMSIS_CNN, self).__init__()
        self.m0 = mul_vals[0]
        self.m1 = mul_vals[1]
        self.m2 = mul_vals[2]
        self.m3 = mul_vals[3]
        self.s0 = shift_vals[0] + 7
        self.s1 = shift_vals[1] + 7
        self.s2 = shift_vals[2] + 7
        self.s3 = shift_vals[3] + 7
        # Layers are registered in the order conv1, conv2, conv3, linear1 so
        # that state_dict() keeps the same weight/bias ordering.
        self.conv1 = nn.Conv2d(in_channels = 3, out_channels = 32, kernel_size = 5, padding = 'same')
        self.max1 = nn.MaxPool2d(2,2)
        self.conv2 = nn.Conv2d(in_channels = 32, out_channels = 32, kernel_size = 5, padding = 'same')
        self.max2 = nn.MaxPool2d(2,2)
        self.conv3 = nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = 5, padding = 'same')
        self.max3 = nn.MaxPool2d(2,2)
        self.linear1 = nn.Linear(1024, 10)

    @staticmethod
    def _requantize(X, mul, shift):
        """Scale by ``mul``, round-shift right by ``shift`` bits, clamp to [0, 255]."""
        X = torch.mul(X, mul)
        rounding = torch.bitwise_left_shift(torch.tensor(1), shift - 1)
        # Cast by dtype only: torch.LongTensor / torch.FloatTensor are CPU
        # tensor types and would move the activations off the model's device.
        X = torch.add(X, rounding).to(torch.int64)
        X = torch.bitwise_right_shift(X, shift).to(torch.float32)
        return torch.clamp(X, min = 0, max = 255)

    def forward(self, X, print_out = False):
        """Run the network on ``X``; print the output when ``print_out`` is set."""
        X = self._requantize(self.conv1(X), self.m0, self.s0)
        X = self.max1(X)
        X = self._requantize(self.conv2(X), self.m1, self.s1)
        X = self.max2(X)
        X = self._requantize(self.conv3(X), self.m2, self.s2)
        X = self.max3(X)
        # Flatten everything except the batch dimension for the linear layer.
        X = X.reshape(X.shape[0], -1)
        X = self._requantize(self.linear1(X), self.m3, self.s3)
        if(print_out):
            print(X)
        return X
 class Ibex_DepthwiseBlock(nn.Module):
    """3x3 depthwise convolution followed by a 1x1 pointwise convolution.

    After each convolution the activations are rescaled with integer
    arithmetic: multiply by ``m``, add ``2**(s - 1)``, shift right by ``s``
    bits and clamp to ``[0, 255]``.

    Args:
        in_channels: channels of the input; also the depthwise group count.
        out_channels: channels produced by the pointwise convolution.
        mul_vals: two multipliers (depthwise, pointwise).
        shift_vals: two shift amounts (depthwise, pointwise); each one is
            stored increased by 7.
    """

    def __init__(self, in_channels, out_channels, mul_vals, shift_vals):
        super(Ibex_DepthwiseBlock, self).__init__()
        # groups == in_channels gives every input channel its own 3x3 filter.
        self.dw = nn.Conv2d(in_channels = in_channels, out_channels = in_channels,
                                    kernel_size = 3, padding = 1, groups = in_channels)
        self.pw = nn.Conv2d(in_channels = in_channels, out_channels = out_channels,
                                    kernel_size = 1, padding = 0)
        self.m0 = mul_vals[0]
        self.m1 = mul_vals[1]
        self.s0 = shift_vals[0] + 7
        self.s1 = shift_vals[1] + 7

    @staticmethod
    def _requantize(X, mul, shift):
        """Scale by ``mul``, round-shift right by ``shift`` bits, clamp to [0, 255]."""
        X = torch.mul(X, mul)
        rounding = torch.bitwise_left_shift(torch.tensor(1), shift - 1)
        # Cast by dtype only: torch.LongTensor / torch.FloatTensor are CPU
        # tensor types and would move the activations off the model's device.
        X = torch.add(X, rounding).to(torch.int64)
        X = torch.bitwise_right_shift(X, shift).to(torch.float32)
        return torch.clamp(X, min = 0, max = 255)

    def forward(self, X):
        """Apply the depthwise then the pointwise stage, rescaling after each."""
        X = self._requantize(self.dw(X), self.m0, self.s0)
        X = self._requantize(self.pw(X), self.m1, self.s1)
        return X
 class Ibex_Cifar10_Dws_CNN(nn.Module):
    """Six depthwise-separable blocks with three 2x2 max-pools and a linear head.

    Block ``k`` (0..5) receives ``mul_vals[2k:2k+2]`` and
    ``shift_vals[2k:2k+2]``; index 12 of both sequences belongs to the
    classifier, whose shift is stored increased by 7. The classifier takes
    ``256 * 4 * 4`` features and produces 10 outputs.

    Args:
        mul_vals: multipliers, at least 13 entries.
        shift_vals: shift amounts, at least 13 entries.
    """

    def __init__(self, mul_vals, shift_vals):
        super(Ibex_Cifar10_Dws_CNN, self).__init__()
        # features -> flatten -> classifier is also the registration order,
        # which fixes the ordering of the entries in state_dict().
        self.features = nn.Sequential(
            Ibex_DepthwiseBlock(3, 64, mul_vals[0:2], shift_vals[0:2]),
            Ibex_DepthwiseBlock(64, 64, mul_vals[2:4], shift_vals[2:4]),
            nn.MaxPool2d(kernel_size = 2, stride = 2),
            Ibex_DepthwiseBlock(64, 128, mul_vals[4:6], shift_vals[4:6]),
            Ibex_DepthwiseBlock(128, 128, mul_vals[6:8], shift_vals[6:8]),
            nn.MaxPool2d(kernel_size = 2, stride = 2),
            Ibex_DepthwiseBlock(128, 256, mul_vals[8:10], shift_vals[8:10]),
            Ibex_DepthwiseBlock(256, 256, mul_vals[10:12], shift_vals[10:12]),
            nn.MaxPool2d(kernel_size = 2, stride = 2)
        )
        self.flatten = nn.Flatten()
        self.classifier = nn.Sequential(
            nn.Linear(256 * 4 * 4, 10)
        )
        self.m_cl = mul_vals[12]
        self.s_cl = shift_vals[12] + 7

    def forward(self, x, print_out = False):
        """Run the network on ``x``; print the output when ``print_out`` is set."""
        x = self.features(x)
        x = self.flatten(x)
        x = self.classifier(x)
        # Integer rescale of the classifier output: multiply, add 2**(s-1),
        # shift right by s bits, clamp to [0, 255].
        x = torch.mul(x, self.m_cl)
        rounding = torch.bitwise_left_shift(torch.tensor(1), self.s_cl - 1)
        # Cast by dtype only: torch.LongTensor / torch.FloatTensor are CPU
        # tensor types and would move the activations off the model's device.
        x = torch.add(x, rounding).to(torch.int64)
        x = torch.bitwise_right_shift(x, self.s_cl).to(torch.float32)
        x = torch.clamp(x, min = 0, max = 255)
        if(print_out):
            print(x)
        return x
 def configure_network(ibex_model_dict, int_weights, int_biases):
    """Overwrite every entry of a state dict with integer weights and biases.

    Entries are taken in iteration order and treated as alternating
    weight/bias pairs: the entry at position ``2k`` receives
    ``int_weights[k]`` and the one at ``2k + 1`` receives ``int_biases[k]``,
    each wrapped in ``torch.tensor``.

    Returns:
        The same dict object, updated in place.
    """
    for position, key in enumerate(list(ibex_model_dict)):
        layer, is_bias = divmod(position, 2)
        values = int_biases if is_bias else int_weights
        ibex_model_dict[key] = torch.tensor(values[layer])
    return ibex_model_dict
 def create_fann_model(int_weights, int_biases, mul_vals, shift_vals):
    ibex_model = Ibex_FANN(mul_vals, shift_vals)
    ibex_model_dict = ibex_model.state_dict()
-    ibex_model_dict['linear1.weight'] = torch.tensor(int_weights[0])
+    ibex_model_dict = configure_network(ibex_model_dict, int_weights, int_biases)
    ibex_model_dict['linear2.weight'] = torch.tensor(int_weights[1])
    ibex_model_dict['linear1.bias'] = torch.tensor(int_biases[0])
    ibex_model_dict['linear2.bias'] = torch.tensor(int_biases[1])
    ibex_model.load_state_dict(ibex_model_dict)
    return ibex_model
@@ -168,15 +307,7 @@ def create_uci_model(int_weights, int_biases, mul_vals, shift_vals):
    ibex_model = Ibex_UCI_MLP(mul_vals, shift_vals)
    ibex_model_dict = ibex_model.state_dict()
-    ibex_model_dict['fc0.weight'] = torch.tensor(int_weights[0])
+    ibex_model_dict = configure_network(ibex_model_dict, int_weights, int_biases)
    ibex_model_dict['fc1.weight'] = torch.tensor(int_weights[1])
    ibex_model_dict['fc2.weight'] = torch.tensor(int_weights[2])
    ibex_model_dict['fc3.weight'] = torch.tensor(int_weights[3])
    ibex_model_dict['fc0.bias'] = torch.tensor(int_biases[0])
    ibex_model_dict['fc1.bias'] = torch.tensor(int_biases[1])
    ibex_model_dict['fc2.bias'] = torch.tensor(int_biases[2])
    ibex_model_dict['fc3.bias'] = torch.tensor(int_biases[3])
    ibex_model.load_state_dict(ibex_model_dict)
@@ -186,24 +317,33 @@ def create_lenet_model(int_weights, int_biases, mul_vals, shift_vals):
    ibex_model = Ibex_Lenet5(mul_vals, shift_vals)
    ibex_model_dict = ibex_model.state_dict()
-    ibex_model_dict['conv1.weight'] = torch.tensor(int_weights[0])
+    ibex_model_dict = configure_network(ibex_model_dict, int_weights, int_biases)
    ibex_model_dict['conv2.weight'] = torch.tensor(int_weights[1])
    ibex_model_dict['fc1.weight'] = torch.tensor(int_weights[2])
    ibex_model_dict['fc2.weight'] = torch.tensor(int_weights[3])
    ibex_model_dict['fc3.weight'] = torch.tensor(int_weights[4])
-    ibex_model_dict['conv1.bias'] = torch.tensor(int_biases[0])
+    ibex_model.load_state_dict(ibex_model_dict)
-    ibex_model_dict['conv2.bias'] = torch.tensor(int_biases[1])
+
-    ibex_model_dict['fc1.bias'] = torch.tensor(int_biases[2])
+    return ibex_model
-    ibex_model_dict['fc2.bias'] = torch.tensor(int_biases[3])
+
-    ibex_model_dict['fc3.bias'] = torch.tensor(int_biases[4])
+def create_cmsis_cnn_model(int_weights, int_biases, mul_vals, shift_vals):
    ibex_model = Ibex_CMSIS_CNN(mul_vals, shift_vals)
    ibex_model_dict = ibex_model.state_dict()
    ibex_model_dict = configure_network(ibex_model_dict, int_weights, int_biases)
    ibex_model.load_state_dict(ibex_model_dict)
    return ibex_model
 def create_ibex_dws_model(int_weights, int_biases, mul_vals, shift_vals):
    """Build an Ibex_Cifar10_Dws_CNN and load the given integer parameters.

    The model's state dict is filled by configure_network from int_weights
    and int_biases, then loaded back into the model, which is returned.
    """
    ibex_model = Ibex_Cifar10_Dws_CNN(mul_vals, shift_vals)
    populated_state = configure_network(ibex_model.state_dict(), int_weights, int_biases)
    ibex_model.load_state_dict(populated_state)
    return ibex_model
 def eval_sim_model(quant_model, ibex_model, test_loader):
    # Turn off gradients for validation
    with torch.no_grad():
        ibex_model.eval()
        correct = 0