Adding new features

2024-07-23 13:00:49 +03:00
parent 9e044fd7fc
commit 745cc4ed6d
28 changed files with 33632 additions and 106 deletions
@@ -0,0 +1,15 @@
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Generate a baremetal application
+
+# Name of the program; $(PROGRAM).c will be added as a source file
+
+PROGRAM = cifar10_dws_cnn
+PROGRAM_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+# Any extra source files to include in the build. Use the upper case .S
+# extension for assembly files
+EXTRA_SRCS :=
+
+include ${PROGRAM_DIR}/../../common/common.mk
@@ -0,0 +1,298 @@
+#include "simple_system_common.h"
+#include "cnn_weights.h"
+#include "fully_connected_opt.h"
+#include "ibex_cnn_params.h"
+#include "ibex_inputs.h"
+#include "conv2d_opt.h"
+#include "dws_conv_opt.h"
+
+/* The input buffer is declared as in[IMG_SZ][IMG_SZ][NUM_FIL0]. */
+#define IMG_SZ 32
+#define NUM_FIL0 1
+
+/* FILTERn: square kernel size (FILTERn x FILTERn) of convolution layer n. */
+#define FILTER1 3
+#define FILTER2 1
+#define FILTER3 3
+#define FILTER4 1
+#define FILTER5 3
+#define FILTER6 1
+#define FILTER7 3
+#define FILTER8 1
+#define FILTER9 3
+#define FILTER10 1
+#define FILTER11 3
+#define FILTER12 1
+
+/*
+ * NUM_FILn: filter count of convolution layer n; also used as the depth of
+ * that layer's output buffer.
+ * NOTE(review): the result is printed as four bytes per word, so these are
+ * presumably counts of packed 32-bit words rather than channels - confirm
+ * against the *_opt kernel headers.
+ */
+#define NUM_FIL1 1
+#define NUM_FIL2 16
+#define NUM_FIL3 16
+#define NUM_FIL4 16
+#define NUM_FIL5 16
+#define NUM_FIL6 32
+#define NUM_FIL7 32
+#define NUM_FIL8 32
+#define NUM_FIL9 32
+#define NUM_FIL10 64
+#define NUM_FIL11 64
+#define NUM_FIL12 64
+
+/* STRIDEn: stride argument passed to convolution layer n. */
+#define STRIDE1 1
+#define STRIDE2 1
+#define STRIDE3 1
+#define STRIDE4 1
+#define STRIDE5 1
+#define STRIDE6 1
+#define STRIDE7 1
+#define STRIDE8 1
+#define STRIDE9 1
+#define STRIDE10 1
+#define STRIDE11 1
+#define STRIDE12 1
+
+/*
+ * PAD_TBn / PAD_LRn: padding of convolution layer n. PAD_TBn enters the
+ * output-height formula and PAD_LRn the output-width formula; both are
+ * packed as {TB, TB, LR, LR} into the pad_* arrays.
+ */
+#define PAD_TB1 1
+#define PAD_LR1 1
+
+#define PAD_TB2 0
+#define PAD_LR2 0
+
+#define PAD_TB3 1
+#define PAD_LR3 1
+
+#define PAD_TB4 0
+#define PAD_LR4 0
+
+#define PAD_TB5 1
+#define PAD_LR5 1
+
+#define PAD_TB6 0
+#define PAD_LR6 0
+
+#define PAD_TB7 1
+#define PAD_LR7 1
+
+#define PAD_TB8 0
+#define PAD_LR8 0
+
+#define PAD_TB9 1
+#define PAD_LR9 1
+
+#define PAD_TB10 0
+#define PAD_LR10 0
+
+#define PAD_TB11 1
+#define PAD_LR11 1
+
+#define PAD_TB12 0
+#define PAD_LR12 0
+
+/* Window size and stride handed to the three maxpool2_compressed() calls. */
+#define POOL_STRIDE1 2
+#define POOL_SIZE1 2
+
+#define POOL_STRIDE2 2
+#define POOL_SIZE2 2
+
+#define POOL_STRIDE3 2
+#define POOL_SIZE3 2
+
+/* Number of ints in the final output vector `out`. */
+#define OUT_DIM 3
+
+/* Number of input samples processed by the main loop. */
+#define SAMPLES 1
+/* NOTE(review): `outs` is never read or written in this file - confirm it is still needed. */
+int outs[SAMPLES][OUT_DIM];
+
+/*
+ * Runs the network once per sample and prints the resulting output vector.
+ *
+ * Steps, in the order written below:
+ *   1. derive every layer's output shape from the macros above;
+ *   2. declare one local buffer per layer output, plus the pad / dimension
+ *      descriptor arrays each kernel call receives;
+ *   3. for each sample: copy it from `input` into `in`, run the layer chain
+ *      between pcount_enable(1) and pcount_enable(0), then print every word
+ *      of `out` as four hex bytes, most significant byte first.
+ *
+ * NOTE(review): all layer buffers are local variable-length arrays, so they
+ * presumably all live on the stack at the same time - confirm the stack of
+ * the target is large enough.
+ */
+void cifar10_dws_cnn() {
+
+	/* Convolution output size: ((in - kernel + 2 * pad) / stride) + 1. */
+	int dout1 = NUM_FIL1;
+	int hout1 = ((IMG_SZ - FILTER1 + 2 * PAD_TB1)/STRIDE1) + 1;
+	int wout1 = ((IMG_SZ - FILTER1 + 2 * PAD_LR1)/STRIDE1) + 1;
+
+	int dout2 = NUM_FIL2;
+	int hout2 = ((hout1 - FILTER2+ 2 * PAD_TB2)/STRIDE2)+1;
+	int wout2 = ((wout1 - FILTER2+ 2 * PAD_LR2)/STRIDE2)+1;
+
+	int dout3 = NUM_FIL3;
+	int hout3 = ((hout2 - FILTER3+ 2 * PAD_TB3)/STRIDE3)+1;
+	int wout3 = ((wout2 - FILTER3+ 2 * PAD_LR3)/STRIDE3)+1;
+
+	int dout4 = NUM_FIL4;
+	int hout4 = ((hout3 - FILTER4+ 2 * PAD_TB4)/STRIDE4)+1;
+	int wout4 = ((wout3 - FILTER4+ 2 * PAD_LR4)/STRIDE4)+1;
+
+	/* Pooling output: depth unchanged, height/width divided by the pool stride. */
+	/* From here on the buffer index runs one ahead of the macro index, */
+	/* because pooling outputs are numbered too (out6 comes from convolution 5). */
+	int dout5 = dout4;
+	int hout5 = hout4/POOL_STRIDE1;
+	int wout5 = wout4/POOL_STRIDE1;
+
+	int dout6 = NUM_FIL5;
+	int hout6 = ((hout5 - FILTER5+ 2 * PAD_TB5)/STRIDE5)+1;
+	int wout6 = ((wout5 - FILTER5+ 2 * PAD_LR5)/STRIDE5)+1;
+
+	int dout7 = NUM_FIL6;
+	int hout7 = ((hout6 - FILTER6+ 2 * PAD_TB6)/STRIDE6)+1;
+	int wout7 = ((wout6 - FILTER6+ 2 * PAD_LR6)/STRIDE6)+1;
+
+	int dout8 = NUM_FIL7;
+	int hout8 = ((hout7 - FILTER7+ 2 * PAD_TB7)/STRIDE7)+1;
+	int wout8 = ((wout7 - FILTER7+ 2 * PAD_LR7)/STRIDE7)+1;
+
+	int dout9 = NUM_FIL8;
+	int hout9 = ((hout8 - FILTER8+ 2 * PAD_TB8)/STRIDE8)+1;
+	int wout9 = ((wout8 - FILTER8+ 2 * PAD_LR8)/STRIDE8)+1;
+
+	/* Second pooling output; buffer indices are now two ahead of the macro indices. */
+	int dout10 = dout9;
+	int hout10 = hout9/POOL_STRIDE2;
+	int wout10 = wout9/POOL_STRIDE2;
+
+	int dout11 = NUM_FIL9;
+	int hout11 = ((hout10 - FILTER9+ 2 * PAD_TB9)/STRIDE9)+1;
+	int wout11 = ((wout10 - FILTER9+ 2 * PAD_LR9)/STRIDE9)+1;
+
+	int dout12 = NUM_FIL10;
+	int hout12 = ((hout11 - FILTER10+ 2 * PAD_TB10)/STRIDE10)+1;
+	int wout12 = ((wout11 - FILTER10+ 2 * PAD_LR10)/STRIDE10)+1;
+
+	int dout13 = NUM_FIL11;
+	int hout13 = ((hout12 - FILTER11+ 2 * PAD_TB11)/STRIDE11)+1;
+	int wout13 = ((wout12 - FILTER11+ 2 * PAD_LR11)/STRIDE11)+1;
+
+	int dout14 = NUM_FIL12;
+	int hout14 = ((hout13 - FILTER12+ 2 * PAD_TB12)/STRIDE12)+1;
+	int wout14 = ((wout13 - FILTER12+ 2 * PAD_LR12)/STRIDE12)+1;
+
+	/* Third pooling output. */
+	int dout15 = dout14;
+	int hout15 = hout14/POOL_STRIDE3;
+	int wout15 = wout14/POOL_STRIDE3;
+
+	/* Element count of the last pooled output; length of the flattened vector. */
+	int flatten_dim = dout15 * hout15 * wout15;
+
+	/* Input buffer and its {height, width, depth} descriptor. */
+	int in[IMG_SZ][IMG_SZ][NUM_FIL0];
+	int inp_dim[3] = {IMG_SZ, IMG_SZ, NUM_FIL0};
+
+	/*
+	 * Per layer: output buffer outN, padding pad_N = {TB, TB, LR, LR},
+	 * output shape outp_dimN = {height, width, depth} and filter shape
+	 * f_dimN = {filter count, kernel, kernel, previous layer's filter count}.
+	 * Pooling outputs (out5, out10, out15) only need a buffer and a shape.
+	 */
+	int out1[hout1][wout1][dout1];
+	int pad_1[4] = {PAD_TB1, PAD_TB1, PAD_LR1, PAD_LR1};
+	int outp_dim1[3] = {hout1, wout1, dout1};
+	int f_dim1[4] = {NUM_FIL1, FILTER1, FILTER1, NUM_FIL0};
+
+	int out2[hout2][wout2][dout2];
+	int pad_2[4] = {PAD_TB2, PAD_TB2, PAD_LR2, PAD_LR2};
+	int outp_dim2[3] = {hout2, wout2, dout2};
+	int f_dim2[4] = {NUM_FIL2, FILTER2, FILTER2, NUM_FIL1};
+
+	int out3[hout3][wout3][dout3];
+	int pad_3[4] = {PAD_TB3, PAD_TB3, PAD_LR3, PAD_LR3};
+	int outp_dim3[3] = {hout3, wout3, dout3};
+	int f_dim3[4] = {NUM_FIL3, FILTER3, FILTER3, NUM_FIL2};
+
+	int out4[hout4][wout4][dout4];
+	int pad_4[4] = {PAD_TB4, PAD_TB4, PAD_LR4, PAD_LR4};
+	int outp_dim4[3] = {hout4, wout4, dout4};
+	int f_dim4[4] = {NUM_FIL4, FILTER4, FILTER4, NUM_FIL3};
+
+	int out5[hout5][wout5][dout5];
+	int outp_dim5[3] = {hout5, wout5, dout5};
+
+	int out6[hout6][wout6][dout6];
+	int pad_6[4] = {PAD_TB5, PAD_TB5, PAD_LR5, PAD_LR5};
+	int outp_dim6[3] = {hout6, wout6, dout6};
+	int f_dim6[4] = {NUM_FIL5, FILTER5, FILTER5, NUM_FIL4};
+
+	int out7[hout7][wout7][dout7];
+	int pad_7[4] = {PAD_TB6, PAD_TB6, PAD_LR6, PAD_LR6};
+	int outp_dim7[3] = {hout7, wout7, dout7};
+	int f_dim7[4] = {NUM_FIL6, FILTER6, FILTER6, NUM_FIL5};
+
+	int out8[hout8][wout8][dout8];
+	int pad_8[4] = {PAD_TB7, PAD_TB7, PAD_LR7, PAD_LR7};
+	int outp_dim8[3] = {hout8, wout8, dout8};
+	int f_dim8[4] = {NUM_FIL7, FILTER7, FILTER7, NUM_FIL6};
+
+	int out9[hout9][wout9][dout9];
+	int pad_9[4] = {PAD_TB8, PAD_TB8, PAD_LR8, PAD_LR8};
+	int outp_dim9[3] = {hout9, wout9, dout9};
+	int f_dim9[4] = {NUM_FIL8, FILTER8, FILTER8, NUM_FIL7};
+
+	int out10[hout10][wout10][dout10];
+	int outp_dim10[3] = {hout10, wout10, dout10};
+
+	int out11[hout11][wout11][dout11];
+	int pad_11[4] = {PAD_TB9, PAD_TB9, PAD_LR9, PAD_LR9};
+	int outp_dim11[3] = {hout11, wout11, dout11};
+	int f_dim11[4] = {NUM_FIL9, FILTER9, FILTER9, NUM_FIL8};
+
+	int out12[hout12][wout12][dout12];
+	int pad_12[4] = {PAD_TB10, PAD_TB10, PAD_LR10, PAD_LR10};
+	int outp_dim12[3] = {hout12, wout12, dout12};
+	int f_dim12[4] = {NUM_FIL10, FILTER10, FILTER10, NUM_FIL9};
+
+	int out13[hout13][wout13][dout13];
+	int pad_13[4] = {PAD_TB11, PAD_TB11, PAD_LR11, PAD_LR11};
+	int outp_dim13[3] = {hout13, wout13, dout13};
+	int f_dim13[4] = {NUM_FIL11, FILTER11, FILTER11, NUM_FIL10};
+
+	int out14[hout14][wout14][dout14];
+	int pad_14[4] = {PAD_TB12, PAD_TB12, PAD_LR12, PAD_LR12};
+	int outp_dim14[3] = {hout14, wout14, dout14};
+	int f_dim14[4] = {NUM_FIL12, FILTER12, FILTER12, NUM_FIL11};
+
+	int out15[hout15][wout15][dout15];
+	int outp_dim15[3] = {hout15, wout15, dout15};
+
+	/* Flattened copy of out15, input of the final layer. */
+	int out16[flatten_dim];
+
+	/* Final output vector. */
+	int out[OUT_DIM];
+
+	for (int iter = 0; iter < SAMPLES; iter++){
+
+		/* Copy sample `iter` out of `input`, whose last index selects the sample. */
+		for(int i = 0; i < IMG_SZ; i++){
+			for(int j = 0; j < IMG_SZ; j++){
+				for(int k = 0; k < NUM_FIL0; k++){
+					in[i][j][k] = input[i][j][k][iter];
+				}
+			}
+		}
+
+		/* NOTE(review): presumably starts the performance counters so that only */
+		/* the layer chain is measured - confirm in simple_system_common.h. */
+		pcount_enable(1);
+
+		/* Stage 1: convolutions 1-4, then pooling into out5. */
+		dw_conv_opt_1ch(inp_dim, f_dim1, outp_dim1, in, F1, B1, out1, STRIDE1, pad_1, SB1, MV1, SV1);
+		pw_conv_2bits(outp_dim1, f_dim2, outp_dim2, out1, F2, B2, out2, STRIDE2, pad_2, SB2, MV2, SV2);
+		dw_conv_opt(outp_dim2, f_dim3, outp_dim3, out2, F3, B3, out3, STRIDE3, pad_3, SB3, MV3, SV3);
+		pw_conv_8bits(outp_dim3, f_dim4, outp_dim4, out3, F4, B4, out4, STRIDE4, pad_4, SB4, MV4, SV4);
+		maxpool2_compressed(outp_dim4, outp_dim5, out4, out5, POOL_SIZE1, POOL_STRIDE1);
+
+		/* Stage 2: convolutions 5-8, then pooling into out10. */
+		dw_conv_opt(outp_dim5, f_dim6, outp_dim6, out5, F5, B5, out6, STRIDE5, pad_6, SB5, MV5, SV5);
+		pw_conv_2bits(outp_dim6, f_dim7, outp_dim7, out6, F6, B6, out7, STRIDE6, pad_7, SB6, MV6, SV6);
+		dw_conv_opt(outp_dim7, f_dim8, outp_dim8, out7, F7, B7, out8, STRIDE7, pad_8, SB7, MV7, SV7);
+		pw_conv_8bits(outp_dim8, f_dim9, outp_dim9, out8, F8, B8, out9, STRIDE8, pad_9, SB8, MV8, SV8);
+		maxpool2_compressed(outp_dim9, outp_dim10, out9, out10, POOL_SIZE2, POOL_STRIDE2);
+
+		/* Stage 3: convolutions 9-12, then pooling into out15. */
+		dw_conv_opt(outp_dim10, f_dim11, outp_dim11, out10, F9, B9, out11, STRIDE9, pad_11, SB9, MV9, SV9);
+		pw_conv_8bits(outp_dim11, f_dim12, outp_dim12, out11, F10, B10, out12, STRIDE10, pad_12, SB10, MV10, SV10);
+		dw_conv_opt(outp_dim12, f_dim13, outp_dim13, out12, F11, B11, out13, STRIDE11, pad_13, SB11, MV11, SV11);
+		pw_conv_8bits(outp_dim13, f_dim14, outp_dim14, out13, F12, B12, out14, STRIDE12, pad_14, SB12, MV12, SV12);
+		maxpool2_compressed(outp_dim14, outp_dim15, out14, out15, POOL_SIZE3, POOL_STRIDE3);
+
+		flatten(outp_dim15, out15, out16);
+
+		/* Final layer: flatten_dim inputs, OUT_DIM outputs, layer-13 parameters. */
+		mlp_layer_8bits(out16, out, flatten_dim, OUT_DIM, W1, B13, SB13, MV13, SV13);
+
+		pcount_enable(0);
+
+		/* Print each output word as four bytes, bits 31..24 first and bits 7..0 last. */
+		puts("Output Layer Values:\n");
+		for(int i = 0; i < OUT_DIM; i++) {
+			puthex((out[i] & 0xFF000000) >> 24);
+			puts(" ");
+			puthex((out[i] & 0xFF0000) >> 16);
+			puts(" ");
+			puthex((out[i] & 0xFF00) >> 8);
+			puts(" ");
+			puthex(out[i] & 0xFF);
+			puts("\n");
+		}
+	}
+}
+
+/*
+ * Program entry point: calls pcount_enable(0), runs the network once and
+ * reports success. cifar10_dws_cnn() itself brackets the layer chain with
+ * pcount_enable(1) / pcount_enable(0).
+ */
+int main(void)
+{
+	pcount_enable(0);
+	cifar10_dws_cnn();
+	return 0;
+}
@@ -0,0 +1,84 @@
+#ifndef IBEX_CNN_PARAMS_H
+#define IBEX_CNN_PARAMS_H
+
+/*
+ * Per-layer constants, numbered by layer (1..13). The driver passes SBn,
+ * MVn and SVn as the last three arguments of each layer call.
+ * NOTE(review): these look like several small fields packed into one 32-bit
+ * word (e.g. MV1 == 0x4B4B4B4B) - confirm the layout against the *_opt
+ * kernel headers.
+ */
+#define MV1 1263225675
+#define MV2 1886417008
+#define MV3 1381126738
+#define MV4 1263225675
+#define MV5 1465341783
+#define MV6 1280068684
+#define MV7 1869573999
+#define MV8 1600085855
+#define MV9 1600085855
+#define MV10 1970632053
+#define MV11 1145324612
+#define MV12 1532713819
+#define MV13 1296911693
+
+/*
+ * NOTE(review): SV10 and SV13 are larger than 2147483647, so they do not fit
+ * a 32-bit signed int without conversion - confirm the type of the receiving
+ * kernel parameter.
+ */
+#define SV1 2029118401
+#define SV2 946921921
+#define SV3 2029118401
+#define SV4 1893843841
+#define SV5 1893843841
+#define SV6 1082196481
+#define SV7 2029118401
+#define SV8 2029118401
+#define SV9 2029118401
+#define SV10 2164392961
+#define SV11 2029118401
+#define SV12 2029118401
+#define SV13 2840765761
+
+static const int SB1[1] = {
+	1
+};
+
+static const int SB2[16] = {
+	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+};
+
+static const int SB3[16] = {
+	135266305, 1048577, 1, 8257, 8193, 135274497, 135266369, 8193, 1, 8193, 65, 1, 134217729, 1, 1, 134225921
+};
+
+static const int SB4[16] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1048576, 0, 0, 0, 0
+};
+
+static const int SB5[16] = {
+	134234177, 136323073, 135282689, 136331393, 270549121, 136331329, 136331329, 136323201, 270540929, 270549121, 270540801, 270532737, 2105473, 8321, 2105345, 2113601
+};
+
+static const int SB6[32] = {
+	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+};
+
+static const int SB7[32] = {
+	402669825, 540041217, 537927937, 4194561, 272638209, 537952513, 540049665, 541098049, 404783361, 405831873, 2113793, 536879361, 403726593, 540049665, 271614209, 541089921, 272662721, 271614209, 406880513, 541081793, 272662785, 538992897, 272662785, 403726593, 540033281, 540049601, 1081537, 403726337, 269517057, 272646401, 3178625, 539001089
+};
+
+static const int SB8[32] = {
+	270565504, 536895744, 406880512, 541090048, 268435712, 406864128, 540049408, 541065216, 406872320, 541090048, 540049600, 405823552, 540041216, 4227264, 540049664, 271589632, 537952320, 4219008, 540033216, 540041408, 541090048, 540049600, 405823552, 405823680, 405823744, 541081856, 406880448, 402677888, 271606016, 138445056, 403726400, 405831680
+};
+
+static const int SB9[32] = {
+	677380417, 542146817, 806404417, 677421249, 677429569, 810598721, 677421185, 677429441, 408977665, 675332353, 536903937, 675283329, 675307905, 677429505, 811639105, 811639169, 809549953, 407945601, 676380929, 676380993, 810582273, 810598721, 677429569, 675299649, 541106433, 811630785, 675316097, 405848449, 811630913, 811630977, 806404225, 677421441
+};
+
+static const int SB10[64] = {
+	139501824, 676364608, 673227072, 810582336, 405840256, 408969536, 541114624, 810590528, 810557760, 675307776, 676331840, 811630848, 408994112, 676381056, 810598720, 537952576, 541114432, 674267392, 542162944, 677429568, 408985920, 677429504, 542155136, 676372864, 811639104, 407937344, 542146880, 811630976, 406896832, 675332416, 675316096, 674275712, 677421120, 810590528, 540066112, 408969536, 811647232, 407920960, 273727616, 677421312, 810582272, 676340096, 6332736, 671138176, 677421376, 677429568, 676372544, 540066176, 676372800, 536912192, 406872384, 676372800, 805347712, 810590464, 5284160, 274776448, 677413248, 541089984, 674283520, 541106560, 810598720, 137412992, 810598528, 811639168
+};
+
+static const int SB11[64] = {
+	810607041, 678486337, 945865089, 810607041, 673227201, 939565505, 946913729, 943767937, 946913729, 811647425, 944816449, 678478273, 811647361, 678478209, 812695937, 678453697, 943776065, 810598849, 944800129, 677437825, 678486401, 946913665, 946921793, 541114753, 945873345, 542163265, 544260417, 544244033, 939548801, 945865025, 678478145, 944824577, 812704129, 5300673, 946889089, 676389057, 941679041, 675340609, 809558465, 273735937, 678461889, 678478145, 812695873, 676381121, 678486465, 671138113, 810557825, 945856961, 944775489, 946921665, 946897345, 809533889, 812695937, 812687809, 812696001, 945865089, 676389249, 677413249, 945840449, 946913473, 943767937, 675332353, 676381121, 811647425
+};
+
+static const int SB12[64] = {
+	810598784, 811647296, 677404992, 809550144, 677429632, 811647296, 810582400, 675332480, 676381056, 810598720, 542163200, 543211840, 809533440, 673235328, 807444672, 675316096, 810582016, 541114560, 677396800, 810590528, 676381056, 138453376, 809550208, 810598784, 676372800, 810598784, 675332352, 542163328, 674242944, 677421440, 404799808, 542163328, 809542016, 809542016, 810598784, 139501952, 674283712, 541114752, 811622784, 676372672, 542155136, 543211904, 811639168, 811630912, 809542016, 676356480, 673218944, 811630976, 810598720, 810582208, 675307584, 810598784, 543203648, 542163264, 677404672, 811630784, 810590592, 810582400, 674275712, 810590528, 541098304, 675332416, 539001088, 811622784
+};
+
+static const int SB13[3] = {
+	273736128, 946913728, 675282944
+};
+
+#endif /* IBEX_CNN_PARAMS_H */
@@ -0,0 +1,15 @@
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Generate a baremetal application
+
+# Name of the program; $(PROGRAM).c will be added as a source file
+
+PROGRAM = cifar10_dws_cnn
+PROGRAM_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+# Any extra source files to include in the build. Use the upper case .S
+# extension for assembly files
+EXTRA_SRCS :=
+
+include ${PROGRAM_DIR}/../../common/common.mk
@@ -0,0 +1,292 @@
+#include "simple_system_common.h"
+#include "cnn_weights.h"
+#include "fully_connected.h"
+#include "ibex_cnn_params.h"
+#include "ibex_inputs.h"
+#include "conv2d.h"
+#include "dws_conv.h"
+
+/* The input buffer is declared as in[IMG_SZ][IMG_SZ][NUM_FIL0]. */
+#define IMG_SZ 32
+#define NUM_FIL0 3
+
+/* FILTERn: square kernel size (FILTERn x FILTERn) of convolution layer n. */
+#define FILTER1 3
+#define FILTER2 1
+#define FILTER3 3
+#define FILTER4 1
+#define FILTER5 3
+#define FILTER6 1
+#define FILTER7 3
+#define FILTER8 1
+#define FILTER9 3
+#define FILTER10 1
+#define FILTER11 3
+#define FILTER12 1
+
+/*
+ * NUM_FILn: filter count of convolution layer n; also used as the depth of
+ * that layer's output buffer.
+ */
+#define NUM_FIL1 3
+#define NUM_FIL2 64
+#define NUM_FIL3 64
+#define NUM_FIL4 64
+#define NUM_FIL5 64
+#define NUM_FIL6 128
+#define NUM_FIL7 128
+#define NUM_FIL8 128
+#define NUM_FIL9 128
+#define NUM_FIL10 256
+#define NUM_FIL11 256
+#define NUM_FIL12 256
+
+/* STRIDEn: stride argument passed to convolution layer n. */
+#define STRIDE1 1
+#define STRIDE2 1
+#define STRIDE3 1
+#define STRIDE4 1
+#define STRIDE5 1
+#define STRIDE6 1
+#define STRIDE7 1
+#define STRIDE8 1
+#define STRIDE9 1
+#define STRIDE10 1
+#define STRIDE11 1
+#define STRIDE12 1
+
+/*
+ * PAD_TBn / PAD_LRn: padding of convolution layer n. PAD_TBn enters the
+ * output-height formula and PAD_LRn the output-width formula; both are
+ * packed as {TB, TB, LR, LR} into the pad_* arrays.
+ */
+#define PAD_TB1 1
+#define PAD_LR1 1
+
+#define PAD_TB2 0
+#define PAD_LR2 0
+
+#define PAD_TB3 1
+#define PAD_LR3 1
+
+#define PAD_TB4 0
+#define PAD_LR4 0
+
+#define PAD_TB5 1
+#define PAD_LR5 1
+
+#define PAD_TB6 0
+#define PAD_LR6 0
+
+#define PAD_TB7 1
+#define PAD_LR7 1
+
+#define PAD_TB8 0
+#define PAD_LR8 0
+
+#define PAD_TB9 1
+#define PAD_LR9 1
+
+#define PAD_TB10 0
+#define PAD_LR10 0
+
+#define PAD_TB11 1
+#define PAD_LR11 1
+
+#define PAD_TB12 0
+#define PAD_LR12 0
+
+/* Window size and stride handed to the three maxpool2() calls. */
+#define POOL_STRIDE1 2
+#define POOL_SIZE1 2
+
+#define POOL_STRIDE2 2
+#define POOL_SIZE2 2
+
+#define POOL_STRIDE3 2
+#define POOL_SIZE3 2
+
+/* Number of ints in the final output vector `out`. */
+#define OUT_DIM 10
+
+/* Number of input samples processed by the main loop. */
+#define SAMPLES 1
+/* NOTE(review): `outs` is never read or written in this file - confirm it is still needed. */
+int outs[SAMPLES][OUT_DIM];
+
+/*
+ * Runs the network once per sample and prints the resulting output vector.
+ *
+ * Steps, in the order written below:
+ *   1. derive every layer's output shape from the macros above;
+ *   2. declare one local buffer per layer output, plus the pad / dimension
+ *      descriptor arrays each kernel call receives;
+ *   3. for each sample: copy it from `input` into `in`, run the layer chain
+ *      between pcount_enable(1) and pcount_enable(0), then print every
+ *      element of `out` with puthex().
+ *
+ * NOTE(review): all layer buffers are local variable-length arrays, so they
+ * presumably all live on the stack at the same time - confirm the stack of
+ * the target is large enough.
+ */
+void cifar10_dws_cnn() {
+
+	/* Convolution output size: ((in - kernel + 2 * pad) / stride) + 1. */
+	int dout1 = NUM_FIL1;
+	int hout1 = ((IMG_SZ - FILTER1 + 2 * PAD_TB1)/STRIDE1) + 1;
+	int wout1 = ((IMG_SZ - FILTER1 + 2 * PAD_LR1)/STRIDE1) + 1;
+
+	int dout2 = NUM_FIL2;
+	int hout2 = ((hout1 - FILTER2+ 2 * PAD_TB2)/STRIDE2)+1;
+	int wout2 = ((wout1 - FILTER2+ 2 * PAD_LR2)/STRIDE2)+1;
+
+	int dout3 = NUM_FIL3;
+	int hout3 = ((hout2 - FILTER3+ 2 * PAD_TB3)/STRIDE3)+1;
+	int wout3 = ((wout2 - FILTER3+ 2 * PAD_LR3)/STRIDE3)+1;
+
+	int dout4 = NUM_FIL4;
+	int hout4 = ((hout3 - FILTER4+ 2 * PAD_TB4)/STRIDE4)+1;
+	int wout4 = ((wout3 - FILTER4+ 2 * PAD_LR4)/STRIDE4)+1;
+
+	/* Pooling output: depth unchanged, height/width divided by the pool stride. */
+	/* From here on the buffer index runs one ahead of the macro index, */
+	/* because pooling outputs are numbered too (out6 comes from convolution 5). */
+	int dout5 = dout4;
+	int hout5 = hout4/POOL_STRIDE1;
+	int wout5 = wout4/POOL_STRIDE1;
+
+	int dout6 = NUM_FIL5;
+	int hout6 = ((hout5 - FILTER5+ 2 * PAD_TB5)/STRIDE5)+1;
+	int wout6 = ((wout5 - FILTER5+ 2 * PAD_LR5)/STRIDE5)+1;
+
+	int dout7 = NUM_FIL6;
+	int hout7 = ((hout6 - FILTER6+ 2 * PAD_TB6)/STRIDE6)+1;
+	int wout7 = ((wout6 - FILTER6+ 2 * PAD_LR6)/STRIDE6)+1;
+
+	int dout8 = NUM_FIL7;
+	int hout8 = ((hout7 - FILTER7+ 2 * PAD_TB7)/STRIDE7)+1;
+	int wout8 = ((wout7 - FILTER7+ 2 * PAD_LR7)/STRIDE7)+1;
+
+	int dout9 = NUM_FIL8;
+	int hout9 = ((hout8 - FILTER8+ 2 * PAD_TB8)/STRIDE8)+1;
+	int wout9 = ((wout8 - FILTER8+ 2 * PAD_LR8)/STRIDE8)+1;
+
+	/* Second pooling output; buffer indices are now two ahead of the macro indices. */
+	int dout10 = dout9;
+	int hout10 = hout9/POOL_STRIDE2;
+	int wout10 = wout9/POOL_STRIDE2;
+
+	int dout11 = NUM_FIL9;
+	int hout11 = ((hout10 - FILTER9+ 2 * PAD_TB9)/STRIDE9)+1;
+	int wout11 = ((wout10 - FILTER9+ 2 * PAD_LR9)/STRIDE9)+1;
+
+	int dout12 = NUM_FIL10;
+	int hout12 = ((hout11 - FILTER10+ 2 * PAD_TB10)/STRIDE10)+1;
+	int wout12 = ((wout11 - FILTER10+ 2 * PAD_LR10)/STRIDE10)+1;
+
+	int dout13 = NUM_FIL11;
+	int hout13 = ((hout12 - FILTER11+ 2 * PAD_TB11)/STRIDE11)+1;
+	int wout13 = ((wout12 - FILTER11+ 2 * PAD_LR11)/STRIDE11)+1;
+
+	int dout14 = NUM_FIL12;
+	int hout14 = ((hout13 - FILTER12+ 2 * PAD_TB12)/STRIDE12)+1;
+	int wout14 = ((wout13 - FILTER12+ 2 * PAD_LR12)/STRIDE12)+1;
+
+	/* Third pooling output. */
+	int dout15 = dout14;
+	int hout15 = hout14/POOL_STRIDE3;
+	int wout15 = wout14/POOL_STRIDE3;
+
+	/* Element count of the last pooled output; length of the flattened vector. */
+	int flatten_dim = dout15 * hout15 * wout15;
+
+	/* Input buffer and its {height, width, depth} descriptor. */
+	int in[IMG_SZ][IMG_SZ][NUM_FIL0];
+	int inp_dim[3] = {IMG_SZ, IMG_SZ, NUM_FIL0};
+
+	/*
+	 * Per layer: output buffer outN, padding pad_N = {TB, TB, LR, LR},
+	 * output shape outp_dimN = {height, width, depth} and filter shape
+	 * f_dimN = {filter count, kernel, kernel, previous layer's filter count}.
+	 * Pooling outputs (out5, out10, out15) only need a buffer and a shape.
+	 */
+	int out1[hout1][wout1][dout1];
+	int pad_1[4] = {PAD_TB1, PAD_TB1, PAD_LR1, PAD_LR1};
+	int outp_dim1[3] = {hout1, wout1, dout1};
+	int f_dim1[4] = {NUM_FIL1, FILTER1, FILTER1, NUM_FIL0};
+
+	int out2[hout2][wout2][dout2];
+	int pad_2[4] = {PAD_TB2, PAD_TB2, PAD_LR2, PAD_LR2};
+	int outp_dim2[3] = {hout2, wout2, dout2};
+	int f_dim2[4] = {NUM_FIL2, FILTER2, FILTER2, NUM_FIL1};
+
+	int out3[hout3][wout3][dout3];
+	int pad_3[4] = {PAD_TB3, PAD_TB3, PAD_LR3, PAD_LR3};
+	int outp_dim3[3] = {hout3, wout3, dout3};
+	int f_dim3[4] = {NUM_FIL3, FILTER3, FILTER3, NUM_FIL2};
+
+	int out4[hout4][wout4][dout4];
+	int pad_4[4] = {PAD_TB4, PAD_TB4, PAD_LR4, PAD_LR4};
+	int outp_dim4[3] = {hout4, wout4, dout4};
+	int f_dim4[4] = {NUM_FIL4, FILTER4, FILTER4, NUM_FIL3};
+
+	int out5[hout5][wout5][dout5];
+	int outp_dim5[3] = {hout5, wout5, dout5};
+
+	int out6[hout6][wout6][dout6];
+	int pad_6[4] = {PAD_TB5, PAD_TB5, PAD_LR5, PAD_LR5};
+	int outp_dim6[3] = {hout6, wout6, dout6};
+	int f_dim6[4] = {NUM_FIL5, FILTER5, FILTER5, NUM_FIL4};
+
+	int out7[hout7][wout7][dout7];
+	int pad_7[4] = {PAD_TB6, PAD_TB6, PAD_LR6, PAD_LR6};
+	int outp_dim7[3] = {hout7, wout7, dout7};
+	int f_dim7[4] = {NUM_FIL6, FILTER6, FILTER6, NUM_FIL5};
+
+	int out8[hout8][wout8][dout8];
+	int pad_8[4] = {PAD_TB7, PAD_TB7, PAD_LR7, PAD_LR7};
+	int outp_dim8[3] = {hout8, wout8, dout8};
+	int f_dim8[4] = {NUM_FIL7, FILTER7, FILTER7, NUM_FIL6};
+
+	int out9[hout9][wout9][dout9];
+	int pad_9[4] = {PAD_TB8, PAD_TB8, PAD_LR8, PAD_LR8};
+	int outp_dim9[3] = {hout9, wout9, dout9};
+	int f_dim9[4] = {NUM_FIL8, FILTER8, FILTER8, NUM_FIL7};
+
+	int out10[hout10][wout10][dout10];
+	int outp_dim10[3] = {hout10, wout10, dout10};
+
+	int out11[hout11][wout11][dout11];
+	int pad_11[4] = {PAD_TB9, PAD_TB9, PAD_LR9, PAD_LR9};
+	int outp_dim11[3] = {hout11, wout11, dout11};
+	int f_dim11[4] = {NUM_FIL9, FILTER9, FILTER9, NUM_FIL8};
+
+	int out12[hout12][wout12][dout12];
+	int pad_12[4] = {PAD_TB10, PAD_TB10, PAD_LR10, PAD_LR10};
+	int outp_dim12[3] = {hout12, wout12, dout12};
+	int f_dim12[4] = {NUM_FIL10, FILTER10, FILTER10, NUM_FIL9};
+
+	int out13[hout13][wout13][dout13];
+	int pad_13[4] = {PAD_TB11, PAD_TB11, PAD_LR11, PAD_LR11};
+	int outp_dim13[3] = {hout13, wout13, dout13};
+	int f_dim13[4] = {NUM_FIL11, FILTER11, FILTER11, NUM_FIL10};
+
+	int out14[hout14][wout14][dout14];
+	int pad_14[4] = {PAD_TB12, PAD_TB12, PAD_LR12, PAD_LR12};
+	int outp_dim14[3] = {hout14, wout14, dout14};
+	int f_dim14[4] = {NUM_FIL12, FILTER12, FILTER12, NUM_FIL11};
+
+	int out15[hout15][wout15][dout15];
+	int outp_dim15[3] = {hout15, wout15, dout15};
+
+	/* Flattened copy of out15, input of the final layer. */
+	int out16[flatten_dim];
+
+
+	/* Final output vector. */
+	int out[OUT_DIM];
+
+	for (int iter = 0; iter < SAMPLES; iter++){
+
+		/* Copy sample `iter` out of `input`, whose last index selects the sample. */
+		for(int i = 0; i < IMG_SZ; i++){
+			for(int j = 0; j < IMG_SZ; j++){
+				for(int k = 0; k < NUM_FIL0; k++){
+					in[i][j][k] = input[i][j][k][iter];
+				}
+			}
+		}
+
+		/* NOTE(review): presumably starts the performance counters so that only */
+		/* the layer chain is measured - confirm in simple_system_common.h. */
+		pcount_enable(1);
+
+		/* Stage 1: convolutions 1-4, then pooling into out5. */
+		dw_conv(inp_dim, f_dim1, outp_dim1, in, F1, B1, out1, STRIDE1, pad_1, SB1, MV1, SV1);
+		pw_conv(outp_dim1, f_dim2, outp_dim2, out1, F2, B2, out2, STRIDE2, pad_2, SB2, MV2, SV2);
+		dw_conv(outp_dim2, f_dim3, outp_dim3, out2, F3, B3, out3, STRIDE3, pad_3, SB3, MV3, SV3);
+		pw_conv(outp_dim3, f_dim4, outp_dim4, out3, F4, B4, out4, STRIDE4, pad_4, SB4, MV4, SV4);
+		maxpool2(outp_dim4, outp_dim5, out4, out5, POOL_SIZE1, POOL_STRIDE1);
+
+		/* Stage 2: convolutions 5-8, then pooling into out10. */
+		dw_conv(outp_dim5, f_dim6, outp_dim6, out5, F5, B5, out6, STRIDE5, pad_6, SB5, MV5, SV5);
+		pw_conv(outp_dim6, f_dim7, outp_dim7, out6, F6, B6, out7, STRIDE6, pad_7, SB6, MV6, SV6);
+		dw_conv(outp_dim7, f_dim8, outp_dim8, out7, F7, B7, out8, STRIDE7, pad_8, SB7, MV7, SV7);
+		pw_conv(outp_dim8, f_dim9, outp_dim9, out8, F8, B8, out9, STRIDE8, pad_9, SB8, MV8, SV8);
+		maxpool2(outp_dim9, outp_dim10, out9, out10, POOL_SIZE2, POOL_STRIDE2);
+
+		/* Stage 3: convolutions 9-12, then pooling into out15. */
+		dw_conv(outp_dim10, f_dim11, outp_dim11, out10, F9, B9, out11, STRIDE9, pad_11, SB9, MV9, SV9);
+		pw_conv(outp_dim11, f_dim12, outp_dim12, out11, F10, B10, out12, STRIDE10, pad_12, SB10, MV10, SV10);
+		dw_conv(outp_dim12, f_dim13, outp_dim13, out12, F11, B11, out13, STRIDE11, pad_13, SB11, MV11, SV11);
+		pw_conv(outp_dim13, f_dim14, outp_dim14, out13, F12, B12, out14, STRIDE12, pad_14, SB12, MV12, SV12);
+		maxpool2(outp_dim14, outp_dim15, out14, out15, POOL_SIZE3, POOL_STRIDE3);
+
+		flatten(outp_dim15, out15, out16);
+
+		/* Final layer: flatten_dim inputs, OUT_DIM outputs, layer-13 parameters. */
+		mlp_layer(out16, out, flatten_dim, OUT_DIM, W1, B13, SB13, MV13, SV13);
+		pcount_enable(0);
+
+		/* One puthex() call per output element, one element per line. */
+		puts("Output Layer Values:\n");
+		for(int i = 0; i < OUT_DIM; i++) {
+			puthex(out[i]);
+			puts("\n");
+		}
+	}
+}
+
+/*
+ * Program entry point: calls pcount_enable(0), runs the network once and
+ * reports success. cifar10_dws_cnn() itself brackets the layer chain with
+ * pcount_enable(1) / pcount_enable(0).
+ */
+int main(void)
+{
+	pcount_enable(0);
+	cifar10_dws_cnn();
+	return 0;
+}
@@ -0,0 +1,46 @@
+#ifndef IBEX_CNN_PARAMS_H
+#define IBEX_CNN_PARAMS_H
+
+/*
+ * Per-layer constants, numbered by layer (1..13). The driver passes SBn,
+ * MVn and SVn as the last three arguments of each layer call.
+ * NOTE(review): pw_conv() in dws_conv.h names those three parameters
+ * bias_shift_mode, quantized_multiplier and out_shift_rl, so MVn is presumably
+ * the requantisation multiplier and SVn the right-shift amount - confirm for
+ * the other kernels.
+ */
+#define MV1 75
+#define MV2 112
+#define MV3 82
+#define MV4 75
+#define MV5 87
+#define MV6 76
+#define MV7 111
+#define MV8 95
+#define MV9 95
+#define MV10 117
+#define MV11 68
+#define MV12 91
+#define MV13 77
+
+#define SV1 15
+#define SV2 7
+#define SV3 15
+#define SV4 14
+#define SV5 14
+#define SV6 8
+#define SV7 15
+#define SV8 15
+#define SV9 15
+#define SV10 16
+#define SV11 15
+#define SV12 15
+#define SV13 21
+
+/* SBn is zero for every layer in this configuration. */
+#define SB1 0
+#define SB2 0
+#define SB3 0
+#define SB4 0
+#define SB5 0
+#define SB6 0
+#define SB7 0
+#define SB8 0
+#define SB9 0
+#define SB10 0
+#define SB11 0
+#define SB12 0
+#define SB13 0
+
+#endif /* IBEX_CNN_PARAMS_H */
@@ -0,0 +1,15 @@
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Generate a baremetal application
+
+# Name of the program; $(PROGRAM).c will be added as a source file
+
+PROGRAM = cmsis_cnn
+PROGRAM_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+# Any extra source files to include in the build. Use the upper case .S
+# extension for assembly files
+EXTRA_SRCS :=
+
+include ${PROGRAM_DIR}/../../common/common.mk
@@ -0,0 +1,153 @@
+#include "simple_system_common.h"
+#include "cnn_weights.h"
+#include "fully_connected_opt.h"
+#include "ibex_cnn_params.h"
+#include "ibex_inputs.h"
+#include "conv2d_opt.h"
+
+/* The input buffer is declared as in[IMG_SZ][IMG_SZ][NUM_FIL0]. */
+#define IMG_SZ 32
+#define NUM_FIL0 1
+
+/* FILTERn: square kernel size (FILTERn x FILTERn) of convolution layer n. */
+#define FILTER1 5
+#define FILTER2 5
+#define FILTER3 5
+
+/*
+ * NUM_FILn: filter count of convolution layer n; also used as the depth of
+ * that layer's output buffer.
+ * NOTE(review): the result is printed as four bytes per word, so these are
+ * presumably counts of packed 32-bit words rather than channels - confirm
+ * against the *_opt kernel headers.
+ */
+#define NUM_FIL1 8
+#define NUM_FIL2 8
+#define NUM_FIL3 16
+
+/* STRIDEn: stride argument passed to convolution layer n. */
+#define STRIDE1 1
+#define STRIDE2 1
+#define STRIDE3 1
+
+/*
+ * PAD_TBn / PAD_LRn: padding of convolution layer n. PAD_TBn enters the
+ * output-height formula and PAD_LRn the output-width formula; both are
+ * packed as {TB, TB, LR, LR} into the pad_* arrays.
+ */
+#define PAD_TB1 2
+#define PAD_LR1 2
+
+#define PAD_TB2 2
+#define PAD_LR2 2
+
+#define PAD_TB3 2
+#define PAD_LR3 2
+
+/* Window size and stride handed to the three maxpool2_compressed() calls. */
+#define POOL_STRIDE1 2
+#define POOL_SIZE1 2
+
+#define POOL_STRIDE2 2
+#define POOL_SIZE2 2
+
+#define POOL_STRIDE3 2
+#define POOL_SIZE3 2
+
+/* Number of ints in the final output vector `out`. */
+#define OUT_DIM 3
+
+/* Number of input samples processed by the main loop. */
+#define SAMPLES 1
+/* NOTE(review): `outs` is never read or written in this file - confirm it is still needed. */
+int outs[SAMPLES][OUT_DIM];
+
+/*
+ * Runs the network once per sample and prints the resulting output vector.
+ *
+ * Steps, in the order written below:
+ *   1. derive every layer's output shape from the macros above;
+ *   2. declare one local buffer per layer output, plus the pad / dimension
+ *      descriptor arrays each kernel call receives;
+ *   3. for each sample: copy it from `input` into `in`, run three
+ *      convolution + pooling pairs, flatten, run the final layer (all between
+ *      pcount_enable(1) and pcount_enable(0)), then print every word of
+ *      `out` as four hex bytes, most significant byte first.
+ *
+ * NOTE(review): all layer buffers are local variable-length arrays, so they
+ * presumably all live on the stack at the same time - confirm the stack of
+ * the target is large enough.
+ */
+void cmsis_cnn() {
+
+	/* Convolution output size: ((in - kernel + 2 * pad) / stride) + 1. */
+	int dout1 = NUM_FIL1;
+	int hout1 = ((IMG_SZ - FILTER1 + 2 * PAD_TB1)/STRIDE1) + 1;
+	int wout1 = ((IMG_SZ - FILTER1 + 2 * PAD_LR1)/STRIDE1) + 1;
+
+	/* Pooling output: depth unchanged, height/width divided by the pool stride. */
+	int dout2 = dout1;
+	int hout2 = hout1/POOL_STRIDE1;
+	int wout2 = wout1/POOL_STRIDE1;
+
+	/* Odd buffer indices are convolution outputs, even ones pooling outputs, */
+	/* so out3 belongs to convolution 2 and out5 to convolution 3. */
+	int dout3 = NUM_FIL2;
+	int hout3 = ((hout2 - FILTER2+ 2 * PAD_TB2)/STRIDE2)+1;
+	int wout3 = ((wout2 - FILTER2+ 2 * PAD_LR2)/STRIDE2)+1;
+
+	int dout4 = dout3;
+	int hout4 = hout3/POOL_STRIDE2;
+	int wout4 = wout3/POOL_STRIDE2;
+
+	int dout5 = NUM_FIL3;
+	int hout5 = ((hout4 - FILTER3+ 2 * PAD_TB3)/STRIDE3)+1;
+	int wout5 = ((wout4 - FILTER3+ 2 * PAD_LR3)/STRIDE3)+1;
+
+	int dout6 = dout5;
+	int hout6 = hout5/POOL_STRIDE3;
+	int wout6 = wout5/POOL_STRIDE3;
+
+	/* Element count of the last pooled output; length of the flattened vector. */
+	int flatten_dim = dout6 * hout6 * wout6;
+
+	/* Input buffer and its {height, width, depth} descriptor. */
+	int in[IMG_SZ][IMG_SZ][NUM_FIL0];
+	int inp_dim[3] = {IMG_SZ, IMG_SZ, NUM_FIL0};
+
+	/*
+	 * Convolution outputs carry a buffer, padding {TB, TB, LR, LR}, output
+	 * shape {height, width, depth} and filter shape
+	 * {filter count, kernel, kernel, previous layer's filter count};
+	 * pooling outputs only need a buffer and a shape.
+	 */
+	int out1[hout1][wout1][dout1];
+	int pad_1[4] = {PAD_TB1, PAD_TB1, PAD_LR1, PAD_LR1};
+	int outp_dim1[3] = {hout1, wout1, dout1};
+	int f_dim1[4] = {NUM_FIL1, FILTER1, FILTER1, NUM_FIL0};
+
+	int out2[hout2][wout2][dout2];
+	int outp_dim2[3] = {hout2, wout2, dout2};
+
+	int out3[hout3][wout3][dout3];
+	int pad_3[4] = {PAD_TB2, PAD_TB2, PAD_LR2, PAD_LR2};
+	int outp_dim3[3] = {hout3, wout3, dout3};
+	int f_dim3[4] = {NUM_FIL2, FILTER2, FILTER2, NUM_FIL1};
+
+	int out4[hout4][wout4][dout4];
+	int outp_dim4[3] = {hout4, wout4, dout4};
+
+	int out5[hout5][wout5][dout5];
+	int pad_5[4] = {PAD_TB3, PAD_TB3, PAD_LR3, PAD_LR3};
+	int outp_dim5[3] = {hout5, wout5, dout5};
+	int f_dim5[4] = {NUM_FIL3, FILTER3, FILTER3, NUM_FIL2};
+
+	int out6[hout6][wout6][dout6];
+	int outp_dim6[3] = {hout6, wout6, dout6};
+
+	/* Flattened copy of out6, input of the final layer. */
+	int out7[flatten_dim];
+
+	/* Final output vector. */
+	int out[OUT_DIM];
+
+	for (int iter = 0; iter < SAMPLES; iter++){
+
+		/* Copy sample `iter` out of `input`, whose last index selects the sample. */
+		for(int i = 0; i < IMG_SZ; i++){
+			for(int j = 0; j < IMG_SZ; j++){
+				for(int k = 0; k < NUM_FIL0; k++){
+					in[i][j][k] = input[i][j][k][iter];
+				}
+			}
+		}
+
+		/* NOTE(review): presumably starts the performance counters so that only */
+		/* the layer chain is measured - confirm in simple_system_common.h. */
+		pcount_enable(1);
+
+		conv2_8bits_1ch(inp_dim, f_dim1, outp_dim1, in, F1, B1, out1, STRIDE1, pad_1, SB1, MV1, SV1);
+		maxpool2_compressed(outp_dim1, outp_dim2, out1, out2, POOL_SIZE1, POOL_STRIDE1);
+
+		conv2_8bits(outp_dim2, f_dim3, outp_dim3, out2, F2, B2, out3, STRIDE2, pad_3, SB2, MV2, SV2);
+		maxpool2_compressed(outp_dim3, outp_dim4, out3, out4, POOL_SIZE2, POOL_STRIDE2);
+
+		conv2_2bits(outp_dim4, f_dim5, outp_dim5, out4, F3, B3, out5, STRIDE3, pad_5, SB3, MV3, SV3);
+		maxpool2_compressed(outp_dim5, outp_dim6, out5, out6, POOL_SIZE3, POOL_STRIDE3);
+
+		flatten(outp_dim6, out6, out7);
+
+		/* Final layer: flatten_dim inputs, OUT_DIM outputs, layer-4 parameters. */
+		mlp_layer_2bits(out7, out, flatten_dim, OUT_DIM, W1, B4, SB4, MV4, SV4);
+
+		pcount_enable(0);
+
+		/* Print each output word as four bytes, bits 31..24 first and bits 7..0 last. */
+		puts("Output Layer Values:\n");
+		for(int i = 0; i < OUT_DIM; i++) {
+			puthex((out[i] & 0xFF000000) >> 24);
+			puts(" ");
+			puthex((out[i] & 0xFF0000) >> 16);
+			puts(" ");
+			puthex((out[i] & 0xFF00) >> 8);
+			puts(" ");
+			puthex(out[i] & 0xFF);
+			puts("\n");
+		}
+	}
+}
+
+/*
+ * Program entry point: calls pcount_enable(0), runs the network once and
+ * reports success. cmsis_cnn() itself brackets the layer chain with
+ * pcount_enable(1) / pcount_enable(0).
+ */
+int main(void)
+{
+	pcount_enable(0);
+	cmsis_cnn();
+	return 0;
+}
@@ -0,0 +1,30 @@
+#ifndef IBEX_CNN_PARAMS_H
+#define IBEX_CNN_PARAMS_H
+
+/*
+ * Per-layer constants, numbered by layer (1..4). The driver passes SBn,
+ * MVn and SVn as the last three arguments of each layer call.
+ * NOTE(review): these look like several small fields packed into one 32-bit
+ * word - confirm the layout against the *_opt kernel headers.
+ */
+#define MV1 1953789044
+#define MV2 1229539657
+#define MV3 1212696648
+#define MV4 1330597711
+
+/*
+ * NOTE(review): SV1 and SV2 are larger than 2147483647, so they do not fit
+ * a 32-bit signed int without conversion - confirm the type of the receiving
+ * kernel parameter.
+ */
+#define SV1 2164392961
+#define SV2 2299667521
+#define SV3 1488020161
+#define SV4 1623294721
+
+static const int SB1[8] = {
+	812696004, 946880900, 1079034308, 946913796, 945865156, 1081139524, 946930052, 545309060
+};
+
+static const int SB2[8] = {
+	945873216, 945832320, 945865152, 944816576, 674283904, 543211776, 945873280, 944824704
+};
+
+static const int SB3[16] = {
+	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+};
+
+static const int SB4[3] = {
+	3, 3, 3
+};
+
+#endif /* IBEX_CNN_PARAMS_H */
@@ -0,0 +1,15 @@
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Generate a baremetal application
+
+# Name of the program; $(PROGRAM).c will be added as a source file
+
+PROGRAM = cmsis_cnn
+PROGRAM_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+# Any extra source files to include in the build. Use the upper case .S
+# extension for assembly files
+EXTRA_SRCS :=
+
+include ${PROGRAM_DIR}/../../common/common.mk
@@ -0,0 +1,147 @@
+#include "simple_system_common.h"
+#include "cnn_weights.h"
+#include "fully_connected.h"
+#include "ibex_cnn_params.h"
+#include "ibex_inputs.h"
+#include "conv2d.h"
+
+/* The input buffer is declared as in[IMG_SZ][IMG_SZ][NUM_FIL0]. */
+#define IMG_SZ 32
+#define NUM_FIL0 3
+
+/* FILTERn: square kernel size (FILTERn x FILTERn) of convolution layer n. */
+#define FILTER1 5
+#define FILTER2 5
+#define FILTER3 5
+
+/*
+ * NUM_FILn: filter count of convolution layer n; also used as the depth of
+ * that layer's output buffer.
+ */
+#define NUM_FIL1 32
+#define NUM_FIL2 32
+#define NUM_FIL3 64
+
+/* STRIDEn: stride argument passed to convolution layer n. */
+#define STRIDE1 1
+#define STRIDE2 1
+#define STRIDE3 1
+
+/*
+ * PAD_TBn / PAD_LRn: padding of convolution layer n. PAD_TBn enters the
+ * output-height formula and PAD_LRn the output-width formula; both are
+ * packed as {TB, TB, LR, LR} into the pad_* arrays.
+ */
+#define PAD_TB1 2
+#define PAD_LR1 2
+
+#define PAD_TB2 2
+#define PAD_LR2 2
+
+#define PAD_TB3 2
+#define PAD_LR3 2
+
+/* Window size and stride handed to the three maxpool2() calls. */
+#define POOL_STRIDE1 2
+#define POOL_SIZE1 2
+
+#define POOL_STRIDE2 2
+#define POOL_SIZE2 2
+
+#define POOL_STRIDE3 2
+#define POOL_SIZE3 2
+
+/* Number of ints in the final output vector `out`. */
+#define OUT_DIM 10
+
+/* Number of input samples processed by the main loop. */
+#define SAMPLES 1
+/* NOTE(review): `outs` is never read or written in this file - confirm it is still needed. */
+int outs[SAMPLES][OUT_DIM];
+
+/*
+ * Runs the network once per sample and prints the resulting output vector.
+ *
+ * Steps, in the order written below:
+ *   1. derive every layer's output shape from the macros above;
+ *   2. declare one local buffer per layer output, plus the pad / dimension
+ *      descriptor arrays each kernel call receives;
+ *   3. for each sample: copy it from `input` into `in`, run three
+ *      convolution + pooling pairs, flatten, run the final layer (all between
+ *      pcount_enable(1) and pcount_enable(0)), then print every element of
+ *      `out` with puthex().
+ *
+ * NOTE(review): all layer buffers are local variable-length arrays, so they
+ * presumably all live on the stack at the same time - confirm the stack of
+ * the target is large enough.
+ */
+void cmsis_cnn() {
+
+	/* Convolution output size: ((in - kernel + 2 * pad) / stride) + 1. */
+	int dout1 = NUM_FIL1;
+	int hout1 = ((IMG_SZ - FILTER1 + 2 * PAD_TB1)/STRIDE1) + 1;
+	int wout1 = ((IMG_SZ - FILTER1 + 2 * PAD_LR1)/STRIDE1) + 1;
+
+	/* Pooling output: depth unchanged, height/width divided by the pool stride. */
+	int dout2 = dout1;
+	int hout2 = hout1/POOL_STRIDE1;
+	int wout2 = wout1/POOL_STRIDE1;
+
+	/* Odd buffer indices are convolution outputs, even ones pooling outputs, */
+	/* so out3 belongs to convolution 2 and out5 to convolution 3. */
+	int dout3 = NUM_FIL2;
+	int hout3 = ((hout2 - FILTER2+ 2 * PAD_TB2)/STRIDE2)+1;
+	int wout3 = ((wout2 - FILTER2+ 2 * PAD_LR2)/STRIDE2)+1;
+
+	int dout4 = dout3;
+	int hout4 = hout3/POOL_STRIDE2;
+	int wout4 = wout3/POOL_STRIDE2;
+
+	int dout5 = NUM_FIL3;
+	int hout5 = ((hout4 - FILTER3+ 2 * PAD_TB3)/STRIDE3)+1;
+	int wout5 = ((wout4 - FILTER3+ 2 * PAD_LR3)/STRIDE3)+1;
+
+	int dout6 = dout5;
+	int hout6 = hout5/POOL_STRIDE3;
+	int wout6 = wout5/POOL_STRIDE3;
+
+	/* Element count of the last pooled output; length of the flattened vector. */
+	int flatten_dim = dout6 * hout6 * wout6;
+
+	/* Input buffer and its {height, width, depth} descriptor. */
+	int in[IMG_SZ][IMG_SZ][NUM_FIL0];
+	int inp_dim[3] = {IMG_SZ, IMG_SZ, NUM_FIL0};
+
+	/*
+	 * Convolution outputs carry a buffer, padding {TB, TB, LR, LR}, output
+	 * shape {height, width, depth} and filter shape
+	 * {filter count, kernel, kernel, previous layer's filter count};
+	 * pooling outputs only need a buffer and a shape.
+	 */
+	int out1[hout1][wout1][dout1];
+	int pad_1[4] = {PAD_TB1, PAD_TB1, PAD_LR1, PAD_LR1};
+	int outp_dim1[3] = {hout1, wout1, dout1};
+	int f_dim1[4] = {NUM_FIL1, FILTER1, FILTER1, NUM_FIL0};
+
+	int out2[hout2][wout2][dout2];
+	int outp_dim2[3] = {hout2, wout2, dout2};
+
+	int out3[hout3][wout3][dout3];
+	int pad_3[4] = {PAD_TB2, PAD_TB2, PAD_LR2, PAD_LR2};
+	int outp_dim3[3] = {hout3, wout3, dout3};
+	int f_dim3[4] = {NUM_FIL2, FILTER2, FILTER2, NUM_FIL1};
+
+	int out4[hout4][wout4][dout4];
+	int outp_dim4[3] = {hout4, wout4, dout4};
+
+	int out5[hout5][wout5][dout5];
+	int pad_5[4] = {PAD_TB3, PAD_TB3, PAD_LR3, PAD_LR3};
+	int outp_dim5[3] = {hout5, wout5, dout5};
+	int f_dim5[4] = {NUM_FIL3, FILTER3, FILTER3, NUM_FIL2};
+
+	int out6[hout6][wout6][dout6];
+	int outp_dim6[3] = {hout6, wout6, dout6};
+
+	/* Flattened copy of out6, input of the final layer. */
+	int out7[flatten_dim];
+
+
+	/* Final output vector. */
+	int out[OUT_DIM];
+
+	for (int iter = 0; iter < SAMPLES; iter++){
+
+		/* Copy sample `iter` out of `input`, whose last index selects the sample. */
+		for(int i = 0; i < IMG_SZ; i++){
+			for(int j = 0; j < IMG_SZ; j++){
+				for(int k = 0; k < NUM_FIL0; k++){
+					in[i][j][k] = input[i][j][k][iter];
+				}
+			}
+		}
+
+		/* NOTE(review): presumably starts the performance counters so that only */
+		/* the layer chain is measured - confirm in simple_system_common.h. */
+		pcount_enable(1);
+
+		conv2(inp_dim, f_dim1, outp_dim1, in, F1, B1, out1, STRIDE1, pad_1, SB1, MV1, SV1);
+		maxpool2(outp_dim1, outp_dim2, out1, out2, POOL_SIZE1, POOL_STRIDE1);
+
+		conv2(outp_dim2, f_dim3, outp_dim3, out2, F2, B2, out3, STRIDE2, pad_3, SB2, MV2, SV2);
+		maxpool2(outp_dim3, outp_dim4, out3, out4, POOL_SIZE2, POOL_STRIDE2);
+
+		conv2(outp_dim4, f_dim5, outp_dim5, out4, F3, B3, out5, STRIDE3, pad_5, SB3, MV3, SV3);
+		maxpool2(outp_dim5, outp_dim6, out5, out6, POOL_SIZE3, POOL_STRIDE3);
+
+		flatten(outp_dim6, out6, out7);
+
+		/* Final layer: flatten_dim inputs, OUT_DIM outputs, layer-4 parameters. */
+		mlp_layer(out7, out, flatten_dim, OUT_DIM, W1, B4, SB4, MV4, SV4);
+		pcount_enable(0);
+
+		/* One puthex() call per output element, one element per line. */
+		puts("Output Layer Values:\n");
+		for(int i = 0; i < OUT_DIM; i++) {
+			puthex(out[i]);
+			puts("\n");
+		}
+	}
+}
+
+/*
+ * Program entry point: calls pcount_enable(0), runs the network once and
+ * reports success. cmsis_cnn() itself brackets the layer chain with
+ * pcount_enable(1) / pcount_enable(0).
+ */
+int main(void)
+{
+	pcount_enable(0);
+	cmsis_cnn();
+	return 0;
+}
@@ -0,0 +1,19 @@
+#ifndef IBEX_CNN_PARAMS_H
+#define IBEX_CNN_PARAMS_H
+/* MVn: per-layer integer passed in the multiplier argument slot of the layer kernels. NOTE(review): presumably the quantized requantization multiplier - confirm against the generator. */
+#define MV1 116
+#define MV2 73
+#define MV3 72
+#define MV4 79
+/* SVn: per-layer integer passed in the shift argument slot of the layer kernels. NOTE(review): presumably the right-shift applied after the multiplier - confirm. */
+#define SV1 16
+#define SV2 17
+#define SV3 11
+#define SV4 12
+/* SBn: per-layer integer passed in the bias-shift-mode argument slot of the layer kernels; all zero here. */
+#define SB1 0
+#define SB2 0
+#define SB3 0
+#define SB4 0
+
+#endif /* IBEX_CNN_PARAMS_H */
@@ -0,0 +1,70 @@
+#ifndef DWS_CONV_H
+#define DWS_CONV_H
+
+/*
+ * Pointwise (1x1) convolution, plain-C reference version.
+ *
+ * in_dim / out_dim : {height, width, depth} of the input / output tensors.
+ * fil_dim          : only fil_dim[0] and fil_dim[3] are used here, as the two
+ *                    extents of the 2-D weight matrix fil.
+ * bias             : one accumulator start value per output channel.
+ * strides          : step between consecutive sampled input rows/columns.
+ * pad              : pad[0] offsets the first sampled row, pad[2] the first
+ *                    sampled column; the other entries are not read.
+ * bias_shift_mode  : accepted for signature parity, not used by this function.
+ *
+ * Every accumulator is multiplied by quantized_multiplier, rounded, shifted
+ * right by out_shift_rl and clamped to [0, 255] before being stored.
+ */
+void pw_conv(int in_dim[3], int fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]],
+	      const int fil[fil_dim[0]][fil_dim[3]], const int bias[],
+	      int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[],
+	      const int bias_shift_mode, const int quantized_multiplier, const int out_shift_rl){
+
+	for (int oc = 0; oc < out_dim[2]; oc++) {               /* output depth  */
+		for (int oy = 0; oy < out_dim[0]; oy++) {       /* output height */
+			const int iy = oy * strides - pad[0];
+
+			for (int ox = 0; ox < out_dim[1]; ox++) {   /* output width */
+				const int ix = ox * strides - pad[2];
+				int acc = bias[oc];
+
+				/* A sample point outside the input contributes only the bias. */
+				const int inside = (iy >= 0) && (iy < in_dim[0])
+						&& (ix >= 0) && (ix < in_dim[1]);
+				if (inside) {
+					for (int c = 0; c < fil_dim[3]; c++) {  /* filter depth */
+						acc += inp[iy][ix][c] * fil[oc][c];
+					}
+				}
+
+				/* Requantize: scale, round-to-nearest, shift, then clamp. */
+				int scaled = quantized_multiplier * acc + (1 << (out_shift_rl - 1));
+				scaled = scaled >> out_shift_rl;
+				if (scaled < 0) {
+					scaled = 0;
+				} else if (scaled > 255) {
+					scaled = 255;
+				}
+				out[oy][ox][oc] = scaled;
+			}
+		}
+	}
+}
+
+/*
+ * Depthwise convolution, plain-C reference version: output channel i is
+ * computed from input channel i only, using filter depthwise_fil[i].
+ *
+ * in_dim / out_dim   : {height, width, depth} of the input / output tensors.
+ * depthwise_fil_dim  : [1] and [2] give the filter height and width.
+ * bias               : one accumulator start value per channel.
+ * strides            : step between consecutive filter window origins.
+ * pad                : pad[0] offsets the first window row, pad[2] the first
+ *                      window column; the other entries are not read.
+ * bias_shift_mode    : accepted for signature parity, not used by this function.
+ *
+ * Every accumulator is multiplied by depthwise_multiplier, rounded, shifted
+ * right by depthwise_out_shift_rl and clamped to [0, 255] before being stored.
+ */
+void dw_conv(int in_dim[3], int depthwise_fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]],
+            const int depthwise_fil[depthwise_fil_dim[0]][depthwise_fil_dim[1]][depthwise_fil_dim[2]][1], const int bias[],
+			int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[],
+            const int bias_shift_mode, const int depthwise_multiplier, const int depthwise_out_shift_rl){
+
+	for (int ch = 0; ch < out_dim[2]; ch++) {               /* output depth  */
+		for (int oy = 0; oy < out_dim[0]; oy++) {       /* output height */
+			const int row0 = oy * strides - pad[0];
+
+			for (int ox = 0; ox < out_dim[1]; ox++) {   /* output width */
+				const int col0 = ox * strides - pad[2];
+				int acc = bias[ch];
+
+				for (int fy = 0; fy < depthwise_fil_dim[1]; fy++) {      /* filter height */
+					const int iy = row0 + fy;
+					if (iy < 0 || iy >= in_dim[0]) {
+						continue;   /* whole filter row lies in the padding */
+					}
+					for (int fx = 0; fx < depthwise_fil_dim[2]; fx++) {  /* filter width */
+						const int ix = col0 + fx;
+						if (ix < 0 || ix >= in_dim[1]) {
+							continue;   /* tap lies in the padding */
+						}
+						acc += inp[iy][ix][ch] * depthwise_fil[ch][fy][fx][0];
+					}
+				}
+
+				/* Requantize: scale, round-to-nearest, shift, then clamp. */
+				int scaled = depthwise_multiplier * acc + (1 << (depthwise_out_shift_rl - 1));
+				scaled = scaled >> depthwise_out_shift_rl;
+				if (scaled < 0) {
+					scaled = 0;
+				} else if (scaled > 255) {
+					scaled = 255;
+				}
+				out[oy][ox][ch] = scaled;
+			}
+		}
+	}
+}
+
+#endif  /* DWS_CONV_H */
@@ -0,0 +1,171 @@
+#ifndef DWS_CONV_OPT_H
+#define DWS_CONV_OPT_H
+
+/*
+ * Pointwise (1x1) convolution using the custom neur_init / nn_mac_8b /
+ * neur_res instructions.
+ *
+ * in_dim / out_dim : {height, width, depth} of the input / output tensors.
+ * fil              : weight matrix with 4 * fil_dim[3] words per output
+ *                    channel; four consecutive weight words are issued
+ *                    against every input word.
+ * bias             : one value per output channel, handed to neur_init.
+ * strides          : step between consecutive sampled input rows/columns.
+ * pad              : pad[0] offsets the first sampled row, pad[2] the first
+ *                    sampled column; the other entries are not read.
+ * bias_shift_mode  : one value per output channel, handed to neur_init.
+ * quantized_multiplier, out_shift_rl : handed to neur_res.
+ *
+ * NOTE(review): every asm statement declares res as output-only, so the
+ * running sum presumably lives in hardware state between instructions -
+ * confirm against the ISA extension. The instruction order is left untouched.
+ */
+void pw_conv_8bits(int in_dim[3], int fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]],
+	const int fil[fil_dim[0]][fil_dim[3] << 2], const int bias[fil_dim[0]],
+	int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[], const int bias_shift_mode[],
+	const int quantized_multiplier, const int out_shift_rl){
+
+	int i, j, k, m, res, str1, str2, bias_val, w, in_cnn;
+
+	for (i = 0; i < out_dim[2]; i++) {   // output depth
+		str1 = -pad[0] - strides;
+		for (j = 0; j < out_dim[0]; j++) {  // output height
+			/* Advance by the stride (was a hard-coded 1, which is only
+			 * right for strides == 1 given the "- strides" start value). */
+			str1 += strides;
+			str2 = -pad[2] - strides;
+			for (k = 0; k < out_dim[1]; k++) {  // output width
+				bias_val = bias[i];
+				str2 += strides;
+				asm volatile("neur_init %0, %1, %2\n":"=r"(res):"r"(bias_val),"r"(bias_shift_mode[i]):);
+
+				/* Same guard as pw_conv / dw_conv_opt: a sample point in the
+				 * padding region issues no MAC instead of reading out of bounds. */
+				if (str1 >= 0 && str1 < in_dim[0] && str2 >= 0 && str2 < in_dim[1]) {
+					for (m = 0; m < fil_dim[3]; m++) {   // filters depth
+						in_cnn = inp[str1][str2][m];
+						w = fil[i][4*m];
+						asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
+
+						w = fil[i][4*m+1];
+						asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
+
+						w = fil[i][4*m+2];
+						asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
+
+						w = fil[i][4*m+3];
+						asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
+					}
+				}
+				asm volatile("neur_res %0, %1, %2\n":"=r"(res):"r"(quantized_multiplier),"r"(out_shift_rl):);
+				out[j][k][i] = res;
+			}
+		}
+	}
+}
+
+/*
+ * Pointwise (1x1) convolution using the custom neur_init / nn_mac_4b /
+ * neur_res instructions.
+ *
+ * in_dim / out_dim : {height, width, depth} of the input / output tensors.
+ * fil              : weight matrix with 2 * fil_dim[3] words per output
+ *                    channel; two consecutive weight words are issued
+ *                    against every input word.
+ * bias             : one value per output channel, handed to neur_init.
+ * strides          : step between consecutive sampled input rows/columns.
+ * pad              : pad[0] offsets the first sampled row, pad[2] the first
+ *                    sampled column; the other entries are not read.
+ * bias_shift_mode  : one value per output channel, handed to neur_init.
+ * quantized_multiplier, out_shift_rl : handed to neur_res.
+ *
+ * NOTE(review): every asm statement declares res as output-only, so the
+ * running sum presumably lives in hardware state between instructions -
+ * confirm against the ISA extension. The instruction order is left untouched.
+ */
+void pw_conv_4bits(int in_dim[3], int fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]],
+	const int fil[fil_dim[0]][fil_dim[3] << 1], const int bias[fil_dim[0]],
+	int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[], const int bias_shift_mode[],
+	const int quantized_multiplier, const int out_shift_rl){
+
+	int i, j, k, m, res, str1, str2, bias_val, w, in_cnn;
+
+	for (i = 0; i < out_dim[2]; i++) {   // output depth
+		str1 = -pad[0] - strides;
+		for (j = 0; j < out_dim[0]; j++) {  // output height
+			/* Advance by the stride (was a hard-coded 1, which is only
+			 * right for strides == 1 given the "- strides" start value). */
+			str1 += strides;
+			str2 = -pad[2] - strides;
+			for (k = 0; k < out_dim[1]; k++) {  // output width
+				bias_val = bias[i];
+				str2 += strides;
+				asm volatile("neur_init %0, %1, %2\n":"=r"(res):"r"(bias_val),"r"(bias_shift_mode[i]):);
+
+				/* Same guard as pw_conv / dw_conv_opt: a sample point in the
+				 * padding region issues no MAC instead of reading out of bounds. */
+				if (str1 >= 0 && str1 < in_dim[0] && str2 >= 0 && str2 < in_dim[1]) {
+					for (m = 0; m < fil_dim[3]; m++) {   // filters depth
+						in_cnn = inp[str1][str2][m];
+						w = fil[i][2*m];
+						asm volatile("nn_mac_4b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
+
+						w = fil[i][2*m+1];
+						asm volatile("nn_mac_4b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
+					}
+				}
+				asm volatile("neur_res %0, %1, %2\n":"=r"(res):"r"(quantized_multiplier),"r"(out_shift_rl):);
+				out[j][k][i] = res;
+			}
+		}
+	}
+}
+
+/*
+ * Pointwise (1x1) convolution using the custom neur_init / nn_mac_2b /
+ * neur_res instructions.
+ *
+ * in_dim / out_dim : {height, width, depth} of the input / output tensors.
+ * fil              : weight matrix with fil_dim[3] words per output channel;
+ *                    one weight word is issued against every input word.
+ * bias             : one value per output channel, handed to neur_init.
+ * strides          : step between consecutive sampled input rows/columns.
+ * pad              : pad[0] offsets the first sampled row, pad[2] the first
+ *                    sampled column; the other entries are not read.
+ * bias_shift_mode  : one value per output channel, handed to neur_init.
+ * quantized_multiplier, out_shift_rl : handed to neur_res.
+ *
+ * NOTE(review): every asm statement declares res as output-only, so the
+ * running sum presumably lives in hardware state between instructions -
+ * confirm against the ISA extension. The instruction order is left untouched.
+ */
+void pw_conv_2bits(int in_dim[3], int fil_dim[4], int out_dim[3], int inp[in_dim[0]][in_dim[1]][in_dim[2]],
+	const int fil[fil_dim[0]][fil_dim[3]], const int bias[fil_dim[0]],
+	int out[out_dim[0]][out_dim[1]][out_dim[2]], int strides, int pad[], const int bias_shift_mode[],
+	const int quantized_multiplier, const int out_shift_rl){
+
+	int i, j, k, m, res, str1, str2, bias_val, w, in_cnn;
+
+	for (i = 0; i < out_dim[2]; i++) {   // output depth
+		str1 = -pad[0] - strides;
+		for (j = 0; j < out_dim[0]; j++) {  // output height
+			/* Advance by the stride (was a hard-coded 1, which is only
+			 * right for strides == 1 given the "- strides" start value). */
+			str1 += strides;
+			str2 = -pad[2] - strides;
+			for (k = 0; k < out_dim[1]; k++) {  // output width
+				bias_val = bias[i];
+				str2 += strides;
+				asm volatile("neur_init %0, %1, %2\n":"=r"(res):"r"(bias_val),"r"(bias_shift_mode[i]):);
+
+				/* Same guard as pw_conv / dw_conv_opt: a sample point in the
+				 * padding region issues no MAC instead of reading out of bounds. */
+				if (str1 >= 0 && str1 < in_dim[0] && str2 >= 0 && str2 < in_dim[1]) {
+					for (m = 0; m < fil_dim[3]; m++) {   // filters depth
+						in_cnn = inp[str1][str2][m];
+						w = fil[i][m];
+						asm volatile("nn_mac_2b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
+					}
+				}
+				asm volatile("neur_res %0, %1, %2\n":"=r"(res):"r"(quantized_multiplier),"r"(out_shift_rl):);
+				out[j][k][i] = res;
+			}
+		}
+	}
+}
+/* Depthwise convolution built on the custom neur_init / nn_mac_8b / neur_res instructions.
+ * For every output element [j][k][i] it issues neur_init with bias[i] and bias_shift_mode[i],
+ * one nn_mac_8b per in-bounds filter tap (weight depthwise_fil[i][p][n], input inp[k1][k2][i]),
+ * then neur_res with quantized_multiplier and out_shift_rl, and stores that result.
+ * in_dim / out_dim are {height, width, depth}; depthwise_fil_dim[1] and [2] are the filter
+ * height and width; pad[0] / pad[2] offset the first window row / column.
+ * NOTE(review): res is declared output-only in every asm statement, so the running sum
+ * presumably lives in hardware state - confirm against the ISA extension.
+ */
+void dw_conv_opt(int in_dim[3], int depthwise_fil_dim[4], int out_dim[3],
+	int inp[in_dim[0]][in_dim[1]][in_dim[2]], const int depthwise_fil[depthwise_fil_dim[0]][depthwise_fil_dim[1]][depthwise_fil_dim[2]],
+	const int bias[depthwise_fil_dim[0]], int out[out_dim[0]][out_dim[1]][out_dim[2]],
+	int strides, int pad[], const int bias_shift_mode[], const int quantized_multiplier, const int out_shift_rl){
+    
+	int i, j, k, n, p, res, k1, k2, str1, str2, bias_val, in_cnn, w;
+
+    	// Depthwise convolution
+    	for (i = 0; i < out_dim[2]; i++){   // output depth
+        	str1 = -pad[0] - strides;
+        	for (j = 0; j < out_dim[0]; j++) {  // output height
+            		str1 += strides;
+            		str2 = -pad[2] - strides;
+            		for (k = 0; k < out_dim[1]; k++) {  // output width
+                		bias_val = bias[i];
+                		str2 += strides;
+                		asm volatile("neur_init %0, %1, %2\n":"=r"(res):"r"(bias_val),"r"(bias_shift_mode[i]):);
+                		for (p = 0; p < depthwise_fil_dim[1]; p++) {  // depthwise filter height
+                    			for (n = 0; n < depthwise_fil_dim[2]; n++) {  // depthwise filter width
+                        			k1 = str1 + p; 
+                        			k2 = str2 + n;
+                        			/* Taps that fall outside the input (padding region) issue no MAC. */
+                        			if (k1 < in_dim[0] && k1 >= 0 && k2 >= 0 && k2 < in_dim[1]) { 
+                            				in_cnn = inp[k1][k2][i];
+                            				w = depthwise_fil[i][p][n];
+                            				asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
+                        			}
+                    			}
+                		}
+                		asm volatile("neur_res %0, %1, %2\n":"=r"(res):"r"(quantized_multiplier),"r"(out_shift_rl):);
+                    		out[j][k][i] = res;
+            		}
+        	}
+    	}
+}
+/* Single-channel variant of the depthwise convolution: there is no loop over depth, and
+ * only index 0 of bias, bias_shift_mode, depthwise_fil, the input depth axis and the output
+ * depth axis is ever accessed.
+ * For every output element [j][k][0] it issues neur_init, one nn_mac_8b per in-bounds
+ * filter tap, then neur_res with quantized_multiplier and out_shift_rl, and stores that result.
+ * NOTE(review): res is declared output-only in every asm statement, so the running sum
+ * presumably lives in hardware state - confirm against the ISA extension.
+ */
+void dw_conv_opt_1ch(int in_dim[3], int depthwise_fil_dim[4], int out_dim[3],
+	int inp[in_dim[0]][in_dim[1]][in_dim[2]], const int depthwise_fil[depthwise_fil_dim[0]][depthwise_fil_dim[1]][depthwise_fil_dim[2]],
+	const int bias[depthwise_fil_dim[0]], int out[out_dim[0]][out_dim[1]][out_dim[2]],
+	int strides, int pad[], const int bias_shift_mode[], const int quantized_multiplier, const int out_shift_rl){
+    
+        int j, k, n, p, res, k1, k2, str1, str2, bias_val, in_cnn, w;
+
+    	// Depthwise convolution
+        str1 = -pad[0] - strides;
+        for (j = 0; j < out_dim[0]; j++) {  // output height
+             str1 += strides;
+             str2 = -pad[2] - strides;
+             for (k = 0; k < out_dim[1]; k++) {  // output width
+                  bias_val = bias[0];
+                  str2 += strides;
+                  asm volatile("neur_init %0, %1, %2\n":"=r"(res):"r"(bias_val),"r"(bias_shift_mode[0]):);
+                  for (p = 0; p < depthwise_fil_dim[1]; p++) {  // depthwise filter height
+                       for (n = 0; n < depthwise_fil_dim[2]; n++) {  // depthwise filter width
+                            k1 = str1 + p; 
+                            k2 = str2 + n;
+                            /* Taps that fall outside the input (padding region) issue no MAC. */
+                            if (k1 < in_dim[0] && k1 >= 0 && k2 >= 0 && k2 < in_dim[1]) { 
+                                 in_cnn = inp[k1][k2][0];
+                            	 w = depthwise_fil[0][p][n];
+                            	 asm volatile("nn_mac_8b %0, %1,%2\n":"=r"(res):"r"(w),"r"(in_cnn):);
+                             }
+                        }
+                   }
+                   asm volatile("neur_res %0, %1, %2\n":"=r"(res):"r"(quantized_multiplier),"r"(out_shift_rl):);
+                   out[j][k][0] = res;
+           }
+      }
+}
+
+#endif  /* DWS_CONV_OPT_H */
@@ -0,0 +1,89 @@
+import init_utils
+import common
+
+# Initialize the environment; the returned name is later handed to common.create_ibex_qnn
+name = init_utils.initialize_environment(__file__)
+args = init_utils.get_args()
+
+# Command-line settings: allowed accuracy drop and the device to run on
+max_acc_drop = args.max_acc_drop
+device = args.device
+
+from sklearn.model_selection import train_test_split
+import torch.nn as nn
+import torch.nn.functional as F
+import tensorflow as tf
+import numpy as np
+
+# Load the CIFAR-10 dataset through the Keras dataset helper
+
+(X_train, y_train), (X_test, y_test) = tf.keras.datasets.cifar10.load_data()
+# Drop axis 1 of the label arrays
+y_train = np.squeeze(y_train, axis = 1)
+y_test = np.squeeze(y_test, axis = 1)
+# Hold out 15% of the training data for validation (no random_state, so the split differs between runs)
+X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.15)
+# Reorder the image axes to (0, 3, 1, 2); presumably channels-last -> channels-first for the PyTorch model. Pixel values are left unscaled.
+X_train = (np.transpose(X_train, (0,3,1,2)))
+X_test = (np.transpose(X_test, (0,3,1,2)))
+X_val = (np.transpose(X_val, (0,3,1,2)))
+# Training hyper-parameters forwarded to common.create_ibex_qnn
+BATCH_SIZE = 128
+epochs = 1
+lr = 0.0001
+
+class DepthwiseBlock(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(DepthwiseBlock, self).__init__()
+        
+        layers = []
+                    
+        layers.append(nn.Conv2d(in_channels = in_channels, out_channels = in_channels, 
+                                    kernel_size = 3, padding = 1, groups = in_channels))  # Depthwise convolution
+        
+        layers.append(nn.ReLU(inplace = True))
+        
+        layers.append(nn.Conv2d(in_channels = in_channels, out_channels = out_channels, 
+                                    kernel_size = 1, padding = 0))  # Pointwise convolution
+            
+        layers.append(nn.ReLU(inplace = True))
+                            
+        self.block = nn.Sequential(*layers)
+
+    def forward(self, x):
+        return self.block(x)
+    
+class Cifar10_Dws_CNN(nn.Module):
+    def __init__(self):
+        super(Cifar10_Dws_CNN, self).__init__()
+        self.features = nn.Sequential(
+            DepthwiseBlock(in_channels = 3, out_channels = 64),
+            DepthwiseBlock(in_channels = 64, out_channels = 64),
+            nn.MaxPool2d(kernel_size = 2, stride = 2),
+
+            DepthwiseBlock(in_channels = 64, out_channels = 128),
+            DepthwiseBlock(in_channels = 128, out_channels = 128),
+            nn.MaxPool2d(kernel_size = 2, stride = 2),
+            
+            DepthwiseBlock(in_channels = 128, out_channels = 256),
+            DepthwiseBlock(in_channels = 256, out_channels = 256),
+            nn.MaxPool2d(kernel_size = 2, stride = 2)
+        )
+        
+        self.flatten = nn.Flatten()
+        
+        self.classifier = nn.Sequential(
+            nn.Linear(256 * 4 * 4, 10)  # Assuming input size is (32, 32) and after 3 max pooling layers, the size is (4, 4)
+        )
+
+    def forward(self, x):
+        x = self.features(x)
+        x = self.flatten(x)
+        x = self.classifier(x)
+        return F.log_softmax(x, dim = 1)
+
+net = Cifar10_Dws_CNN()
+# Hand the model, the data splits and the training settings to common.create_ibex_qnn
+common.create_ibex_qnn(net, name, device, X_train, y_train, X_test, y_test, 
+                X_val = X_val, y_val = y_val, BATCH_SIZE = BATCH_SIZE, 
+                epochs = epochs, lr = lr, max_acc_drop = max_acc_drop)
@@ -0,0 +1,77 @@
+import init_utils
+import common
+
+# Initialize the environment; the returned name is later handed to common.create_ibex_qnn
+name = init_utils.initialize_environment(__file__)
+args = init_utils.get_args()
+
+# Command-line settings: allowed accuracy drop and the device to run on
+max_acc_drop = args.max_acc_drop
+device = args.device
+
+from sklearn.model_selection import train_test_split
+import torch.nn as nn
+import torch.nn.functional as F
+import tensorflow as tf
+import numpy as np
+
+# Load the CIFAR-10 dataset through the Keras dataset helper and drop axis 1 of the labels
+
+(X_train, y_train), (X_test, y_test) = tf.keras.datasets.cifar10.load_data()
+y_train = np.squeeze(y_train, axis = 1)
+y_test = np.squeeze(y_test, axis = 1)
+# Hold out 15% of the training data for validation (no random_state, so the split differs between runs)
+X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.15)
+# Reorder the image axes to (0, 3, 1, 2), then shift by 128 and divide by 255
+X_train = (np.transpose(X_train, (0,3,1,2)) - 128.0)/255.0
+X_test = (np.transpose(X_test, (0,3,1,2)) - 128.0)/255.0
+X_val = (np.transpose(X_val, (0,3,1,2)) - 128.0)/255.0
+# Training hyper-parameters forwarded to common.create_ibex_qnn
+BATCH_SIZE = 32
+epochs = 1
+lr = 0.0001
+
+class CMSIS_CNN(nn.Module):
+    def __init__(self):
+        super(CMSIS_CNN, self).__init__()
+        self.conv1 = nn.Conv2d(in_channels = 3, out_channels = 32, kernel_size = 5, padding = 2)
+        self.relu1 = nn.ReLU()
+        self.max1 = nn.MaxPool2d(2,2)
+        self.d1 = nn.Dropout(p = 0.25)
+        
+        self.conv2 = nn.Conv2d(in_channels = 32, out_channels = 32, kernel_size = 5, padding = 2)
+        self.relu2 = nn.ReLU()
+        self.max2 = nn.MaxPool2d(2,2)
+        self.d2 = nn.Dropout(p = 0.25)
+        
+        self.conv3 = nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = 5, padding = 2)
+        self.relu3 = nn.ReLU()
+        self.max3 = nn.MaxPool2d(2,2)
+        self.d3 = nn.Dropout(p = 0.4)
+        
+        self.flatten = nn.Flatten()
+        self.linear1 = nn.Linear(1024, 10)
+        
+    def forward(self,X):
+        X = self.relu1((self.conv1(X)))
+        X = self.max1(X)
+        X = self.d1(X)
+        
+        X = self.relu2((self.conv2(X)))
+        X = self.max2(X)
+        X = self.d2(X)
+        
+        X = self.relu3((self.conv3(X)))
+        X = self.max3(X)
+        X = self.d3(X)
+        
+        X = self.flatten(X)
+
+        X = self.linear1(X)
+        return F.log_softmax(X, dim = 1)
+
+net = CMSIS_CNN()
+# Hand the model, the data splits and the training settings to common.create_ibex_qnn
+common.create_ibex_qnn(net, name, device, X_train, y_train, X_test, y_test, 
+                X_val = X_val, y_val = y_val, BATCH_SIZE = BATCH_SIZE, 
+                epochs = epochs, lr = lr, max_acc_drop = max_acc_drop)
@@ -125,3 +125,13 @@ def create_ibex_qnn(net, name, device, X_train, y_train, X_test, y_test, X_val =
        print('\nSIMULATING MODEL ON IBEX CORE\nUSE THE OUTPUTS TO VERIFY THAT THE RESULTS ARE CORRECT !!')
        ibex_model = simulate_ibex.create_lenet_model(int_weights, int_og_bias, mul_vals, shift_vals)
        simulate_ibex.eval_sim_model(quant_net, ibex_model, test_loader)
+        
+    elif(name == 'cmsis_cnn'):
+        print('\nSIMULATING MODEL ON IBEX CORE\nUSE THE OUTPUTS TO VERIFY THAT THE RESULTS ARE CORRECT !!')
+        ibex_model = simulate_ibex.create_cmsis_cnn_model(int_weights, int_og_bias, mul_vals, shift_vals)
+        simulate_ibex.eval_sim_model(quant_net, ibex_model, test_loader)
+
+    elif(name == 'cifar10_dws_cnn'):
+        print('\nSIMULATING MODEL ON IBEX CORE\nUSE THE OUTPUTS TO VERIFY THAT THE RESULTS ARE CORRECT !!')
+        ibex_model = simulate_ibex.create_ibex_dws_model(int_weights, int_og_bias, mul_vals, shift_vals)
+        simulate_ibex.eval_sim_model(quant_net, ibex_model, test_loader)
@@ -27,26 +27,34 @@ def quantize_multiplier(real_multiplier):
    return quantized_multiplier, right_shift

 def get_int_params(quant_net):
+    
    int_weights = []
    int_bias = []
    in_scales = []
    act_scales = []
    
-    for _, module in quant_net.sequential.named_children():
-        if hasattr(module, 'weight') and module.weight is not None:
-            int_weights.append(module.int_weight().cpu().numpy())
-            int_bias.append(module.int_bias().cpu().numpy())
-            in_scales.append(module.quant_bias_scale().cpu().detach().numpy())
+    def extract_quant_params(module):
+        for name, submodule in module.named_children():
+            # Check if the submodule has weights and append them if present
+            if hasattr(submodule, 'weight') and submodule.weight is not None:
+                int_weights.append(submodule.int_weight().cpu().detach().numpy())
+                int_bias.append(submodule.int_bias().cpu().detach().numpy())
+                in_scales.append(submodule.quant_bias_scale().cpu().detach().numpy())

-        if hasattr(module, 'quant_act_scale') and module.quant_act_scale is not None:
-            act_scales.append(module.quant_act_scale().cpu().detach().numpy())
+            # Check if the submodule has activation scale and append it if present
+            if hasattr(submodule, 'quant_act_scale') and submodule.quant_act_scale is not None:
+                act_scales.append(submodule.quant_act_scale().cpu().detach().numpy())

-    act_scales.append(quant_net.o_quant.quant_act_scale().cpu().detach().numpy())
+            # Recursively extract parameters from the children modules
+            extract_quant_params(submodule)
+
+    # Start extraction from the top-level module
+    extract_quant_params(quant_net)
    
    mul_vals, shift_vals = [], []
    
-    for i in range(len(act_scales)):
-        M = in_scales[i]/act_scales[i]
+    for i in range(len(act_scales)-1):
+        M = in_scales[i]/act_scales[i+1]
        mul, shift = quantize_multiplier(M[0])
        mul_vals.append(mul)
        shift_vals.append(shift)
@@ -87,7 +95,12 @@ def decide_mode(network, weight_bit_width, input_uint8 = True):
    for name, module in network.named_modules():
        if isinstance(module, layer_types_py):
            layer_type_name = module.__class__.__name__
-            if(layer_type_name == 'Conv2d' or layer_type_name == 'Linear' or layer_type_name == 'DepthwiseConv2d'):
+            if(layer_type_name == 'Linear'):
+                layer_type.append(layer_type_name)
+            if(layer_type_name == 'Conv2d'):
+                if(module.groups == module.in_channels):
+                    layer_type.append('DepthwiseConv2d')
+                else:
                    layer_type.append(layer_type_name)
            else:
                if(layer_type_name == 'ReLU' or layer_type_name == 'Sigmoid'):
@@ -96,13 +109,13 @@ def decide_mode(network, weight_bit_width, input_uint8 = True):
        
    for i in range(len(weight_bit_width)):
        signed_input = 4 * input_sign[i]
+        if(layer_type[i] == 'DepthwiseConv2d'):
+                mode_per_layer.append(signed_input + 1)
+        else:
            if(weight_bit_width[i] == 2):
                mode_per_layer.append(signed_input + 3)
            elif(weight_bit_width[i] == 4):
                mode_per_layer.append(signed_input + 2)
-        else:
-            if(layer_type[i] == 'DepthwiseConv2d'):
-                mode_per_layer.append(signed_input + 1)
            else:
                mode_per_layer.append(signed_input)

@@ -161,6 +174,7 @@ def pad_inputs_weights(quant_net, test_loader, mode_per_layer,
            else:
                new_size_0 = a * 4
        
+            if((mode_per_layer[i] != 1) and (mode_per_layer[i] != 5)):
                b = w.shape[1] // 4
                if(w.shape[1] % 4 != 0):
                    new_size_1 = (b + 1) * 4
@@ -170,6 +184,12 @@ def pad_inputs_weights(quant_net, test_loader, mode_per_layer,
                new_w = np.zeros((new_size_0, new_size_1, w.shape[2], w.shape[3])).astype(np.int8)
                new_w[:w.shape[0], :w.shape[1], :, :] = w
            
+            else:
+                new_size_1 = 1
+                new_w = np.zeros((new_size_0, new_size_1, w.shape[2], w.shape[3])).astype(np.int8)
+                new_w[:w.shape[0], :w.shape[1], :, :] = w
+                new_w = np.squeeze(new_w, axis = 1)
+                
        padded_int_weights.append(new_w)

    padded_int_biases = []
@@ -325,6 +345,15 @@ def concat_inputs_weights(mode_per_layer, padded_input, padded_int_weights, padd
                        comb = combine_values(vector)
                        new_mat[i][j] = comb

+        elif(len(dims) == 3):
+            new_mat = np.zeros((int(dims[0]//4), dims[1], dims[2]), dtype = np.int64)
+            for i in range(int(dims[0]//4)):
+                    for j in range(dims[1]):
+                        for k in range(dims[2]):
+                            vector = layer_weight[4*i : 4*(i+1), j, k]
+                            comb = combine_values(vector)
+                            new_mat[i][j][k] = comb
+                            
        elif(len(dims) == 4):
            if((mode_per_layer[iter] == 0) | (mode_per_layer[iter] == 4)):
                new_mat = np.zeros((int(dims[0]//4), dims[1], dims[2], dims[3]), dtype = np.int64)
@@ -602,9 +631,17 @@ def save_cnn_net_params(path, int_weights, int_biases, mul_vals, shift_vals, shi
            dims = np.shape(int_weights[k])
            mat = int_weights[k]   
            
+            if(len(dims) == 2 or ((len(dims) == 4) and dims[2] == dims[3] == 1)):
+                f.write('static const int ')
                if(len(dims) == 2):
                    wi += 1
-                st = 'static const int W' + str(wi) + '[' + str(dims[0]) + ']' + '[' + str(dims[1]) + '] = {\n'
+                    f.write('W' + str(wi))                
+                else:
+                    mat = np.squeeze(mat, axis = (2,3))
+                    fi += 1
+                    f.write('F' + str(fi))
+                    
+                st = '[' + str(dims[0]) + ']' + '[' + str(dims[1]) + '] = {\n'
                f.write(st)
                for n in range(dims[0]):
                    f.write('\t{')
@@ -619,6 +656,32 @@ def save_cnn_net_params(path, int_weights, int_biases, mul_vals, shift_vals, shi
                    f.write('\n')
                f.write('};\n\n')
            
+            elif (len(dims) == 3):
+                dims = np.shape(mat)
+                fi += 1
+                st = 'static const int F' + str(fi) + '[' + str(dims[0]) + '][' + str(dims[1])
+                st += '][' + str(dims[2]) + '] = {\n'
+                f.write(st)
+
+                for n in range(dims[0]):
+                    f.write('\t{\n')
+                    for l in range(dims[1]):
+                        f.write('\t\t{')
+                        for h in range(dims[2] - 1):
+                            f.write(str(mat[n][l][h]) + ', ')
+                        if dims[2] != 1:
+                            f.write(str(mat[n][l][dims[2] - 1]) + '}')
+                        else:
+                            f.write(str(mat[n][l][0]) + '}')
+                        if (l != dims[1] - 1):
+                            f.write(',')
+                        f.write('\n')
+                    f.write('\t}')
+                    if n != dims[0] - 1:
+                        f.write(',')
+                    f.write('\n')
+                f.write('};\n\n')
+            
            elif(len(dims) == 4):
                mat = np.transpose(mat, (0, 2, 3, 1))
                dims = np.shape(mat)
@@ -856,9 +919,11 @@ def generate_opt_c_code_mlp(path, name, int_weights, optimal_config, type_of_lay
        f.write('\t' + name + '();\n\n')
        f.write('\treturn 0;\n}')

-def get_cnn_details(model):
+def get_cnn_details(module, details = None):
+    if details is None:
        details = []
-    for layer in model.children():
+
+    for layer in module.children():
        if isinstance(layer, nn.Conv2d):
            details.append({
                "layer_type": "Conv2d",
@@ -866,10 +931,11 @@ def get_cnn_details(model):
                "out_channels": layer.out_channels,
                "kernel_size": layer.kernel_size,
                "stride": layer.stride,
-                "padding": layer.padding
+                "padding": layer.padding,
+                "groups": layer.groups
            })

-        elif (isinstance(layer, nn.MaxPool2d)):
+        elif isinstance(layer, nn.MaxPool2d):
            details.append({
                "layer_type": "MaxPool2d",
                "kernel_size": layer.kernel_size,
@@ -877,7 +943,7 @@ def get_cnn_details(model):
                "padding": layer.padding
            })

-        elif (isinstance(layer, nn.AvgPool2d)):
+        elif isinstance(layer, nn.AvgPool2d):
            details.append({
                "layer_type": "AvgPool2d",
                "kernel_size": layer.kernel_size,
@@ -891,6 +957,10 @@ def get_cnn_details(model):
                "in_features": layer.in_features,
                "out_features": layer.out_features
            })
+
+        # Recursively apply to children modules
+        get_cnn_details(layer, details)
+
    return details

 def generate_og_c_code_cnn(path, name, input, cnn_details, int_weights):
@@ -900,10 +970,17 @@ def generate_og_c_code_cnn(path, name, input, cnn_details, int_weights):
        f.write('#include "fully_connected.h"\n')
        f.write('#include "ibex_cnn_params.h"\n')
        f.write('#include "ibex_inputs.h"\n')
-        f.write('#include "conv2d.h"\n\n')
+        f.write('#include "conv2d.h"\n')

-        f.write('#define IMG_SZ ' + str(input.shape[2]) + '\n')
-        f.write('#define NUM_FIL0 ' + str(int_weights[0].shape[1]) + '\n\n')
+        for detail in cnn_details[:-1]:
+            if detail["layer_type"] == "Conv2d":
+                if(detail["in_channels"] == detail["out_channels"] == detail["groups"] != 1):
+                    f.write('#include "dws_conv.h"\n')
+                    break
+        
+        f.write('\n')
+        f.write('#define IMG_SZ ' + str(np.shape(input)[2]) + '\n')
+        f.write('#define NUM_FIL0 ' + str(np.shape(input)[1]) + '\n\n')
        i = 1
        for w in int_weights:
            if(len(np.shape(w)) == 4):
@@ -1050,11 +1127,17 @@ def generate_og_c_code_cnn(path, name, input, cnn_details, int_weights):

        for detail in cnn_details[:-1]:
            if detail["layer_type"] == "Conv2d":
+                if(detail["in_channels"] == detail["out_channels"] == detail["groups"] != 1):
+                    conv_type = 'dw_conv'
+                elif(detail["kernel_size"][0] == 1):
+                    conv_type = 'pw_conv'
+                else:
+                    conv_type = "conv2"
                if(i == 1):
-                    f.write('\t\tconv2(inp_dim, f_dim1, outp_dim1, in, F1, B1, ')
+                    f.write('\t\t' + conv_type + '(inp_dim, f_dim1, outp_dim1, in, F1, B1, ')
                    f.write('out1, STRIDE1, pad_1, SB1, MV1, SV1);')
                else:
-                    f.write('\t\tconv2(outp_dim' + str(i-1) + ', f_dim' + str(i) + ', outp_dim' + str(i))
+                    f.write('\t\t' + conv_type + '(outp_dim' + str(i-1) + ', f_dim' + str(i) + ', outp_dim' + str(i))
                    f.write(', out' + str(i-1) + ', F' + str(fi) + ', B' + str(fi) + ', out' + str(i))
                    f.write(', STRIDE' + str(fi) + ', pad_' + str(i) + ', SB' + str(fi))
                    f.write(', MV' + str(fi) + ', SV' + str(fi) + ');')
@@ -1091,6 +1174,13 @@ def generate_og_c_code_cnn(path, name, input, cnn_details, int_weights):
            f.write('\n')
            i += 1
        
+        if flatten == 0:
+            f.write('\t\tflatten(outp_dim' + str(i-1) + ', out' + str(i-1) + ', out' + str(i) + ');\n\n')
+            i += 1
+            f.write('\t\tmlp_layer(out' + str(i-1) + ', out, flatten_dim, OUT_DIM, ')
+            f.write('W1, B' + str(fi + dn - 1) +  ', SB' + str(fi + dn - 1) + ', MV' + str(fi + dn - 1))
+            f.write(', SV' + str(fi + dn - 1) + ');')
+        else:
            f.write('\t\tmlp_layer(out' + str(i-1) + ', out, DENSE_DIM' + str(dn-1))
            f.write(', OUT_DIM, W' + str(dn) + ', B' + str(fi + dn - 1))
            f.write(', SB' + str(fi + dn - 1) + ', MV' + str(fi + dn - 1))
@@ -1119,13 +1209,21 @@ def generate_opt_c_code_cnn(path, name, input, cnn_details, int_weights, optimal
        f.write('#include "fully_connected_opt.h"\n')
        f.write('#include "ibex_cnn_params.h"\n')
        f.write('#include "ibex_inputs.h"\n')
-        f.write('#include "conv2d_opt.h"\n\n')
+        f.write('#include "conv2d_opt.h"\n')
+        
+        for detail in cnn_details[:-1]:
+            if detail["layer_type"] == "Conv2d":
+                if(detail["in_channels"] == detail["out_channels"] == detail["groups"] != 1):
+                    f.write('#include "dws_conv_opt.h"\n')
+                    break
+                
+        f.write('\n')
        
        f.write('#define IMG_SZ ' + str(np.shape(input)[2]) + '\n')
-        f.write('#define NUM_FIL0 ' + str(np.shape(input)[0]) + '\n\n')
+        f.write('#define NUM_FIL0 ' + str(np.shape(input)[1]) + '\n\n')
        i = 1
        for w in int_weights:
-            if(len(np.shape(w)) == 4):
+            if(len(np.shape(w)) == 4 or len(np.shape(w)) == 3):
                f.write('#define FILTER' + str(i) + ' ' + str(w.shape[2]) + '\n')
                i += 1

@@ -1133,7 +1231,7 @@ def generate_opt_c_code_cnn(path, name, input, cnn_details, int_weights, optimal
        
        i = 1
        for w in int_weights:
-            if(len(np.shape(w)) == 4):
+            if(len(np.shape(w)) == 4 or len(np.shape(w)) == 3):
                f.write('#define NUM_FIL' + str(i) + ' ' + str(w.shape[0]) + '\n')
                i += 1

@@ -1270,14 +1368,21 @@ def generate_opt_c_code_cnn(path, name, input, cnn_details, int_weights, optimal

        for detail in cnn_details[:-1]:
            if detail["layer_type"] == "Conv2d":
+                if(detail["in_channels"] == detail["out_channels"] == detail["groups"] != 1):
+                    conv_type = 'dw_conv_opt'
+                elif(detail["kernel_size"][0] == 1):
+                    conv_type = 'pw_conv_' + str(optimal_config[j]) + 'bits'
+                else:
+                    conv_type = 'conv2_' + str(optimal_config[j]) + 'bits'
+                    
                if(i == 1):
-                    f.write('\t\tconv2_' + str(optimal_config[j]) + 'bits')
-                    if(np.shape(input)[0] == 1):
+                    f.write('\t\t' + conv_type)
+                    if(np.shape(input)[1] == 1):
                        f.write('_1ch')
                    f.write('(inp_dim, f_dim1, outp_dim1, in, F1, B1, ')
                    f.write('out1, STRIDE1, pad_1, SB1, MV1, SV1);')
                else:
-                    f.write('\t\tconv2_' + str(optimal_config[j]) + 'bits(outp_dim' + str(i-1) + ', f_dim' + str(i))
+                    f.write('\t\t' + conv_type + '(outp_dim' + str(i-1) + ', f_dim' + str(i))
                    f.write(', outp_dim' + str(i) + ', out' + str(i-1) + ', F' + str(fi) + ', B' + str(fi) + ', out')
                    f.write(str(i) + ', STRIDE' + str(fi) + ', pad_' + str(i) + ', SB' + str(fi))
                    f.write(', MV' + str(fi) + ', SV' + str(fi) + ');')
@@ -1314,11 +1419,19 @@ def generate_opt_c_code_cnn(path, name, input, cnn_details, int_weights, optimal
            f.write('\n')
            i += 1
        
+        if flatten == 0:
+            f.write('\t\tflatten(outp_dim' + str(i-1) + ', out' + str(i-1) + ', out' + str(i) + ');\n\n')
+            i += 1
+            f.write('\t\tmlp_layer_' + str(optimal_config[j]) + 'bits(out' + str(i-1) + ', out, ')
+            f.write('flatten_dim, OUT_DIM, W1, B' + str(fi + dn - 1) +  ', SB' + str(fi + dn - 1) + ', MV')
+            f.write(str(fi + dn - 1) + ', SV' + str(fi + dn - 1) + ');\n')
+        else:
            f.write('\t\tmlp_layer_' + str(optimal_config[-1]) + 'bits(out' + str(i-1) + ', out, DENSE_DIM' + str(dn-1))
            f.write(', OUT_DIM, W' + str(dn) + ', B' + str(fi + dn - 1))
            f.write(', SB' + str(fi + dn - 1) + ', MV' + str(fi + dn - 1))
            f.write(', SV' + str(fi + dn - 1) + ');\n')

+        f.write('\n\t\tpcount_enable(0);\n\n')
        f.write('\t\tputs("Output Layer Values:\\n");\n')
        f.write('\t\tfor(int i = 0; i < OUT_DIM; i++) {\n')
        f.write('\t\t\tputhex((out[i] & 0xFF000000) >> 24);\n')
@@ -11,6 +11,9 @@ from torch import nn, optim

 import brevitas.nn as qnn
 from brevitas.quant import *
+from brevitas.core.restrict_val import RestrictValueType
+
+from collections import defaultdict
 from torchinfo import summary

 def net_input_size(X_train):
@@ -202,7 +205,21 @@ def generate_sequences(length, values = [2, 4, 8]):

 def create_weight_confs(macc_per_layer):
    total_macc_opt = []
-    weights_per_layer = generate_sequences(len(macc_per_layer))
+    
+    cc = 0 
+    idx = []
+    
+    if(len(macc_per_layer) >= 6):
+        for i, mpl in enumerate(macc_per_layer):
+            if(mpl/max(macc_per_layer) < 0.05):
+                cc += 1
+                idx.append(i)
+    
+    weights_per_layer = generate_sequences(len(macc_per_layer) - cc)
+    
+    for w in weights_per_layer:
+        for i in idx:
+            w.insert(i, 8)
    
    for w_conf in weights_per_layer:
        macc = 0
@@ -234,24 +251,47 @@ def create_weight_confs(macc_per_layer):
 # Define a mapping from PyTorch layers to Brevitas layers
 def create_layer_mapping(bit_width):
    mapping = {
-        nn.Conv2d: lambda layer, bw: qnn.QuantConv2d(in_channels = layer.in_channels, 
+        nn.Conv2d: lambda layer, bw: (qnn.QuantConv2d(in_channels=layer.in_channels, 
                                                        out_channels=layer.out_channels, 
                                                        kernel_size=layer.kernel_size, 
                                                        stride=layer.stride[0], 
                                                        padding=layer.padding,
+                                                        groups=layer.groups,
                                                        bias=True,
                                                        cache_inference_bias=True,
                                                        bias_quant=Int32Bias,
                                                        weight_bit_width=bw,
-                                                    weight_quant = Int8WeightPerTensorFloat),
+                                                        weight_quant=Int8WeightPerTensorFloat,
+                                                        weight_scaling_min_val=2e-16,
+                                                        restrict_scaling_type=RestrictValueType.LOG_FP,
+                                                        return_quant_tensor=True
+                                                        ) if layer.groups != layer.in_channels else (
+                                                            # Special case for depthwise convolutions
+                                        qnn.QuantConv2d(in_channels=layer.in_channels, 
+                                                                out_channels=layer.out_channels, 
+                                                                kernel_size=layer.kernel_size, 
+                                                                stride=layer.stride[0], 
+                                                                padding=layer.padding,
+                                                                groups=layer.groups,
+                                                                bias=True,
+                                                                cache_inference_bias=True,
+                                                                bias_quant=Int32Bias,
+                                                                weight_bit_width=8,  # Fixed bit width for depthwise convolutions
+                                                                weight_quant=Int8WeightPerTensorFloat,
+                                                                weight_scaling_min_val=2e-16,
+                                                                restrict_scaling_type=RestrictValueType.LOG_FP,
+                                                                return_quant_tensor=True))),

        nn.Linear: lambda layer, bw: qnn.QuantLinear(in_features = layer.in_features, 
                                                    out_features = layer.out_features, 
+                                                     
                                                    cache_inference_bias = True,
-                                                    weight_quant = Int8WeightPerTensorFloat,
                                                    bias_quant = Int32Bias,
                                                    bias = True,
-                                                    weight_bit_width = bw),
+                                                    
+                                                    weight_quant = Int8WeightPerTensorFloat, 
+                                                    weight_bit_width = bw,
+                                                    return_quant_tensor=True),

        nn.ReLU: lambda _, bw: qnn.QuantReLU(bit_width = bw, 
                                            return_quant_tensor = True),
@@ -278,13 +318,11 @@ def convert_layer(layer, bit_width, layer_mapping):
        return layer

 # Function to convert a PyTorch model to a Brevitas model
-def convert_model(module, bit_widths, layer_mapping):
-    layer_idx = [0]
+def convert_model(module, bit_widths, layer_mapping, layer_idx = [0]):
    brevitas_module = nn.Sequential()
-
    for name, layer in module.named_children():
        if list(layer.children()):  # If the layer has children, recurse
-            brevitas_module.add_module(name, convert_model(layer, bit_widths, layer_mapping))
+            brevitas_module.add_module(name, convert_model(layer, bit_widths, layer_mapping, layer_idx))
        else:
            layer_type = type(layer)
            if layer_type in [nn.Conv2d, nn.Linear]:
@@ -293,6 +331,7 @@ def convert_model(module, bit_widths, layer_mapping):
            else:
                bit_width = 8
            brevitas_module.add_module(name, convert_layer(layer, bit_width, layer_mapping))
+
    return brevitas_module

 class Quant_Model(nn.Module):
@@ -300,13 +339,15 @@ class Quant_Model(nn.Module):
        super(Quant_Model, self).__init__()
        if(input_sign):
            self.quant_inp = qnn.QuantIdentity(bit_width = 8, return_quant_tensor = True,
-                         act_quant = Uint8ActPerTensorFloat)
+                         act_quant = Uint8ActPerTensorFloat, scaling_min_val = 2e-16, 
+                                        restrict_scaling_type = RestrictValueType.LOG_FP)
    
        else:
            self.quant_inp = qnn.QuantIdentity(bit_width = 8, return_quant_tensor = True,
-                         act_quant = Int8ActPerTensorFloat)
+                         act_quant = Int8ActPerTensorFloat, scaling_min_val = 2e-16, 
+                                        restrict_scaling_type = RestrictValueType.LOG_FP)

-        self.sequential = convert_model(og_model, w, layer_mapping)
+        self.sequential = convert_model(og_model, w, layer_mapping, [0])
        self.o_quant =  qnn.QuantIdentity(bit_width = 8, return_quant_tensor = True)
    
    def forward(self, X):
@@ -315,6 +356,35 @@ class Quant_Model(nn.Module):
        X = self.o_quant(X)
        return F.log_softmax(X, dim = 1)

+def count_layers_in_sequential(module):
+    """Tally the Conv2d and Linear layers held directly by each nn.Sequential.
+
+    The module tree is walked depth-first, parents before children. Every
+    ``nn.Sequential`` encountered contributes one ``(conv_count, linear_count)``
+    tuple that covers only its direct children; nested containers produce
+    their own tuples when the walk reaches them.
+
+    Returns:
+        The list of tuples in visiting order, with the first recorded entry
+        dropped (presumably the outermost container -- confirm with callers).
+    """
+    tallies = []
+
+    def _walk(node):
+        if isinstance(node, nn.Sequential):
+            direct = list(node.children())
+            n_conv = sum(isinstance(layer, nn.Conv2d) for layer in direct)
+            # A layer already counted as Conv2d is never counted as Linear
+            n_linear = sum(
+                not isinstance(layer, nn.Conv2d) and isinstance(layer, nn.Linear)
+                for layer in direct
+            )
+            tallies.append((n_conv, n_linear))
+        # Descend whatever the node's own type is, so nested containers are found
+        for child in node.children():
+            _walk(child)
+
+    _walk(module)
+
+    return tallies[1:]
+
 def train_quant_model(quant_net, train_loader, val_loader = None, device = 'cpu',
                      epochs = 20, lr = 0.0001):
    
@@ -392,6 +462,7 @@ def dse(og_model, max_acc_drop, weights_per_layer, fp_accuracy, train_loader, te
        device = 'cpu', epochs = 5, lr = 0.0001):
    
    sign = calculate_minimum(train_loader) >= 0
+    seq_counts = count_layers_in_sequential(og_model)

    if max_acc_drop is not None:
        print('\nDSE STARTING ... BINARY SEARCH')
@@ -402,6 +473,16 @@ def dse(og_model, max_acc_drop, weights_per_layer, fp_accuracy, train_loader, te
            mid = (low + high) // 2
            w = weights_per_layer[mid]
            
+            f_w = []
+            for i in range(len(seq_counts)):
+                t_w = w[i]
+                c,l = seq_counts[i]
+                for j in range(c+l):
+                    f_w.append(t_w)
+
+            if(len(seq_counts) > 0):
+                w = f_w
+
            # Create and train the quantized network
            layer_mapping = create_layer_mapping(w)
            quant_net = Quant_Model(og_model, w, layer_mapping, sign)
@@ -436,6 +517,16 @@ def dse(og_model, max_acc_drop, weights_per_layer, fp_accuracy, train_loader, te
        print('\nDSE STARTING ... EXHAUSTIVE SEARCH')
        test_accuracy = []
        for i, w in enumerate(weights_per_layer):
+            f_w = []
+            for i in range(len(seq_counts)):
+                t_w = w[i]
+                c,l = seq_counts[i]
+                for j in range(c+l):
+                    f_w.append(t_w)
+
+            if(len(seq_counts) > 0):
+                w = f_w
+                
            layer_mapping = create_layer_mapping(w)
            quant_net = Quant_Model(og_model, w, layer_mapping, sign)
            quant_net = quant_net.to(device)
@@ -151,15 +151,154 @@ class Ibex_Lenet5(nn.Module):

        return X

+class Ibex_CMSIS_CNN(nn.Module):
+    """Integer-style reference model: three 5x5 conv + 2x2 max-pool stages and one linear layer.
+
+    After every conv/linear layer the activations are rescaled with a
+    multiply, a rounding add, a right shift and a clamp to [0, 255]
+    (see ``_requantize``).
+
+    Args:
+        mul_vals: per-layer multipliers; indices 0-3 are read.
+        shift_vals: per-layer shift amounts; indices 0-3 are read and each is
+            increased by 7 before use (the reason for the offset is not
+            visible here -- TODO confirm against the quantization export).
+    """
+    def __init__(self, mul_vals, shift_vals):
+        super(Ibex_CMSIS_CNN, self).__init__()
+
+        self.m0 = mul_vals[0]
+        self.m1 = mul_vals[1]
+        self.m2 = mul_vals[2]
+        self.m3 = mul_vals[3]
+
+        self.s0 = shift_vals[0] + 7
+        self.s1 = shift_vals[1] + 7
+        self.s2 = shift_vals[2] + 7
+        self.s3 = shift_vals[3] + 7
+
+        # Registration order below fixes the order of the state_dict entries
+        self.conv1 = nn.Conv2d(in_channels = 3, out_channels = 32, kernel_size = 5, padding = 'same')
+        self.max1 = nn.MaxPool2d(2,2)
+
+        self.conv2 = nn.Conv2d(in_channels = 32, out_channels = 32, kernel_size = 5, padding = 'same')
+        self.max2 = nn.MaxPool2d(2,2)
+
+        self.conv3 = nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = 5, padding = 'same')
+        self.max3 = nn.MaxPool2d(2,2)
+
+        self.linear1 = nn.Linear(1024, 10)
+
+    @staticmethod
+    def _requantize(X, mul, shift):
+        """Scale X by ``mul``, add 2**(shift-1), shift right by ``shift``, clamp to [0, 255]."""
+        X = torch.mul(X, mul)
+        # Adding half of the divisor before shifting rounds instead of flooring;
+        # the cast to LongTensor truncates to integers so the bit shift is legal.
+        X = torch.add(X, torch.bitwise_left_shift(torch.tensor(1), shift - 1)).type(torch.LongTensor)
+        X = torch.bitwise_right_shift(X, shift).type(torch.FloatTensor)
+        return torch.clamp(X, min = 0, max = 255)
+
+    def forward(self, X, print_out = False):
+        """Run the network on X; print the result first when ``print_out`` is true."""
+        X = self._requantize(self.conv1(X), self.m0, self.s0)
+        X = self.max1(X)
+
+        X = self._requantize(self.conv2(X), self.m1, self.s1)
+        X = self.max2(X)
+
+        X = self._requantize(self.conv3(X), self.m2, self.s2)
+        X = self.max3(X)
+
+        # Flatten everything except the batch dimension for the linear layer
+        X = X.reshape(X.shape[0], -1)
+        X = self._requantize(self.linear1(X), self.m3, self.s3)
+
+        if(print_out):
+            print(X)
+        return X
+
+class Ibex_DepthwiseBlock(nn.Module):
+    """Depthwise 3x3 convolution followed by a pointwise 1x1 convolution.
+
+    Each convolution output is rescaled with a multiply, a rounding add, a
+    right shift and a clamp to [0, 255] (see ``_requantize``).
+
+    Args:
+        in_channels: channels of the input; also the depthwise group count.
+        out_channels: channels produced by the pointwise convolution.
+        mul_vals: multipliers; index 0 is used after ``dw``, index 1 after ``pw``.
+        shift_vals: shift amounts, indexed like ``mul_vals``; each is increased
+            by 7 before use (reason for the offset is not visible here --
+            TODO confirm against the quantization export).
+    """
+    def __init__(self, in_channels, out_channels, mul_vals, shift_vals):
+        super(Ibex_DepthwiseBlock, self).__init__()
+
+        # groups == in_channels: every input channel gets its own 3x3 filter
+        self.dw = nn.Conv2d(in_channels = in_channels, out_channels = in_channels,
+                                    kernel_size = 3, padding = 1, groups = in_channels)
+
+        self.pw = nn.Conv2d(in_channels = in_channels, out_channels = out_channels,
+                                    kernel_size = 1, padding = 0)
+
+        self.m0 = mul_vals[0]
+        self.m1 = mul_vals[1]
+
+        self.s0 = shift_vals[0] + 7
+        self.s1 = shift_vals[1] + 7
+
+    @staticmethod
+    def _requantize(X, mul, shift):
+        """Scale X by ``mul``, add 2**(shift-1), shift right by ``shift``, clamp to [0, 255]."""
+        X = torch.mul(X, mul)
+        # Adding half of the divisor before shifting rounds instead of flooring;
+        # the cast to LongTensor truncates to integers so the bit shift is legal.
+        X = torch.add(X, torch.bitwise_left_shift(torch.tensor(1), shift - 1)).type(torch.LongTensor)
+        X = torch.bitwise_right_shift(X, shift).type(torch.FloatTensor)
+        return torch.clamp(X, min = 0, max = 255)
+
+    def forward(self, X):
+        """Apply the depthwise then the pointwise convolution, rescaling after each."""
+        X = self._requantize(self.dw(X), self.m0, self.s0)
+        X = self._requantize(self.pw(X), self.m1, self.s1)
+
+        return X
+    
+class Ibex_Cifar10_Dws_CNN(nn.Module):
+    """Stack of six Ibex_DepthwiseBlock stages with a single linear classifier.
+
+    Channel progression of the blocks is 3 -> 64 -> 64 -> 128 -> 128 -> 256 -> 256,
+    with a 2x2 max-pool after every second block. The classifier output is
+    rescaled with a multiply, a rounding add, a right shift and a clamp to
+    [0, 255].
+
+    Args:
+        mul_vals: multipliers; slices [0:12] are handed to the blocks two at
+            a time and index 12 is used for the classifier.
+        shift_vals: shift amounts, indexed like ``mul_vals``; the classifier
+            entry is increased by 7 before use.
+    """
+    def __init__(self, mul_vals, shift_vals):
+        super(Ibex_Cifar10_Dws_CNN, self).__init__()
+        # Registration order here fixes the order of the state_dict entries
+        self.features = nn.Sequential(
+            Ibex_DepthwiseBlock(3, 64, mul_vals[0:2], shift_vals[0:2]),
+            Ibex_DepthwiseBlock(64, 64, mul_vals[2:4], shift_vals[2:4]),
+            nn.MaxPool2d(kernel_size = 2, stride = 2),
+
+            Ibex_DepthwiseBlock(64, 128, mul_vals[4:6], shift_vals[4:6]),
+            Ibex_DepthwiseBlock(128, 128, mul_vals[6:8], shift_vals[6:8]),
+            nn.MaxPool2d(kernel_size = 2, stride = 2),
+            
+            Ibex_DepthwiseBlock(128, 256, mul_vals[8:10], shift_vals[8:10]),
+            Ibex_DepthwiseBlock(256, 256, mul_vals[10:12], shift_vals[10:12]),
+            nn.MaxPool2d(kernel_size = 2, stride = 2)
+        )
+        
+        self.flatten = nn.Flatten()
+        
+        # 256 * 4 * 4 input features: consistent with a 32x32 input halved by
+        # the three pooling layers -- TODO confirm the expected input size
+        self.classifier = nn.Sequential(
+            nn.Linear(256 * 4 * 4, 10)
+        )
+        
+        self.m_cl = mul_vals[12]
+        self.s_cl = shift_vals[12] + 7
+
+    def forward(self, x, print_out = False):
+        """Run the network on x; print the result first when ``print_out`` is true."""
+        x = self.features(x)
+        x = self.flatten(x)
+        x = self.classifier(x)
+        
+        # Rescale the classifier output: multiply, add 2**(s_cl - 1) so the
+        # shift rounds instead of flooring, shift right, clamp to [0, 255]
+        x = torch.mul(x, self.m_cl)
+        x = torch.add(x, torch.bitwise_left_shift(torch.tensor(1), self.s_cl - 1)).type(torch.LongTensor)
+        x = torch.bitwise_right_shift(x, self.s_cl).type(torch.FloatTensor)
+        x = torch.clamp(x, min = 0, max = 255)
+        
+        if(print_out):
+            print(x)
+            
+        return x
+
+def configure_network(ibex_model_dict, int_weights, int_biases):
+    """Overwrite every entry of a state dict with integer weights and biases.
+
+    Entries are addressed by position, not by name: entries at even positions
+    receive ``int_weights[pos // 2]`` and entries at odd positions receive
+    ``int_biases[pos // 2]``, each wrapped in ``torch.tensor``.
+
+    Returns:
+        The same ``ibex_model_dict`` object, modified in place.
+    """
+    for position, key in enumerate(ibex_model_dict):
+        layer = position // 2
+        # Even slots take a weight, odd slots take the bias of the same layer
+        source = int_weights if position % 2 == 0 else int_biases
+        ibex_model_dict[key] = torch.tensor(source[layer])
+
+    return ibex_model_dict
+
 def create_fann_model(int_weights, int_biases, mul_vals, shift_vals):
    ibex_model = Ibex_FANN(mul_vals, shift_vals)
    ibex_model_dict = ibex_model.state_dict()

-    ibex_model_dict['linear1.weight'] = torch.tensor(int_weights[0])
-    ibex_model_dict['linear2.weight'] = torch.tensor(int_weights[1])
-
-    ibex_model_dict['linear1.bias'] = torch.tensor(int_biases[0])
-    ibex_model_dict['linear2.bias'] = torch.tensor(int_biases[1])
+    ibex_model_dict = configure_network(ibex_model_dict, int_weights, int_biases)

    ibex_model.load_state_dict(ibex_model_dict)
    return ibex_model
@@ -168,15 +307,7 @@ def create_uci_model(int_weights, int_biases, mul_vals, shift_vals):
    ibex_model = Ibex_UCI_MLP(mul_vals, shift_vals)
    ibex_model_dict = ibex_model.state_dict()

-    ibex_model_dict['fc0.weight'] = torch.tensor(int_weights[0])
-    ibex_model_dict['fc1.weight'] = torch.tensor(int_weights[1])
-    ibex_model_dict['fc2.weight'] = torch.tensor(int_weights[2])
-    ibex_model_dict['fc3.weight'] = torch.tensor(int_weights[3])
-
-    ibex_model_dict['fc0.bias'] = torch.tensor(int_biases[0])
-    ibex_model_dict['fc1.bias'] = torch.tensor(int_biases[1])
-    ibex_model_dict['fc2.bias'] = torch.tensor(int_biases[2])
-    ibex_model_dict['fc3.bias'] = torch.tensor(int_biases[3])
+    ibex_model_dict = configure_network(ibex_model_dict, int_weights, int_biases)

    ibex_model.load_state_dict(ibex_model_dict)

@@ -186,24 +317,33 @@ def create_lenet_model(int_weights, int_biases, mul_vals, shift_vals):
    ibex_model = Ibex_Lenet5(mul_vals, shift_vals)
    ibex_model_dict = ibex_model.state_dict()

-    ibex_model_dict['conv1.weight'] = torch.tensor(int_weights[0])
-    ibex_model_dict['conv2.weight'] = torch.tensor(int_weights[1])
-    ibex_model_dict['fc1.weight'] = torch.tensor(int_weights[2])
-    ibex_model_dict['fc2.weight'] = torch.tensor(int_weights[3])
-    ibex_model_dict['fc3.weight'] = torch.tensor(int_weights[4])
+    ibex_model_dict = configure_network(ibex_model_dict, int_weights, int_biases)

-    ibex_model_dict['conv1.bias'] = torch.tensor(int_biases[0])
-    ibex_model_dict['conv2.bias'] = torch.tensor(int_biases[1])
-    ibex_model_dict['fc1.bias'] = torch.tensor(int_biases[2])
-    ibex_model_dict['fc2.bias'] = torch.tensor(int_biases[3])
-    ibex_model_dict['fc3.bias'] = torch.tensor(int_biases[4])
+    ibex_model.load_state_dict(ibex_model_dict)
+
+    return ibex_model
+
+def create_cmsis_cnn_model(int_weights, int_biases, mul_vals, shift_vals):
+    """Build an Ibex_CMSIS_CNN and load the given integer weights and biases into it.
+
+    ``mul_vals`` and ``shift_vals`` are passed to the model constructor; the
+    model's state dict is filled by ``configure_network`` and loaded back.
+    """
+    model = Ibex_CMSIS_CNN(mul_vals, shift_vals)
+
+    state = configure_network(model.state_dict(), int_weights, int_biases)
+    model.load_state_dict(state)
+
+    return model
+
+def create_ibex_dws_model(int_weights, int_biases, mul_vals, shift_vals):
+    """Build an Ibex_Cifar10_Dws_CNN and load the given integer weights and biases into it.
+
+    ``mul_vals`` and ``shift_vals`` are passed to the model constructor; the
+    model's state dict is filled by ``configure_network`` and loaded back.
+    """
+    ibex_model = Ibex_Cifar10_Dws_CNN(mul_vals, shift_vals)
+    ibex_model_dict = ibex_model.state_dict()
+
+    # configure_network assigns by position in the state dict, not by name
+    ibex_model_dict = configure_network(ibex_model_dict, int_weights, int_biases)
    
    ibex_model.load_state_dict(ibex_model_dict)
    
    return ibex_model

 def eval_sim_model(quant_model, ibex_model, test_loader):
-    # Turn off gradients for validation
    with torch.no_grad():
        ibex_model.eval()
        correct = 0