Repository: cea-wind/SimpleTPU Branch: master Commit: b81f32e563ca Files: 21 Total size: 45.1 KB Directory structure: gitextract_zygrtfz8/ ├── README.md ├── data/ │ └── golden_result.txt ├── lab1/ │ ├── README.md │ ├── refcode/ │ │ ├── conv3d.m │ │ ├── convmxu.m │ │ └── saveparam.m │ ├── run_hls.tcl │ └── src/ │ ├── mxu.cpp │ ├── tb_mxu.cpp │ └── tpu.h ├── lab2/ │ ├── README.md │ ├── run_hls.tcl │ └── src/ │ ├── relu_norm_pool.cpp │ ├── tb_pool.cpp │ └── tpu.h └── src/ ├── ctrl.cpp ├── mxu.cpp ├── norm_relu_pool.cpp ├── tb_tpu.cpp ├── tpu.cpp └── tpu.h ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ # SimpleTPU A Tensor Processing Unit is designed to accelerate matrix multiplication, especially for Multilayer Perceptrons and Convolutional Neural Networks. This implementation mainly follows the Google TPU Version 1, whose architecture is introduced in [https://arxiv.org/ftp/arxiv/papers/1704/1704.04760.pdf](https://arxiv.org/ftp/arxiv/papers/1704/1704.04760.pdf "In-Datacenter Performance Analysis of a Tensor Processing Unit"). It may cost a lot of time to implement a TPU using a Hardware Description Language (such as VHDL or Verilog HDL), even if I had tried to simplify it. So I try to use the Xilinx HLS ToolKit to complete it. The plan is divided into three phases. - Phase 1: Completing the main computing modules, including - Lab1: Systolic Array - Lab2: Relu, Normalization & Pooling - Phase 2: Finish the full design of simpleTPU. - Phase 3: Testing the simpleTPU through some real networks, such as MLP and CNN. # Key Features The key features of Simple TPU include - Int8 multiply & Int32 accumulators - VLIW based instruction parallelism - Vector Architecture based data parallelism Here are some operations which Simple TPU can support. Operation | Support -|- Conv3d | in_channels: Resource Constrained
out_channels: Resource Constrained
kernel_size: Support
stride: Support
padding: Support
dilation: Support
groups: Architecture Constrained
bias: Support ConvTranspose3d | The same as above Maxpool2d | kernel_size: Support
stride: Support
padding: Support Avgpool2d | The same as above Relu | Only Relu is supported as the nonlinear function BatchNorm2d | BatchNorm2d is merged with Conv or Pool during inference Linear | Resource Constrained UpscalingNearest2D | Support (calling Avgpool2d multiple times.) UpscalingBilinear2D | Support (calling Avgpool2d multiple times.) # Performance The size of the MAC array in SimpleTPU is 32*32, and the clock frequency is 500MHz (timing closure achieved on a Xilinx UltraScale+ FPGA, speed grade -2). $$32 \times 32 \times 500\,\text{MHz} \times 2 \approx 1\,\text{Tops (int8)}$$ # Installation **env** : - Vivado HLS 2018.2 **run** : - step1: `vivado_hls -f run_hls.tcl` - step2: launch Vivado HLS and open the project - step3: Run C synthesis, C/RTL cosimulation, etc. **Synthesis Result**: ![result](./pictures/syn.png) **Simulation Result**: ![result](./pictures/sim.png) # Examples ## 1. MLP The network structure of the MLP is defined as follows. ``` class MLP(nn.Module): def __init__(self): super(MLP, self).__init__() self.hidden = nn.Linear(784,64) self.fc = nn.Linear(64,10) def forward(self, x): x = x.view(-1,784) x = self.hidden(x) x = self.fc(x) return F.log_softmax(x, dim=1) ``` Work efficiency of SimpleTPU is about 84%. |LOC| Layers | Nonlinear function | Weights | Batch Size | % of Deployed| |---|---|---|----|----|----| |10 | 2 FC | Relu | 5M | 512 | 16%| Classification Result in MNIST. ![result](./pictures/cla_result.png) ## 2. CNN Because there is no compiler to generate instructions, this plan was suspended. If you want to know how to calculate convolution using SimpleTPU, lab1 provides a simple example. 
# Relative Link https://www.cnblogs.com/sea-wind/p/10993958.html ================================================ FILE: data/golden_result.txt ================================================ 7 2 1 0 4 1 4 9 6 9 0 6 9 0 1 5 9 7 6 4 9 6 6 5 4 0 7 4 0 1 3 1 3 6 7 2 7 1 2 1 1 7 4 2 6 5 1 2 4 4 6 3 5 5 6 0 4 1 9 5 7 8 4 2 7 4 6 4 3 0 7 0 2 9 1 7 3 7 9 7 9 6 2 7 8 4 7 5 6 1 3 6 4 3 1 4 1 1 6 9 6 0 5 4 9 9 2 1 4 4 8 1 3 9 7 4 4 4 9 2 5 4 7 6 4 9 0 5 8 5 6 6 5 2 8 1 0 1 6 4 6 7 3 1 9 1 8 2 0 9 9 9 5 5 1 5 6 0 3 4 4 6 5 4 6 5 4 5 1 4 4 7 2 3 2 1 1 8 1 8 1 8 5 0 8 9 2 5 0 1 1 1 0 4 0 5 1 6 4 2 3 6 1 1 1 3 9 5 2 9 4 5 9 3 9 0 3 6 5 5 7 2 2 7 1 2 8 4 1 7 3 3 8 9 7 9 2 2 4 1 5 8 8 4 2 6 0 6 4 2 4 1 9 5 7 7 2 8 2 0 8 1 7 7 9 1 8 1 8 0 3 0 1 9 9 4 1 8 2 1 2 9 7 5 9 2 6 4 1 5 4 2 9 2 0 4 0 0 2 8 6 2 1 2 4 0 2 9 4 3 3 0 0 5 1 9 6 4 0 5 1 7 9 3 0 4 2 0 7 1 1 2 1 5 3 3 4 7 8 6 6 4 1 3 5 1 0 5 1 9 1 5 0 6 1 8 5 1 9 4 4 6 7 1 5 0 6 5 6 3 7 2 0 8 8 5 4 1 1 4 0 7 3 7 6 1 6 2 1 4 2 8 6 1 9 5 2 5 4 4 2 8 3 9 2 4 6 0 3 1 7 7 3 7 9 7 1 9 2 1 4 2 9 2 0 4 9 1 4 8 1 8 4 4 9 8 8 3 7 6 0 0 3 0 8 0 6 4 8 5 3 3 2 3 9 1 2 6 8 0 5 6 6 6 9 8 8 2 2 5 8 9 6 1 8 4 1 2 8 3 1 9 7 5 4 0 8 9 9 1 0 5 2 3 7 8 9 4 0 6 3 9 1 2 1 8 1 5 6 5 2 1 ================================================ FILE: lab1/README.md ================================================ # Systolic Array Systolic Array implement in FPGA using Xilinx HLS. 
## 1.Env & Build **env** : - Vivado HLS 2018.2 or 2016.3 , MATLAB 2014a(for matlabcode) **run** : - step1: `vivado_hls -f run_hls.tcl` - step2: lanch vivado HLS and open the project - step3: Run C synthesis, C/RTL cosimulation e.t.c ## 2.Relative Link https://www.cnblogs.com/sea-wind/p/10995360.html
================================================
FILE: lab1/refcode/conv3d.m
================================================
% conv3d.m - generates the lab1 test vectors.
% A 3x3 convolution over a 14x14x32 feature map is decomposed into nine
% per-tap products computed by convmxu, which are shifted and summed into
% `output`. A convn-based reference is computed separately and written to
% golden.dat.
rng(0); % fixed seed so the random data is reproducible
feature = randi([-128,127],14,14,32);
weight = randi([-128,127],32,3,3,32);
bias = randi([-1024,1023],1,32);
output = zeros(14,14,32);
% Write feature.dat and weight.dat (see saveparam).
saveparam(feature,weight,bias)
% One partial result per kernel tap (index1,index2). Only the centre tap
% (2,2) is given the real bias; every other tap is called with a zero bias.
out1 = convmxu(weight,feature,bias,2,2);
out2 = convmxu(weight,feature,zeros(1,32),1,1);
out3 = convmxu(weight,feature,zeros(1,32),1,2);
out4 = convmxu(weight,feature,zeros(1,32),1,3);
out5 = convmxu(weight,feature,zeros(1,32),2,1);
out6 = convmxu(weight,feature,zeros(1,32),2,3);
out7 = convmxu(weight,feature,zeros(1,32),3,1);
out8 = convmxu(weight,feature,zeros(1,32),3,2);
out9 = convmxu(weight,feature,zeros(1,32),3,3);
% Start from the centre-tap result and add each remaining tap offset by one
% row and/or column according to its position in the 3x3 kernel. The index
% ranges drop the rows/columns that would fall outside the 14x14 map.
output = out1;
output(2:end,2:end,:) = output(2:end,2:end,:) + out2(1:end-1,1:end-1,:);
output(2:end,:,:) = output(2:end,:,:) + out3(1:end-1,:,:);
output(2:end,1:end-1,:) = output(2:end,1:end-1,:) + out4(1:end-1,2:end,:);
output(:,2:end,:) = output(:,2:end,:) + out5(:,1:end-1,:);
output(:,1:end-1,:) = output(:,1:end-1,:) + out6(:,2:end,:);
output(1:end-1,2:end,:) = output(1:end-1,2:end,:) + out7(2:end,1:end-1,:);
output(1:end-1,:,:) = output(1:end-1,:,:) + out8(2:end,:,:);
output(1:end-1,1:end-1,:) = output(1:end-1,1:end-1,:) + out9(2:end,2:end,:);
% NOTE(review): `output` is never compared with `golden` in this script;
% presumably the check is done by eye or by the C testbench - confirm.

% Reference result: for each first-dimension index k of `weight`, take the
% 3x3x32 kernel, reverse it along all three dimensions, run
% convn(...,'same'), keep slice 16 of the third dimension and add bias(k).
% NOTE(review): the reversal presumably cancels convn's own kernel flip so
% the result is a correlation - confirm.
golden = zeros(14,14,32);
for k = 1:32
    wk = reshape(weight(k,:,:,:),3,3,32);
    wk = wk(end:-1:1,end:-1:1,end:-1:1);
    tmp = convn(feature,wk,'same');
    golden(:,:,k) = tmp(:,:,16)+bias(k);
end
golden = int32(golden);
% Write golden.dat as int32: one (i,j) position per fwrite call, covering
% all 32 entries of the third dimension.
fid = fopen('golden.dat','wb');
for i=1:14
    for j=1:14
        fwrite(fid,golden(i,j,:),'int32');
    end
end
fclose(fid);
================================================
FILE: lab1/refcode/convmxu.m
================================================
function [out1] = convmxu(weight,feature,bias,index1,index2)
%CONVMXU Per-position product for a single kernel tap.
%   out1 = convmxu(weight,feature,bias,index1,index2) returns a 14x14x32
%   array where
%     out1(i,j,k) = bias(k) + sum over c=1..32 of
%                   weight(k,index1,index2,c)*feature(i,j,c).
%   index1 and index2 select the kernel tap; no neighbouring (i,j)
%   positions of feature are read.
out1 = zeros(14,14,32);
for i = 1:14
    for j = 1:14
        for k = 1:32
            for c = 1:32
                if(c==1)
                    % the first term seeds the accumulator with the bias
                    out1(i,j,k) = bias(k) + weight(k,index1,index2,c)*feature(i,j,c);
                else
                    out1(i,j,k) = out1(i,j,k) + weight(k,index1,index2,c)*feature(i,j,c);
                end
            end
        end
    end
end
end
================================================
FILE: lab1/refcode/saveparam.m
================================================
function [] = saveparam(feature,weight,bias)
%SAVEPARAM Write the test inputs to feature.dat and weight.dat.
%   feature and weight are cast to int8, bias to int32.
%   feature.dat: for every (i,j) in 14x14, the vector feature(i,j,:) as int8.
%   weight.dat:  first the centre tap (2,2): for k=1..32 the vector
%                weight(:,2,2,k) as int8, then the four bias byte planes
%                (bits 31..24 first, bits 7..0 last) as uint8. After that,
%                every other tap (i,j) in i-major order: for k=1..32 the
%                vector weight(:,i,j,k) as int8, followed by 32 int32 zeros.
feature = int8(feature);
weight = int8(weight);
bias = int32(bias);
% Split each 32-bit bias into four bytes; bias4 is the most significant.
bias4 = bitand(bitshift(bias,-24),int32(255));
bias3 = bitand(bitshift(bias,-16),int32(255));
bias2 = bitand(bitshift(bias,-8),int32(255));
bias1 = bitand(bias,int32(255));
fid = fopen('feature.dat','wb');
for i=1:14
    for j=1:14
        fwrite(fid,feature(i,j,:),'int8');
    end
end
fclose(fid);
fid = fopen('weight.dat','wb');
% The centre tap goes first and is the only tap stored with the real bias.
for k=1:32
    fwrite(fid,weight(:,2,2,k),'int8');
end
fwrite(fid,uint8(bias4),'uint8');
fwrite(fid,uint8(bias3),'uint8');
fwrite(fid,uint8(bias2),'uint8');
fwrite(fid,uint8(bias1),'uint8');
% The remaining eight taps: weights followed by 32 int32 zeros, which
% occupy the same number of bytes as the four bias planes above.
for i=1:3
    for j=1:3
        for k=1:32
            if(~(i==2&&j==2))
                fwrite(fid,weight(:,i,j,k),'int8');
            end
        end
        if(~(i==2&&j==2))
            for k=1:32
                fwrite(fid,0,'int32');
            end
        end
    end
end
fclose(fid);
end
================================================
FILE: lab1/run_hls.tcl
================================================
# Creates (or resets) the HLS project mxu_conv_prj with MXU as the top
# function, registers the design sources and the testbench files, and runs
# C simulation only.
open_project -reset mxu_conv_prj
set_top MXU
add_files src/tpu.h
add_files src/mxu.cpp
# Testbench inputs: the .dat files are read from data/.
# NOTE(review): presumably these are the files produced by the refcode
# MATLAB scripts, copied into data/ - confirm.
add_files -tb data/feature.dat
add_files -tb data/golden.dat
add_files -tb data/weight.dat
add_files -tb src/tb_mxu.cpp
open_solution -reset "solution1"
set_part {xczu7cg-fbvb900-2-i} -tool vivado
# Clock period 2.5.
# NOTE(review): presumably nanoseconds, the tool default - confirm.
create_clock -period 2.5 -name default
csim_design
# Do not perform any other steps
# - The basic project will be opened in the GUI
exit
================================================ FILE: lab1/src/mxu.cpp ================================================ #include "tpu.h" void SetWeight(WEIGHTDTYPE weight[512][MXU_COLNUM],WEIGHTDTYPE weightreg[MXU_ROWNUM+4][MXU_COLNUM], short weight_raddr, bool enable){ if(!enable) return; for(short i=weight_raddr;i=0;k--){ if(k>0) featreg[j][k] = featreg[j][k-1]; else if(i=0;j--){ for(int k=0;k biasreg; biasreg(31,24)=weightreg[MXU_ROWNUM+0][k]; biasreg(23,16)=weightreg[MXU_ROWNUM+1][k]; biasreg(15, 8)=weightreg[MXU_ROWNUM+2][k]; biasreg( 7, 0)=weightreg[MXU_ROWNUM+3][k]; if(j==0) psumreg[j][k] = featreg[j][k+j]*weightreg[j][k] + biasreg; else psumreg[j][k] = featreg[j][k+j]*weightreg[j][k] + psumreg[j-1][k]; } } #pragma HLS DEPENDENCE variable=psum inter false #pragma HLS DEPENDENCE variable=psum intra false for(int j=0;j=j+MXU_ROWNUM-1&&ipsumpool[j]) psumpool[j] = psumrelu[j]; } else{ psumpool[j] = psumpool[j] + psumrelu[j]; } } if(pool_kw_cnt==param.pool_kw&&pool_kh_cnt==param.pool_kh){ short ubuf_waddr = param.ubuf_waddr_start + ubuf_waddr_p1 + ubuf_waddr_p2 + ubuf_waddr_p3; if(ubuf_waddr_p1==param.ubuf_waddr_end1){ if(ubuf_waddr_p2==param.ubuf_waddr_end2){ ubuf_waddr_p2 = 0; ubuf_waddr_p3 = ubuf_waddr_p3 + param.ubuf_waddr_step3; } else{ ubuf_waddr_p2 = ubuf_waddr_p2 + param.ubuf_waddr_step2; } } else{ ubuf_waddr_p1 = ubuf_waddr_p1 + param.ubuf_waddr_step1; } for(int j=0;j>32; ap_int<8> res; if(tmpcut>127) res = 127; else if(tmpcut<-128) res = -128; else res = tmpcut; unified_buffer[ubuf_waddr][j] = res; } } if(pool_kw_cnt==param.pool_kw){ pool_kw_cnt = 0; if(pool_kh_cnt==param.pool_kh){ pool_kh_cnt = 0; if(pool_w_cnt==param.pool_w){ pool_w_cnt = 0; pool_h_cnt = pool_h_cnt + param.pool_sh; } else{ pool_w_cnt = pool_w_cnt + param.pool_sw; } } else{ pool_kh_cnt = pool_kh_cnt + 1; } } else{ pool_kw_cnt = pool_kw_cnt + 1; } } } ================================================ FILE: lab2/src/tb_pool.cpp ================================================ #include 
"tpu.h" #include "stdio.h" #include "stdlib.h" int main(){ PSUMDTYPE psum_buffer[512][MXU_COLNUM]; FEATDTYPE unified_buffer[16384][MXU_ROWNUM]; int norm_coef[MXU_COLNUM]; RELPOOL_PARAM param; for(int i=0;i<14;i++){ for(int j=0;j<14;j++){ for(int c=0;c<32;c++){ psum_buffer[i*14+j][c] = (i*14+j+c)*512; } } } for(int c=0;c<32;c++) norm_coef[c] = 1<<23; // no pooling param.isrelu = true; param.psum_raddr_start = 0; param.maxpool = true; param.pool_kw = 0; param.pool_kh = 0; param.pool_w = 14-1; param.pool_sw = 1; param.pool_sh = 1; param.pool_cnt = 14*14; param.pool_h_step = 14; param.ubuf_waddr_start = 0; param.ubuf_waddr_step1 = 1; param.ubuf_waddr_end1 = 14*14-1; relu_norm_pool(psum_buffer,unified_buffer,norm_coef,param); FEATDTYPE golden[14*14][MXU_ROWNUM]; for(int i=0;i<14;i++){ for(int j=0;j<14;j++){ for (int k=0;k<32;k++){ int tmp = psum_buffer[i*14+j][k]/512; tmp = tmp>127?127:tmp; tmp = tmp<-128?-128:tmp; golden[i*14+j][k] = tmp; } } } int err=0; for(int i=0;i<14*14;i++){ for(int k=0;k<32;k++){ if(golden[i][k]!=unified_buffer[i][k]) err ++; } } // max pooling 2,2 for(int c=0;c<32;c++) norm_coef[c] = 1<<23; param.isrelu = true; param.psum_raddr_start = 0; param.maxpool = true; param.pool_kw = 1; param.pool_kh = 1; param.pool_w = 12; param.pool_sw = 2; param.pool_sh = 2; param.pool_cnt = 14*14; param.pool_h_step = 14; param.ubuf_waddr_start = 0; param.ubuf_waddr_step1 = 1; param.ubuf_waddr_end1 = 7*7-1; relu_norm_pool(psum_buffer,unified_buffer,norm_coef,param); for(int i=0;i<7;i++){ for(int j=0;j<7;j++){ for (int k=0;k<32;k++){ int tmp = -128; for(int i1=0;i1<2;i1++){ for(int j1=0;j1<2;j1++){ if(tmp127?127:tmp; tmp = tmp<-128?-128:tmp; golden[i*7+j][k] = tmp; } } } for(int i=0;i<7*7;i++){ for(int k=0;k<32;k++){ if(golden[i][k]!=unified_buffer[i][k]) err ++; } } for(int c=0;c<32;c++) norm_coef[c] = 171196; // avg pooling 7,7 param.isrelu = true; param.psum_raddr_start = 0; param.maxpool = false; param.pool_kw = 6; param.pool_kh = 6; param.pool_w = 7; 
param.pool_sw = 7; param.pool_sh = 7; param.pool_cnt = 14*14; param.pool_h_step = 14; param.ubuf_waddr_start = 0; param.ubuf_waddr_step1 = 1; param.ubuf_waddr_end1 = 14*14-1; relu_norm_pool(psum_buffer,unified_buffer,norm_coef,param); for(int i=0;i<2;i++){ for(int j=0;j<2;j++){ for (int k=0;k<32;k++){ int tmp = 0; for(int i1=0;i1<7;i1++){ for(int j1=0;j1<7;j1++){ tmp += psum_buffer[(i*7+i1)*14+7*j+j1][k]; } } tmp = (long(tmp)*long(171196))>>32; tmp = tmp>127?127:tmp; tmp = tmp<-128?-128:tmp; golden[i*2+j][k] = tmp; } } } for(int i=0;i<2*2;i++){ for(int k=0;k<32;k++){ if(golden[i][k]!=unified_buffer[i][k]) err ++; } } return err; } ================================================ FILE: lab2/src/tpu.h ================================================ #include "ap_int.h" #define MXU_COLNUM 32 #define MXU_ROWNUM 32 #define WEIGHTDTYPE char #define FEATDTYPE char #define PSUMDTYPE int struct MXU_PARAM{ bool isload; bool iscalc; bool isping; bool isfirstpsum; short weight_raddr; short ubuf_raddr_start; short ubuf_raddr_step1; short ubuf_raddr_step2; short ubuf_raddr_step3; short ubuf_raddr_end1; short ubuf_raddr_end2; short ubuf_raddr_end3; short ubuf_raddr_num; short psum_start; short psum_step1; short psum_end1; short psum_step2; }; struct RELPOOL_PARAM{ bool isrelu; short psum_raddr_start; bool maxpool; // max pool or average pool char pool_kw; char pool_kh; char pool_w; char pool_sw; char pool_sh; short pool_cnt; // output_num*pool_kw*pool_kh short pool_h_step; short ubuf_waddr_start; short ubuf_waddr_step1; short ubuf_waddr_step2; short ubuf_waddr_step3; short ubuf_waddr_end1; short ubuf_waddr_end2; short ubuf_waddr_end3; }; void MXU(FEATDTYPE ubuf[16384][MXU_ROWNUM],WEIGHTDTYPE weight[512][MXU_COLNUM], PSUMDTYPE psum[512][MXU_COLNUM],MXU_PARAM mxuparam); void relu_norm_pool(PSUMDTYPE psum_buffer[512][MXU_COLNUM],FEATDTYPE unified_buffer[16384][MXU_ROWNUM], int norm_coef[MXU_COLNUM],RELPOOL_PARAM param); ================================================ FILE: 
src/ctrl.cpp ================================================ #include "tpu.h" void loadWeight(ap_uint<256> *ddr,WEIGHTDTYPE weight_buffer[512][MXU_COLNUM], unsigned offset,short addr, short len, bool enable){ if(!enable) return; for(int i=0;i tmp = ddr[offset+i]; for(int j=0;j<32;j++){ weight_buffer[addr+i][j] = tmp(j*8+7,j*8); } } } void loadFeature(ap_uint<256> *ddr,FEATDTYPE unified_buffer[512][MXU_ROWNUM], unsigned offset,short addr, short len, bool enable){ if(!enable) return; for(int i=0;i tmp = ddr[offset+i]; for(int j=0;j<32;j++){ unified_buffer[addr+i][j] = tmp(j*8+7,j*8); } } } void storeFeature(ap_uint<256> *ddr,FEATDTYPE unified_buffer[512][MXU_COLNUM], unsigned offset,short addr, short len, bool enable){ if(!enable) return; for(int i=0;i tmp; for(int j=0;j<32;j++){ tmp(j*8+7,j*8) = unified_buffer[addr+i][j];; } ddr[offset+i] = tmp; } } //set instr. set register //run instr. run process //eop instr. end of process // void instr(ap_uint<64> *ddr,unsigned &offset,ap_int<16> reggroup[96],ap_int<8> &runmode,bool enable){ #pragma HLS INTERFACE m_axi depth=8192 port=ddr #pragma HLS ARRAY_PARTITION variable=reggroup complete dim=1 if(!enable) return; bool isRunInstr = false; while(!isRunInstr){ ap_uint<64> tmp = ddr[offset]; offset++; if(tmp[63]==0){ switch(tmp(52,48)){ case( 0):reggroup[ 0] = tmp(15, 0);reggroup[ 1] = tmp(31,16);reggroup[ 2] = tmp(47,32);break; case( 1):reggroup[ 3] = tmp(15, 0);reggroup[ 4] = tmp(31,16);reggroup[ 5] = tmp(47,32);break; case( 2):reggroup[ 6] = tmp(15, 0);reggroup[ 7] = tmp(31,16);reggroup[ 8] = tmp(47,32);break; case( 3):reggroup[ 9] = tmp(15, 0);reggroup[10] = tmp(31,16);reggroup[11] = tmp(47,32);break; case( 4):reggroup[12] = tmp(15, 0);reggroup[13] = tmp(31,16);reggroup[14] = tmp(47,32);break; case( 5):reggroup[15] = tmp(15, 0);reggroup[16] = tmp(31,16);reggroup[17] = tmp(47,32);break; case( 6):reggroup[18] = tmp(15, 0);reggroup[19] = tmp(31,16);reggroup[20] = tmp(47,32);break; case( 7):reggroup[21] = tmp(15, 
0);reggroup[22] = tmp(31,16);reggroup[23] = tmp(47,32);break; case( 8):reggroup[24] = tmp(15, 0);reggroup[25] = tmp(31,16);reggroup[26] = tmp(47,32);break; case( 9):reggroup[27] = tmp(15, 0);reggroup[28] = tmp(31,16);reggroup[29] = tmp(47,32);break; case(10):reggroup[30] = tmp(15, 0);reggroup[31] = tmp(31,16);reggroup[32] = tmp(47,32);break; case(11):reggroup[33] = tmp(15, 0);reggroup[34] = tmp(31,16);reggroup[35] = tmp(47,32);break; case(12):reggroup[36] = tmp(15, 0);reggroup[37] = tmp(31,16);reggroup[38] = tmp(47,32);break; case(13):reggroup[39] = tmp(15, 0);reggroup[40] = tmp(31,16);reggroup[41] = tmp(47,32);break; case(14):reggroup[42] = tmp(15, 0);reggroup[43] = tmp(31,16);reggroup[44] = tmp(47,32);break; case(15):reggroup[45] = tmp(15, 0);reggroup[46] = tmp(31,16);reggroup[47] = tmp(47,32);break; case(16):reggroup[48] = tmp(15, 0);reggroup[49] = tmp(31,16);reggroup[50] = tmp(47,32);break; case(17):reggroup[51] = tmp(15, 0);reggroup[52] = tmp(31,16);reggroup[53] = tmp(47,32);break; case(18):reggroup[54] = tmp(15, 0);reggroup[55] = tmp(31,16);reggroup[56] = tmp(47,32);break; case(19):reggroup[57] = tmp(15, 0);reggroup[58] = tmp(31,16);reggroup[59] = tmp(47,32);break; case(20):reggroup[60] = tmp(15, 0);reggroup[61] = tmp(31,16);reggroup[62] = tmp(47,32);break; case(21):reggroup[63] = tmp(15, 0);reggroup[64] = tmp(31,16);reggroup[65] = tmp(47,32);break; case(22):reggroup[66] = tmp(15, 0);reggroup[67] = tmp(31,16);reggroup[68] = tmp(47,32);break; case(23):reggroup[69] = tmp(15, 0);reggroup[70] = tmp(31,16);reggroup[71] = tmp(47,32);break; case(24):reggroup[72] = tmp(15, 0);reggroup[73] = tmp(31,16);reggroup[74] = tmp(47,32);break; case(25):reggroup[75] = tmp(15, 0);reggroup[76] = tmp(31,16);reggroup[77] = tmp(47,32);break; case(26):reggroup[78] = tmp(15, 0);reggroup[79] = tmp(31,16);reggroup[80] = tmp(47,32);break; case(27):reggroup[81] = tmp(15, 0);reggroup[82] = tmp(31,16);reggroup[83] = tmp(47,32);break; case(28):reggroup[84] = tmp(15, 0);reggroup[85] = 
tmp(31,16);reggroup[86] = tmp(47,32);break; case(29):reggroup[87] = tmp(15, 0);reggroup[88] = tmp(31,16);reggroup[89] = tmp(47,32);break; case(30):reggroup[90] = tmp(15, 0);reggroup[91] = tmp(31,16);reggroup[92] = tmp(47,32);break; case(31):reggroup[93] = tmp(15, 0);reggroup[94] = tmp(31,16);reggroup[95] = tmp(47,32);break; } } else{ runmode = tmp(55,48); isRunInstr = true; } } } void config(ap_int<16> reggroup[96],MXU_PARAM &mxuparam,RELPOOL_PARAM &poolparam,LDST_PARAM &lsdtparam, ap_int<32> norm_coef[32]){ #pragma HLS INLINE mxuparam.isload = reggroup[ 0].range(0,0); mxuparam.iscalc = reggroup[ 0].range(1,1); mxuparam.isping = reggroup[ 0].range(2,2); mxuparam.isfirstpsum = reggroup[ 0].range(3,3); mxuparam.weight_raddr = reggroup[ 1]; mxuparam.ubuf_raddr_start= reggroup[ 2]; mxuparam.ubuf_raddr_step1= reggroup[ 3]; mxuparam.ubuf_raddr_step2= reggroup[ 4]; mxuparam.ubuf_raddr_step3= reggroup[ 5]; mxuparam.ubuf_raddr_end1 = reggroup[ 6]; mxuparam.ubuf_raddr_end2 = reggroup[ 7]; mxuparam.ubuf_raddr_end3 = reggroup[ 8]; mxuparam.ubuf_raddr_num = reggroup[ 9]; mxuparam.psum_start = reggroup[10]; mxuparam.psum_step1 = reggroup[11]; mxuparam.psum_end1 = reggroup[12]; mxuparam.psum_step2 = reggroup[13]; poolparam.isrelu = reggroup[14].range( 0,0); poolparam.maxpool = reggroup[14].range( 1,1); poolparam.avg_shift = reggroup[14].range( 7,4); poolparam.pool_kw = reggroup[14].range(15,8); poolparam.pool_kh = reggroup[15].range( 7,0); poolparam.pool_w = reggroup[15].range(15,8); poolparam.pool_sw = reggroup[16].range( 7,0); poolparam.pool_sh = reggroup[16].range(15,8); poolparam.psum_raddr_start = reggroup[17]; poolparam.pool_cnt = reggroup[18]; poolparam.pool_h_step = reggroup[19]; poolparam.avg_val = reggroup[20]; poolparam.ubuf_waddr_start = reggroup[21]; poolparam.ubuf_waddr_step1 = reggroup[22]; poolparam.ubuf_waddr_step2 = reggroup[23]; poolparam.ubuf_waddr_step3 = reggroup[24]; poolparam.ubuf_waddr_end1 = reggroup[25]; poolparam.ubuf_waddr_end2 = reggroup[26]; 
poolparam.ubuf_waddr_end3 = reggroup[27]; lsdtparam.weight_addr = reggroup[28]; lsdtparam.weight_ldlen = reggroup[29]; ap_uint<32> tmp = (reggroup[31],reggroup[30]); lsdtparam.weight_offset = tmp; for(int i=0;i<32;i++){ #pragma HLS UNROLL norm_coef[i] = (reggroup[33+2*i],reggroup[32+2*i]); } return; } ================================================ FILE: src/mxu.cpp ================================================ #include "tpu.h" void SetWeight(WEIGHTDTYPE weight[512][MXU_COLNUM],WEIGHTDTYPE weightreg[MXU_ROWNUM+4][MXU_COLNUM], short weight_raddr, bool enable){ if(!enable) return; for(short i=weight_raddr;i=0;k--){ if(k>0) featreg[j][k] = featreg[j][k-1]; else if(i=0;j--){ for(int k=0;k biasreg; biasreg(31,24)=weightreg[MXU_ROWNUM+0][k]; biasreg(23,16)=weightreg[MXU_ROWNUM+1][k]; biasreg(15, 8)=weightreg[MXU_ROWNUM+2][k]; biasreg( 7, 0)=weightreg[MXU_ROWNUM+3][k]; if(j==0) psumreg[j][k] = featreg[j][k+j]*weightreg[j][k] + biasreg; else psumreg[j][k] = featreg[j][k+j]*weightreg[j][k] + psumreg[j-1][k]; } } #pragma HLS DEPENDENCE variable=psum inter false #pragma HLS DEPENDENCE variable=psum intra false for(int j=0;j=j+MXU_ROWNUM-1&&i norm_coef[MXU_COLNUM],RELPOOL_PARAM param, bool enable){ //#pragma HLS INTERFACE bram port=unified_buffer //#pragma HLS INTERFACE bram port=psum_buffer //#pragma HLS ARRAY_PARTITION variable=norm_coef complete dim=1 //#pragma HLS ARRAY_PARTITION variable=unified_buffer complete dim=2 //#pragma HLS ARRAY_PARTITION variable=psum_buffer complete dim=2 PSUMDTYPE psumreg[MXU_COLNUM]; PSUMDTYPE psumrelu[MXU_COLNUM]; PSUMDTYPE psumpool[MXU_COLNUM]; FEATDTYPE relu[MXU_COLNUM]; short pool[MXU_COLNUM]; #pragma HLS ARRAY_PARTITION variable=psumreg complete dim=1 #pragma HLS ARRAY_PARTITION variable=psumsht complete dim=1 #pragma HLS ARRAY_PARTITION variable=relu complete dim=1 #pragma HLS ARRAY_PARTITION variable=pool complete dim=1 char pool_kw_cnt = 0; char pool_kh_cnt = 0; char pool_w_cnt = 0; char pool_h_cnt = 0; short ubuf_waddr_p1=0; short 
ubuf_waddr_p2=0; short ubuf_waddr_p3=0; if(!enable) return; for(short i=0;ipsumpool[j]) psumpool[j] = psumrelu[j]; } else{ psumpool[j] = psumpool[j] + psumrelu[j]; } } if(pool_kw_cnt==param.pool_kw&&pool_kh_cnt==param.pool_kh){ short ubuf_waddr = param.ubuf_waddr_start + ubuf_waddr_p1 + ubuf_waddr_p2 + ubuf_waddr_p3; if(ubuf_waddr_p1==param.ubuf_waddr_end1){ if(ubuf_waddr_p2==param.ubuf_waddr_end2){ ubuf_waddr_p2 = 0; ubuf_waddr_p3 = ubuf_waddr_p3 + param.ubuf_waddr_step3; } else{ ubuf_waddr_p2 = ubuf_waddr_p2 + param.ubuf_waddr_step2; } } else{ ubuf_waddr_p1 = ubuf_waddr_p1 + param.ubuf_waddr_step1; } for(int j=0;j>32; ap_int<8> res; if(tmpcut>127) res = 127; else if(tmpcut<-128) res = -128; else res = tmpcut; unified_buffer[ubuf_waddr][j] = res; } } if(pool_kw_cnt==param.pool_kw){ pool_kw_cnt = 0; if(pool_kh_cnt==param.pool_kh){ pool_kh_cnt = 0; if(pool_w_cnt==param.pool_w){ pool_w_cnt = 0; pool_h_cnt = pool_h_cnt + param.pool_sh; } else{ pool_w_cnt = pool_w_cnt + param.pool_sw; } } else{ pool_kh_cnt = pool_kh_cnt + 1; } } else{ pool_kw_cnt = pool_kw_cnt + 1; } } } ================================================ FILE: src/tb_tpu.cpp ================================================ #include "tpu.h" #include "stdio.h" int main(){ ap_uint<256> *ddr; ap_uint<64> *ddr_instr; ddr = (ap_uint<256> *)malloc(sizeof(ap_uint<256>)*(16384)); //512*25+72*25+72+512 ddr_instr = (ap_uint<64> *)malloc(sizeof(ap_uint<64>)*3300); FILE *fid; fid = fopen("mlp_img.bin","rb"); fread(ddr,32,25*512,fid); fclose(fid); fid = fopen("mlp_param.bin","rb"); fread(ddr+512*25,32,25*72+72,fid); fclose(fid); fid = fopen("mlp_instr.bin","rb"); ap_uint<64> *ddr_instr_r = ddr_instr; int cnt = 0; while(1==1){ fread(ddr_instr_r,8,1,fid); ap_uint<64> tmp = *ddr_instr_r; if(tmp.range(55,55)==1) break; ddr_instr_r++; cnt++; } fclose(fid); tpu(ddr,ddr_instr); fid = fopen("golden_result.txt","r"); int err = 0; for(int i=0;i<512;i++){ ap_uint<256> val = ddr[512*25+72*25+72+i]; int maxcof = -255; int idx = 
-1; int ref = -1; for(int j=0;j<16;j++){ int cof = val(j*8+7,j*8); if(cof>127) cof = cof-256; if(cof>maxcof){ maxcof = cof; idx = j; } } fscanf(fid,"%d",&ref); if(idx!=ref) err++; } return err; } ================================================ FILE: src/tpu.cpp ================================================ #include "tpu.h" void ex_module(FEATDTYPE unified_buffer[16384][MXU_ROWNUM],WEIGHTDTYPE weight_buffer[512][MXU_COLNUM], ap_int<32> norm_coef[MXU_COLNUM],MXU_PARAM mxuparam,RELPOOL_PARAM poolparam, bool is_MXU,bool is_relu_norm_pool){ #pragma HLS INLINE off #pragma HLS DEPENDENCE variable=unified_buffer inter false #pragma HLS DEPENDENCE variable=unified_buffer intra false static PSUMDTYPE psum_buffer1[512][MXU_COLNUM]; static PSUMDTYPE psum_buffer2[512][MXU_COLNUM]; #pragma HLS ARRAY_PARTITION variable=psum_buffer1 complete dim=2 #pragma HLS ARRAY_PARTITION variable=psum_buffer2 complete dim=2 if((is_MXU&&mxuparam.psum_start<512) || (is_relu_norm_pool&&poolparam.psum_raddr_start>=512) ) { MXU(unified_buffer,weight_buffer,psum_buffer1,mxuparam,is_MXU); relu_norm_pool(psum_buffer2,unified_buffer,norm_coef,poolparam,is_relu_norm_pool); } else{ MXU(unified_buffer,weight_buffer,psum_buffer2,mxuparam,is_MXU); relu_norm_pool(psum_buffer1,unified_buffer,norm_coef,poolparam,is_relu_norm_pool); } } void tpu(ap_uint<256> *ddr,ap_uint<64> *ddr_instr){ #pragma HLS INTERFACE m_axi depth=16384 port=ddr #pragma HLS INTERFACE m_axi depth=3300 port=ddr_instr static FEATDTYPE unified_buffer[16384][MXU_ROWNUM]; #pragma HLS RESOURCE variable=unified_buffer core=RAM_S2P_BRAM static WEIGHTDTYPE weight_buffer[512][MXU_COLNUM]; #pragma HLS RESOURCE variable=weight_buffer core=RAM_S2P_BRAM static ap_int<32> norm_coef[MXU_COLNUM]; #pragma HLS ARRAY_PARTITION variable=unified_buffer complete dim=2 #pragma HLS ARRAY_PARTITION variable=weight_buffer complete dim=2 #pragma HLS ARRAY_PARTITION variable=norm_coef complete dim=0 ap_int<16> reggroup[96]; #pragma HLS ARRAY_PARTITION 
variable=reggroup complete dim=0 MXU_PARAM mxuparam; RELPOOL_PARAM poolparam; LDST_PARAM ldstparam; unsigned instr_offset = 0; bool is_load_weight; bool is_MXU; bool is_relu_norm_pool; // load img loadFeature(ddr,unified_buffer, 0,0, 512*25, true); bool eop = false; ap_int<8> runmode = 0; //0 nop, bit[0] loadweight;bit[1] mxu; bit[2] pool; bit[7] eop; instr(ddr_instr,instr_offset,reggroup,runmode,true); while(runmode[7]==0) { #pragma HLS DEPENDENCE variable=unified_buffer inter false #pragma HLS DEPENDENCE variable=unified_buffer intra false #pragma HLS DEPENDENCE variable=weight_buffer inter false #pragma HLS DEPENDENCE variable=weight_buffer intra false config(reggroup,mxuparam,poolparam,ldstparam,norm_coef); is_load_weight = runmode[0]==1; is_MXU = runmode[1]==1; is_relu_norm_pool = runmode[2]==1; instr(ddr_instr,instr_offset,reggroup,runmode,true); loadWeight(ddr,weight_buffer,ldstparam.weight_offset,ldstparam.weight_addr, ldstparam.weight_ldlen,is_load_weight); ex_module(unified_buffer,weight_buffer,norm_coef,mxuparam,poolparam,is_MXU,is_relu_norm_pool); } storeFeature(ddr,unified_buffer, 512*25+72*25+72,14000, 512, true); } ================================================ FILE: src/tpu.h ================================================ #include "ap_int.h" #define MXU_COLNUM 32 #define MXU_ROWNUM 32 #define WEIGHTDTYPE char #define FEATDTYPE char #define PSUMDTYPE ap_int<32> struct MXU_PARAM{ bool isload; bool iscalc; bool isping; bool isfirstpsum; short weight_raddr; short ubuf_raddr_start; short ubuf_raddr_step1; short ubuf_raddr_step2; short ubuf_raddr_step3; short ubuf_raddr_end1; short ubuf_raddr_end2; short ubuf_raddr_end3; short ubuf_raddr_num; short psum_start; short psum_step1; short psum_end1; short psum_step2; }; struct RELPOOL_PARAM{ bool isrelu; short psum_raddr_start; bool maxpool; // max pool or average pool char pool_kw; char pool_kh; char pool_w; char pool_sw; char pool_sh; short pool_cnt; // output_num*pool_kw*pool_kh short pool_h_step; short 
avg_val; ap_uint<4> avg_shift; short ubuf_waddr_start; short ubuf_waddr_step1; short ubuf_waddr_step2; short ubuf_waddr_step3; short ubuf_waddr_end1; short ubuf_waddr_end2; short ubuf_waddr_end3; }; struct LDST_PARAM{ unsigned weight_offset; short weight_addr; short weight_ldlen; }; void MXU(FEATDTYPE ubuf[16384][MXU_ROWNUM],WEIGHTDTYPE weight[512][MXU_COLNUM], PSUMDTYPE psum[512][MXU_COLNUM],MXU_PARAM mxuparam, bool enable); void relu_norm_pool(PSUMDTYPE psum_buffer[512][MXU_COLNUM],FEATDTYPE unified_buffer[16384][MXU_ROWNUM], ap_int<32> norm_coef[MXU_COLNUM],RELPOOL_PARAM param, bool enable); void loadWeight(ap_uint<256> *ddr,WEIGHTDTYPE weight_buffer[512][MXU_COLNUM], unsigned offset,short addr, short len, bool enable); void loadFeature(ap_uint<256> *ddr,FEATDTYPE unified_buffer[512][MXU_ROWNUM], unsigned offset,short addr, short len, bool enable); void storeFeature(ap_uint<256> *ddr,FEATDTYPE unified_buffer[512][MXU_COLNUM], unsigned offset,short addr, short len, bool enable); void instr(ap_uint<64> *ddr,unsigned &offset,ap_int<16> reggroup[96],ap_int<8> &runmode,bool enable); void config(ap_int<16> reggroup[96],MXU_PARAM &mxuparam,RELPOOL_PARAM &poolparam, LDST_PARAM &lsdtparam, ap_int<32> norm_coef[32]); void tpu(ap_uint<256> *ddr,ap_uint<64> *ddr_instr);