本文在前文代码的基础上进行拓展,通过引入40G以太网子系统,实现万兆网卡的接口带宽要求。本文主要对功能进行实现,整体结构简化如下:
FPGA使用Xilinx Ultrascale+ VCU128开发板,操作系统使用Ubuntu20.04,使用到的驱动代码与RHEL8.8(RedHat)兼容。本文主要介绍以太网子系统(Ethernet Subsystem)部分的基本功能实现,其他部分及驱动程序的实现可参照《基于FPGA的千兆网卡实现(一)——回环测试》。本文分为FPGA代码实现、上板测试两个部分。
FPGA代码实现
40G以太网子系统IP核配置及使用
由于PCIe相关模块数据位宽为128位,为了简便起见,这里使用128bit跨接(Straddle)AXIS接口进行数据的收发过程。
由于接口速率提升,在内部时钟频率保持不变的情况下,AXIS的数据位宽会变大,这意味着如果某个时钟周期只单纯发送单个数据包的结尾,数据包间的延时会变大。
为此40G以太网引入了跨接AXIS4接口,跨接AXIS4接口类似于普通AXIS接口,它通过引入额外的tuser字段标识,支持在一个时钟周期内发送或接收两个相邻数据包的开头部分(不超过64bit)与结尾部分(不超过64bit)。
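tuser[69:0]字段分为以下几个子字段(个人理解总结如下,具体含义请查阅手册):
- tuser[55:0]:preamblein,前导码输入(仅发送方向使用);
- tuser[56] / tuser[63]:ena0 / ena1,低64bit段 / 高64bit段使能;
- tuser[57] / tuser[64]:sop0 / sop1,数据包在低64bit段 / 高64bit段开始;
- tuser[58] / tuser[65]:eop0 / eop1,数据包在低64bit段 / 高64bit段结束;
- tuser[61:59] / tuser[68:66]:mty0 / mty1,对应段在数据包结尾处的无效(空)字节数;
- tuser[62] / tuser[69]:err0 / err1,对应段的错误标志。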
tuser字段随tvalid与tready字段搭配使用,用于标识tdata的有效字节。下面有两张图便于理解文字描述,第一张图描述单个数据包的发送/接收过程,数据包末尾只发送9个字节有效数据,mty0为0代表低64bit有效,mty1为7代表高56bit无效。
第二张图描述两个背靠背数据包的发送/接收过程,解释类似上图。
n2s/s2n模块
在本次实现中,暂时不考虑两个数据包头尾位于同个时钟周期的跨接情况,标准AXIS到跨接AXIS的主要代码如下:
// Standard AXIS -> straddle AXIS (TX): forward the beat unchanged and derive sop0.
// tvalid/tlast/tdata are passed straight through and back-pressure is returned
// to the upstream source without modification.
always_comb begin
    eth_axis_tx_tready = n2s_axis_tx_tready;
    n2s_axis_tx_tdata  = eth_axis_tx_tdata;
    n2s_axis_tx_tlast  = eth_axis_tx_tlast;
    n2s_axis_tx_tvalid = eth_axis_tx_tvalid;

    // sop0: asserted on a valid beat while no packet is in flight.
    // NOTE(review): tx_packet_unfinish_r is declared outside this excerpt;
    // presumably a registered "packet in progress" flag - confirm.
    n2s_axis_tx_tuser[57] = 1'b0; // sop0
    if (~tx_packet_unfinish_r & n2s_axis_tx_tvalid) begin
        n2s_axis_tx_tuser[57] = 1'b1; // sop0
    end
end
// Standard AXIS -> straddle AXIS (TX): encode tkeep/tlast into the tuser side-band.
// Only contiguous, LSB-aligned tkeep patterns (1..16 valid bytes) are recognised;
// any other pattern falls into the default branches and yields all-zero fields.
// A packet start is never placed in the upper lane here (sop1 is tied low),
// so two packets never share one beat.
always_comb begin
    // Constant fields.
    n2s_axis_tx_tuser[55:0] = {7{8'h55}}; // preamblein
    n2s_axis_tx_tuser[62]   = 1'b0;       // err0
    n2s_axis_tx_tuser[64]   = 1'b0;       // sop1
    n2s_axis_tx_tuser[69]   = 1'b0;       // err1

    // {ena1, ena0}: which 64-bit lanes carry data on a valid beat.
    {n2s_axis_tx_tuser[63], n2s_axis_tx_tuser[56]} = 2'b00;
    if (eth_axis_tx_tvalid) begin
        case (eth_axis_tx_tkeep[15:0])
            // 9..16 valid bytes: both lanes enabled
            16'hffff, 16'h7fff, 16'h3fff, 16'h1fff,
            16'h0fff, 16'h07ff, 16'h03ff, 16'h01ff:
                {n2s_axis_tx_tuser[63], n2s_axis_tx_tuser[56]} = 2'b11;
            // 1..8 valid bytes: lower lane only
            16'h00ff, 16'h007f, 16'h003f, 16'h001f,
            16'h000f, 16'h0007, 16'h0003, 16'h0001:
                {n2s_axis_tx_tuser[63], n2s_axis_tx_tuser[56]} = 2'b01;
            // unexpected tkeep pattern
            default:
                {n2s_axis_tx_tuser[63], n2s_axis_tx_tuser[56]} = 2'b00;
        endcase
    end

    // {eop1, eop0}: lane in which the packet ends, only on a valid last beat.
    {n2s_axis_tx_tuser[65], n2s_axis_tx_tuser[58]} = 2'b00;
    if (eth_axis_tx_tvalid & eth_axis_tx_tlast) begin
        case (eth_axis_tx_tkeep[15:0])
            // packet ends in the upper lane
            16'hffff, 16'h7fff, 16'h3fff, 16'h1fff,
            16'h0fff, 16'h07ff, 16'h03ff, 16'h01ff:
                {n2s_axis_tx_tuser[65], n2s_axis_tx_tuser[58]} = 2'b10;
            // packet ends in the lower lane
            16'h00ff, 16'h007f, 16'h003f, 16'h001f,
            16'h000f, 16'h0007, 16'h0003, 16'h0001:
                {n2s_axis_tx_tuser[65], n2s_axis_tx_tuser[58]} = 2'b01;
            // unexpected tkeep pattern
            default:
                {n2s_axis_tx_tuser[65], n2s_axis_tx_tuser[58]} = 2'b00;
        endcase
    end

    // {mty1, mty0}: number of empty bytes in each lane, derived from tkeep alone
    // (not gated by tvalid/tlast).
    case (eth_axis_tx_tkeep[15:0])
        16'hffff: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd0, 3'd0};
        16'h7fff: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd1, 3'd0};
        16'h3fff: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd2, 3'd0};
        16'h1fff: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd3, 3'd0};
        16'h0fff: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd4, 3'd0};
        16'h07ff: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd5, 3'd0};
        16'h03ff: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd6, 3'd0};
        16'h01ff: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd7, 3'd0};
        16'h00ff: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd0, 3'd0};
        16'h007f: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd0, 3'd1};
        16'h003f: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd0, 3'd2};
        16'h001f: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd0, 3'd3};
        16'h000f: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd0, 3'd4};
        16'h0007: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd0, 3'd5};
        16'h0003: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd0, 3'd6};
        16'h0001: {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd0, 3'd7};
        // unexpected tkeep pattern
        default:  {n2s_axis_tx_tuser[68:66], n2s_axis_tx_tuser[61:59]} = {3'd0, 3'd0};
    endcase
end
跨接AXIS到标准AXIS的主要代码如下:
// Straddle AXIS -> standard AXIS (RX): rebuild tvalid/tlast/tkeep from tuser.
// tuser bit usage in this block: [57] sop0, [58] eop0, [61:59] mty0,
// [64] sop1, [65] eop1, [68:66] mty1.
always_comb begin
    s2n_axis_rx_tdata = eth_axis_rx_tdata_padding;

    // tvalid: forced high on a beat that starts a packet in the lower lane while
    // the downstream programmable-ready is asserted; otherwise it follows
    // packet_unfinish_r (declared outside this excerpt).
    s2n_axis_rx_tvalid = packet_unfinish_r;
    if (s2n_axis_rx_prog_tready & eth_axis_rx_tvalid_padding & eth_axis_rx_tuser_padding[57]) begin
        s2n_axis_rx_tvalid = 1'b1;
    end

    // tlast: the packet ends in either lane (eop1 or eop0).
    s2n_axis_rx_tlast = 1'b0;
    if (eth_axis_rx_tuser_padding[65] | eth_axis_rx_tuser_padding[58]) begin
        s2n_axis_rx_tlast = 1'b1;
    end

    // Upper-lane tkeep: decoded from mty1. When the packet already ended in the
    // lower lane (eop0) and no new packet starts in the upper lane (~sop1), the
    // selector MSB is 1 and the default branch clears the upper lane.
    case ({eth_axis_rx_tuser_padding[58] & ~eth_axis_rx_tuser_padding[64], eth_axis_rx_tuser_padding[68:66]})
        4'b0_000: s2n_axis_rx_tkeep[15:8] = 8'hff;
        4'b0_001: s2n_axis_rx_tkeep[15:8] = 8'h7f;
        4'b0_010: s2n_axis_rx_tkeep[15:8] = 8'h3f;
        4'b0_011: s2n_axis_rx_tkeep[15:8] = 8'h1f;
        4'b0_100: s2n_axis_rx_tkeep[15:8] = 8'h0f;
        4'b0_101: s2n_axis_rx_tkeep[15:8] = 8'h07;
        4'b0_110: s2n_axis_rx_tkeep[15:8] = 8'h03;
        4'b0_111: s2n_axis_rx_tkeep[15:8] = 8'h01;
        default:  s2n_axis_rx_tkeep[15:8] = 8'h00;
    endcase

    // Lower-lane tkeep: decoded from mty0 unconditionally.
    // NOTE(review): the original author flagged a known bug here related to
    // packet_unfinish_r; the decode is not qualified by any enable.
    case (eth_axis_rx_tuser_padding[61:59])
        3'b000:  s2n_axis_rx_tkeep[7:0] = 8'hff;
        3'b001:  s2n_axis_rx_tkeep[7:0] = 8'h7f;
        3'b010:  s2n_axis_rx_tkeep[7:0] = 8'h3f;
        3'b011:  s2n_axis_rx_tkeep[7:0] = 8'h1f;
        3'b100:  s2n_axis_rx_tkeep[7:0] = 8'h0f;
        3'b101:  s2n_axis_rx_tkeep[7:0] = 8'h07;
        3'b110:  s2n_axis_rx_tkeep[7:0] = 8'h03;
        3'b111:  s2n_axis_rx_tkeep[7:0] = 8'h01;
        default: s2n_axis_rx_tkeep[7:0] = 8'h00; // only reachable with X/Z selector bits
    endcase
end
跨时钟域模块
由于40G以太网部分用户时钟为312.5MHz(40Gbit/s ÷ 128bit),PCIe部分用户时钟为250MHz,这里使用前文实现的异步FWFT FIFO进行跨时钟数据传输。例化接口如下。
// TX clock-domain crossing FIFO between the PCIe clock and the Ethernet TX clock.
// Each FIFO word packs {tlast, tkeep[15:0], tdata[127:0]} = 1 + 16 + 128 = 145 bits.
async_fifo_huge #(
    .FIFO_DEPTH  (2),
    .TDATA_WIDTH (145)
) tx_async_fifo_huge_inst (
    // Write-side ports (wr_en / wfull), fed from the s_axis_tx_* stream.
    .m_clk        (pcie_clk_250MHz),
    .wr_en        (s_axis_tx_tvalid),
    .m_axis_tdata ({s_axis_tx_tlast, s_axis_tx_tkeep[15:0], s_axis_tx_tdata[127:0]}),
    .wfull        (tx_async_fifo_huge_wfull),
    // Read-side ports (rd_en / rempty), producing the cdc_axis_tx_* stream.
    .s_clk        (n2s_axis_tx_clk),
    .rd_en        (cdc_axis_tx_tready),
    .s_axis_tdata ({cdc_axis_tx_tlast, cdc_axis_tx_tkeep, cdc_axis_tx_tdata}),
    .rempty       (tx_async_fifo_huge_rempty)
);
对于发送部分逻辑而言,由于以太网接口输入AXIS要求数据包开始发送(即tvalid有效)后必须连续发送数据,直到数据包完整结束(即tlast置位),为此这里使用前文实现的AXIS PACKET FIFO进行数据包缓存,即等到输入端收到一个完整数据包后,再向输出端输出数据。这里假设单个数据包大小(即MTU)不超过1500字节,对应128bit(16字节)位宽的FIFO深度约为1500÷16≈94,这里留足余量,将深度取为1024。例化接口如下。
// TX packet-mode FIFO: buffers the packet so it is sent as a whole
// (packet mode, max supported packet size 1500 bytes).
// tkeep and tdata share the data bus: 16 + 128 = 144 bits; tlast has its own port.
axis_fifo_packet #(
    .FIFO_DEPTH  (1024),
    .TDATA_WIDTH (144)
) tx_axis_fifo_packet_inst (
    .clk           (n2s_axis_tx_clk),
    // m_axis_* ports, connected to the cdc_axis_tx_* stream
    .m_axis_tdata  ({cdc_axis_tx_tkeep, cdc_axis_tx_tdata}),
    .m_axis_tlast  (cdc_axis_tx_tlast),
    .m_axis_tvalid (cdc_axis_tx_tvalid),
    .m_axis_tready (cdc_axis_tx_tready),
    // s_axis_* ports, connected to the eth_axis_tx_* stream
    .s_axis_tdata  ({eth_axis_tx_tkeep, eth_axis_tx_tdata}),
    .s_axis_tlast  (eth_axis_tx_tlast),
    .s_axis_tvalid (eth_axis_tx_tvalid),
    .s_axis_tready (eth_axis_tx_tready)
);
对于接收部分逻辑而言,由于PCIe侧目前存在待优化的部分,无法保证随时能够接收数据包,为了防止数据包写入途中FIFO变满无法写入完整数据包,需要使用prog_wfull保证FIFO剩余空间大小能够容纳MTU大小数据包,进而保证PCIe部分逻辑每次能够接收到完整的数据包。
具体做法是:以太网接收端在每次收到数据包开始或结束标志时,检查当前prog_wfull是否置位;若未置位,则后续数据包在继续判断的同时正常写入FIFO,否则后续数据包只做判断、不写入FIFO。例化接口如下。
// RX FIFO with a programmable-full flag, clocked by the Ethernet RX clock.
// Each FIFO word packs {tlast, tkeep[15:0], tdata[127:0]} = 1 + 16 + 128 = 145 bits.
axis_fifo_whole #(
    .FIFO_DEPTH      (1024),
    .PROG_FULL_DEPTH (100),  // prog_wfull asserts when fewer than 100 free entries remain
    .TDATA_WIDTH     (145)
) axis_fifo_whole_inst (
    .clk          (eth_axis_rx_clk),
    // Write-side ports, fed from the s2n_axis_rx_* stream
    .wr_en        (s2n_axis_rx_tvalid),
    .m_axis_tdata ({s2n_axis_rx_tlast, s2n_axis_rx_tkeep[15:0], s2n_axis_rx_tdata[127:0]}),
    .wfull        (axis_fifo_whole_wfull),
    .prog_wfull   (axis_fifo_whole_prog_wfull),
    // Read-side ports, producing the cdc_axis_rx_* stream
    .rd_en        (cdc_axis_rx_tready),
    .s_axis_tdata ({cdc_axis_rx_tlast, cdc_axis_rx_tkeep[15:0], cdc_axis_rx_tdata[127:0]}),
    .rempty       (axis_fifo_whole_rempty)
);
跨SLR模块
对于VCU128开发板,其PCIe接口位于SLR0的右侧,QSFP接口位于SLR2的左侧,为了防止时序违例,需要在跨SLR处添加寄存器。例化接口如下。
// Register stage on the send path where it crosses between SLRs.
// The data bus packs {tlast, tkeep, tdata} into a 145-bit word.
axis_slr_register #(
    .FIFO_DEPTH  (1024),
    .TDATA_WIDTH (145)
) send_axis_slr_register_inst (
    .clk          (clk),
    // Write-side ports, fed from the send_axis_* stream
    .wr_en        (send_axis_tvalid),
    .m_axis_tdata ({send_axis_tlast, send_axis_tkeep, send_axis_tdata}),
    .wfull        (send_axis_slr_register_wfull),
    // Read-side ports, producing the send_axis_slr_* stream
    .rd_en        (send_axis_slr_tready),
    .s_axis_tdata ({send_axis_slr_tlast, send_axis_slr_tkeep, send_axis_slr_tdata}),
    .rempty       (send_axis_slr_register_rempty)
);
上板测试
本文在两台FPGA上烧录了比特流,并使用两台装有Ubuntu20.04的PC机A机和B机进行测试,首先通过ifconfig设置每台机器的ip地址,这里系统给两台FPGA分配的网卡名称均为eth0。
sudo ifconfig eth0 77.6.10.68 # run on host A to set the IP address of its eth0 interface
sudo ifconfig eth0 77.6.10.69 # run on host B to set the IP address of its eth0 interface
ping测试结果如下,可见ping过程中双方主机均能收到数据包,但延时非常大,在软件与硬件上可能都存在优化空间。相比之下,使用集成网卡直接ping的延时在1.5毫秒左右。
此外,主机ping自身IP地址所测时延如下,根据几次ping结果推测,时延主要来自驱动程序软件处理延时,是后续优化的主要部分。
参考链接
https://docs.amd.com/r/en-US/pg211-50g-ethernet
完整代码
完整代码可于同名公众号回复NIC_40G_simple下载。
标签:tuser,网卡,FPGA,tx,16,rx,功能测试,n2s,axis From: https://blog.csdn.net/qq_45434284/article/details/140451678