Skidbuffer in Action

Mit meinem Artikel “Minimaler Skidbuffer mit AXI-like Handshaking” habe ich bereits die Grundlage gelegt. Jetzt habe ich das Ganze in Aktion zusammen mit meinen AXI-Pipeline-Modulen auf einem älteren Spartan-3E-FPGA getestet.

Ziel war es, eine Art “Performance-Benchmark” zu erstellen, um die Effizienz des Skidbuffers in einer realen AXI-Pipeline zu demonstrieren. Dafür habe ich ein AXI-Pipeline-Modul erstellt, das sich per Generic (G_EnablePipelineBuffer) zwischen “Mit Skidbuffer” und “Ohne Skidbuffer” umschalten lässt. Das ermöglicht es, die Auswirkungen des Skidbuffers auf die erreichbare Taktfrequenz zu messen.

AXI-Pipeline Module

Folgend der Code des AXI-Pipeline Moduls, das den Skidbuffer integriert:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use ieee.math_real.all;

--@ Single pipeline module with a valid/ready handshake on both sides.
--@ The architecture combines a pipeline controller and pipeline register
--@ with an optional pipeline buffer (skid buffer) in front of the output.
entity Pipeline_pb_Module is
    generic (
        --@ Number of pipeline stages (forwarded to the pipeline controller and
        --@ the pipeline register)
        G_PipelineStages       : integer := 10;
        --@ Data width in bits of I_Data and O_Data
        G_Width                : integer := 32;
        --@ Register balancing attribute (forwarded to the pipeline register)<br>
        --@ - "no" : No register balancing <br>
        --@ - "yes": Register balancing in both directions <br>
        --@ - "forward": Moves a set of FFs at the inputs of a LUT to a single FF at its output. <br>
        --@ - "backward": Moves a single FF at the output of a LUT to a set of FFs at its inputs.
        G_RegisterBalancing    : string  := "no";
        --@ Enable pipeline buffer
        --@ - true  : Use pipeline buffer (buffer controller + buffer are instantiated)
        --@ - false : Direct connection (bypass); no buffer is instantiated
        G_EnablePipelineBuffer : boolean := false
        );
    port (
        --@ Clock
        I_CLK   : in  std_logic;
        --@ Reset; the internal controllers are instantiated with
        --@ G_ResetActiveAt => '1' (presumably active-high; confirm in PipelineController)
        I_RST   : in  std_logic;
        --@ Clock enable, forwarded to the internal controllers
        I_CE    : in  std_logic;
        --- Upstream (input) side: data, valid, and ready back to the sender
        I_Data  : in  std_logic_vector(G_Width - 1 downto 0);
        I_Valid : in  std_logic;
        O_Ready : out std_logic;
        --- Downstream (output) side: data, valid, and ready from the receiver
        O_Data  : out std_logic_vector(G_Width - 1 downto 0);
        O_Valid : out std_logic;
        I_Ready : in  std_logic
        );
end entity Pipeline_pb_Module;

-- Structure of the module:
--   I_Data -> PipelineRegister -> example operation (+3) -> [PipelineBuffer] -> O_Data
-- The valid/ready handshake is handled by PipelineController and, when the
-- buffer is enabled, by PipelineBufferController. When the buffer is disabled
-- the handshake and the processed data are wired straight to the output ports.
architecture RTL of Pipeline_pb_Module is
    -- Enable from the pipeline controller to the pipeline register.
    signal C_PipelineEnable       : std_logic;
    -- Enable vector from the buffer controller to the buffer. It is only driven
    -- when G_EnablePipelineBuffer is true; otherwise it keeps its default value.
    signal C_PipelineBufferEnable : std_logic_vector(1 downto 0) := (others => '0');

    -- Handshake between the input pipeline and the output stage.
    signal R_Valid : std_logic;
    signal R_Ready : std_logic;
    -- Output of the pipeline register.
    signal R_Data  : std_logic_vector(G_Width - 1 downto 0);
    -- R_Data after the example operation; this is the module's payload result.
    signal C_Data  : std_logic_vector(G_Width - 1 downto 0);
begin
    -- Handshake controller for the input pipeline; produces the register enable.
    INST_PipelineControllerIn : entity work.PipelineController
        generic map(
            G_PipelineStages => G_PipelineStages,
            G_ResetActiveAt  => '1'
            )
        port map(
            I_CLK    => I_CLK,
            I_RST    => I_RST,
            I_CE     => I_CE,
            O_Enable => C_PipelineEnable,
            I_Valid  => I_Valid,
            O_Ready  => O_Ready,
            O_Valid  => R_Valid,
            I_Ready  => R_Ready
            );

    -- Data registers of the input pipeline, gated by C_PipelineEnable.
    INST_PipelineRegisterIn : entity work.PipelineRegister
        generic map(
            G_PipelineStages    => G_PipelineStages,
            G_Width             => G_Width,
            G_RegisterBalancing => G_RegisterBalancing
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => C_PipelineEnable,
            I_Data   => I_Data,
            O_Data   => R_Data
            );

    ---------

    -- Example operation, can be replaced with actual logic.
    -- The unsigned addition wraps around at 2**G_Width (numeric_std semantics).
    C_Data <= std_logic_vector(unsigned(R_Data) + 3);

    ---------

    -- Pipeline Buffer Generation based on G_EnablePipelineBuffer
    GEN_PipelineBuffer : if G_EnablePipelineBuffer generate
        -- Handshake controller of the buffer: sits between the internal
        -- R_Valid/R_Ready pair and the module's output handshake.
        INST_PipelineBufferController : entity work.PipelineBufferController
            generic map(
                G_ResetActiveAt => '1'
                )
            port map(
                I_CLK    => I_CLK,
                I_RST    => I_RST,
                I_CE     => I_CE,
                O_Enable => C_PipelineBufferEnable,
                I_Valid  => R_Valid,
                O_Ready  => R_Ready,
                O_Valid  => O_Valid,
                I_Ready  => I_Ready
                );

        -- Data storage of the buffer; receives the processed data C_Data.
        INST_PipelineBuffer : entity work.PipelineBuffer
            generic map(
                G_Width => G_Width
                )
            port map(
                I_CLK    => I_CLK,
                I_Enable => C_PipelineBufferEnable,
                I_Data   => C_Data,
                O_Data   => O_Data
                );
    end generate GEN_PipelineBuffer;

    -- Direct connection when pipeline buffer is disabled
    GEN_PassthroughWithoutBuffer : if not G_EnablePipelineBuffer generate
        O_Valid <= R_Valid;
        -- Forward the processed data (C_Data), exactly as the buffered variant
        -- does. Driving O_Data from R_Data here would drop the example
        -- operation, so the two configurations would compute different results
        -- and the logic would be optimized away in the unbuffered build.
        O_Data  <= C_Data;
        R_Ready <= I_Ready;
    end generate GEN_PassthroughWithoutBuffer;

end architecture RTL;

Performance Benchmark

Das Modul habe ich dann in einem synthesefähigen Performance-Benchmark getestet, um die erreichbare Taktfrequenz zu messen. Dabei konnte ich eine maximale Taktfrequenz von 270 MHz erreichen – mit bis zu 250 Modulen in einer Pipeline.

Der Performance-Benchmark sieht wie folgt aus:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use ieee.math_real.all;

--@ Synthesizable benchmark wrapper: chains G_PipelineModules instances of
--@ Pipeline_pb_Module in series and registers the top-level data and
--@ handshake ports once in the wrapper.
entity Pipeline_pb is
    generic (
        --@ Number of pipeline stages inside each module
        G_PipelineStages       : integer := 2;
        --@ Data width in bits of I_Data and O_Data
        G_Width                : integer := 8;
        --@ Register balancing attribute (forwarded to every module)<br>
        --@ - "no" : No register balancing <br>
        --@ - "yes": Register balancing in both directions <br>
        --@ - "forward": Moves a set of FFs at the inputs of a LUT to a single FF at its output. <br>
        --@ - "backward": Moves a single FF at the output of a LUT to a set of FFs at its inputs.
        G_RegisterBalancing    : string  := "yes";
        --@ Enable pipeline buffer (forwarded to every module)
        --@ - true  : Use pipeline buffer
        --@ - false : Direct connection (bypass)
        G_EnablePipelineBuffer : boolean := true;
        --@ How many Pipeline modules shall be chained?
        G_PipelineModules      : integer := 250;
        --@ Enable chip enable signal
        --@ - true  : I_CE is registered and distributed to all modules
        --@ - false : I_CE is ignored; the internal enable keeps its initial value '1'
        G_Enable_CE            : boolean := false;
        --@ Enable reset signal
        --@ - true  : I_RST is registered and distributed to all modules
        --@ - false : I_RST is ignored; the internal reset keeps its initial value '0'
        G_Enable_RST           : boolean := false
        );
    port (
        --@ Clock; all wrapper registers are clocked on its rising edge
        I_CLK   : in  std_logic;
        --@ Reset input (only used when G_Enable_RST is true)
        I_RST   : in  std_logic;
        --@ Chip enable input (only used when G_Enable_CE is true)
        I_CE    : in  std_logic;
        --- Upstream (input) side; all three signals pass through one wrapper register
        I_Data  : in  std_logic_vector(G_Width - 1 downto 0);
        I_Valid : in  std_logic;
        O_Ready : out std_logic;
        --- Downstream (output) side; all three signals pass through one wrapper register
        O_Data  : out std_logic_vector(G_Width - 1 downto 0);
        O_Valid : out std_logic;
        I_Ready : in  std_logic
        );
end entity Pipeline_pb;

-- Benchmark wrapper around a chain of Pipeline_pb_Module instances.
-- Every top-level data/handshake port is registered once so that pad delays
-- do not influence the timing result; the module chain hangs between these
-- wrapper registers.
architecture RTL of Pipeline_pb is
    ---------------------------------------------------------------------------
    -- Types
    ---------------------------------------------------------------------------
    subtype T_Word is std_logic_vector(G_Width-1 downto 0);
    -- One element per link of the chain: index 0 is the input of the first
    -- module, index G_PipelineModules is the output of the last module.
    type T_DataArray is array(0 to G_PipelineModules) of T_Word;

    ---------------------------------------------------------------------------
    -- Wrapper registers
    ---------------------------------------------------------------------------
    -- Control: the initial values stay in effect when the matching
    -- G_Enable_* generic is false, because nothing drives the signal then.
    signal R_RST : std_logic := '0';
    signal R_CE  : std_logic := '1';

    -- Upstream side
    signal R_DataIn  : T_Word;
    signal R_ValidIn : std_logic;

    -- Downstream side
    signal R_DataOut  : T_Word;
    signal R_ValidOut : std_logic;
    signal R_ReadyIn  : std_logic;

    ---------------------------------------------------------------------------
    -- Synthesis attributes: keep the wrapper registers and do not place them
    -- in the IO blocks.
    ---------------------------------------------------------------------------
    attribute keep : string;
    attribute IOB  : string;

    attribute keep of R_RST, R_CE, R_DataIn, R_ValidIn,
                      R_DataOut, R_ValidOut, R_ReadyIn : signal is "true";
    attribute IOB of R_RST, R_CE, R_DataIn, R_ValidIn,
                     R_DataOut, R_ValidOut, R_ReadyIn : signal is "false";

    ---------------------------------------------------------------------------
    -- Links between the chained modules
    ---------------------------------------------------------------------------
    signal S_Data  : T_DataArray;
    signal S_Valid : std_logic_vector(0 to G_PipelineModules);
    signal S_Ready : std_logic_vector(0 to G_PipelineModules);

begin
    -----------------------------------------------------------------------
    -- Chain ends: first link is fed by the registered inputs, the ready of
    -- the last link comes from the registered downstream ready.
    -----------------------------------------------------------------------
    S_Data (0)                 <= R_DataIn;
    S_Valid(0)                 <= R_ValidIn;
    S_Ready(G_PipelineModules) <= R_ReadyIn;

    -- Output ports are driven directly from the output registers.
    O_Data  <= R_DataOut;
    O_Valid <= R_ValidOut;

    -----------------------------------------------------------------------
    -- Optional control registers
    -----------------------------------------------------------------------
    GEN_Enable_RST : if G_Enable_RST generate
        P_RegisterRST : process(I_CLK)
        begin
            if rising_edge(I_CLK) then
                R_RST <= I_RST;
            end if;
        end process P_RegisterRST;
    end generate GEN_Enable_RST;

    GEN_Enable_CE : if G_Enable_CE generate
        P_RegisterCE : process(I_CLK)
        begin
            if rising_edge(I_CLK) then
                R_CE <= I_CE;
            end if;
        end process P_RegisterCE;
    end generate GEN_Enable_CE;

    -----------------------------------------------------------------------
    -- Wrapper registers, upstream side: sample the inputs and register the
    -- ready that the first module returns.
    -----------------------------------------------------------------------
    BenchFF_Upstream : process(I_CLK)
    begin
        if rising_edge(I_CLK) then
            R_DataIn  <= I_Data;
            R_ValidIn <= I_Valid;
            O_Ready   <= S_Ready(0);
        end if;
    end process BenchFF_Upstream;

    -----------------------------------------------------------------------
    -- Wrapper registers, downstream side: register the result of the last
    -- module and sample the ready coming from outside.
    -----------------------------------------------------------------------
    BenchFF_Downstream : process(I_CLK)
    begin
        if rising_edge(I_CLK) then
            R_DataOut  <= S_Data (G_PipelineModules);
            R_ValidOut <= S_Valid(G_PipelineModules);
            R_ReadyIn  <= I_Ready;
        end if;
    end process BenchFF_Downstream;

    -----------------------------------------------------------------------
    -- The chain itself: module idx reads link idx and writes link idx+1.
    -----------------------------------------------------------------------
    gen_modules : for idx in 0 to G_PipelineModules-1 generate

        P_MOD : entity work.Pipeline_pb_Module
            generic map(
                G_PipelineStages       => G_PipelineStages,
                G_Width                => G_Width,
                G_RegisterBalancing    => G_RegisterBalancing,
                G_EnablePipelineBuffer => G_EnablePipelineBuffer
                )
            port map(
                I_CLK   => I_CLK,
                I_RST   => R_RST,
                I_CE    => R_CE,
                -- Link towards the previous module (or the wrapper inputs)
                I_Data  => S_Data (idx),
                I_Valid => S_Valid(idx),
                O_Ready => S_Ready(idx),
                -- Link towards the next module (or the wrapper outputs)
                O_Data  => S_Data (idx+1),
                O_Valid => S_Valid(idx+1),
                I_Ready => S_Ready(idx+1)
                );

    end generate gen_modules;

end architecture RTL;

Ergebnisse

Durch die Auftrennung des Ready-Signals mittels Skidbuffer entfällt dessen Propagierung durch die gesamte Pipeline. Valid ist dabei sowieso vollkommen durch Register entkoppelt. Dadurch wird jedes Modul hinsichtlich des Timings unabhängig von den anderen Modulen. Unabhängig davon, wie viele Module in der Pipeline sind, bleibt die erreichbare Taktfrequenz konstant bei 270 MHz.

Das Ganze ist auch in der Hinsicht testbar, dass ich den Skidbuffer in der AXI-Pipeline per Generic aktivieren und deaktivieren kann. Dadurch kann ich die Auswirkungen des Skidbuffers auf die erreichbare Taktfrequenz messen. Ohne diesen sind bei 250 Modulen nicht einmal mehr 100 MHz möglich.