Featured image of post Optimizing the Pipeline

Optimizing the Pipeline

Optimization of a VHDL pipeline for calculating the address offset and visibility of a sprite.

Today I’m focusing on optimizing a small pipeline I created for a project. The pipeline is responsible for calculating the address offset of a sprite and checking whether it’s visible in the currently requested line.

So far, I’ve relied on register rebalancing to do the job. This does work, but of course, there’s still room for optimization.

Here’s the current code as a starting point:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use ieee.math_real.all;
use work.SpriteRom.all;

entity VerticalSpritePipeline is
    generic (
        --@ Width of the Y position (line) registers in bits
        G_Y_Width        : integer := 10;
        --@ The height of the sprite in pixels
        G_Sprite_Height  : integer := 16;
        --@ Width of the sprite offset (line address) register in bits
        G_Offset_Width   : integer := 8;
        --@ Number of stages passed to each PipelineRegister instance; the pipeline controller
        --@ is configured with G_PipelineStages * 2 (input registers plus output registers)
        G_PipelineStages : integer := 2
        );
    port (
        --@ Clock signal; (**Rising edge** triggered)
        I_CLK : in std_logic := '0';
        --@ Clock enable signal (**Active high**)
        I_CE  : in std_logic := '1';

        --@ @virtualbus VSpritePipeline-OP @dir In Vertical sprite pipeline operation interface
        --@ AXI like ready; (**Synchronous**, **Active high**)
        O_VSpritePipeline_OP_Ready     : out std_logic                                := '0';
        --@ AXI like valid; (**Synchronous**, **Active high**)
        I_VSpritePipeline_OP_Valid     : in  std_logic                                := '0';
        --@ The requested line for which the sprite's visibility is checked.
        I_VSpritePipeline_OP_Y_Request : in  std_logic_vector(G_Y_Width - 1 downto 0) := (others => '0');
        --@ The Y position of the sprite that is compared against the requested line.
        I_VSpritePipeline_OP_Y_Sprite  : in  std_logic_vector(G_Y_Width - 1 downto 0) := (others => '0');
        --@ @end

        --@ @virtualbus VSpritePipeline-Result @dir Out Vertical sprite pipeline result interface
        --@ AXI like ready; (**Synchronous**, **Active high**)
        I_VSpritePipeline_Ready     : in  std_logic                                     := '0';
        --@ AXI like valid; (**Synchronous**, **Active high**)
        O_VSpritePipeline_Valid     : out std_logic                                     := '0';
        --@ Indicates whether the sprite is visible in the requested line.
        O_VSpritePipeline_IsVisible : out std_logic                                     := '0';
        --@ The calculated offset address of the sprite.
        O_VSpritePipeline_Offset    : out std_logic_vector(G_Offset_Width - 1 downto 0) := (others => '0')
     --@ @end
        );
end entity VerticalSpritePipeline;

architecture Rtl of VerticalSpritePipeline is
    --@ Structure: G_PipelineStages input registers ("forward" balancing), one combinational
    --@ process, and G_PipelineStages output registers ("backward" balancing). The logic is not
    --@ split by hand; presumably the balancing options let the synthesis tool distribute it
    --@ across the register stages.

    --@ Registered copy of the requested line
    signal R_Y_Request : std_logic_vector(G_Y_Width - 1 downto 0) := (others => '0');
    --@ Registered copy of the sprite Y position
    signal R_Y_Sprite  : std_logic_vector(G_Y_Width - 1 downto 0) := (others => '0');

    --@ Calculated visibility signal (combinational)
    signal C_IsVisible : std_logic                                     := '0';
    --@ The calculated offset address of the sprite (combinational)
    signal C_Offset    : std_logic_vector(G_Offset_Width - 1 downto 0) := (others => '0');

    --@ Pipeline enable signal, driven by the controller and shared by all pipeline registers
    signal S_CalculatingPipeline_Enable : std_logic := '0';
begin

    --@ Pipeline controller for the calculating pipeline.
    --@ Configured with twice G_PipelineStages because the data passes the input registers
    --@ and the output registers.
    I_CalculatingPipelineCtrl : entity work.PipelineController
        generic map(
            G_PipelineStages => G_PipelineStages * 2
            )
        port map(
            I_CLK    => I_CLK,
            I_CE     => I_CE,
            O_Enable => S_CalculatingPipeline_Enable,
            I_Valid  => I_VSpritePipeline_OP_Valid,
            O_Ready  => O_VSpritePipeline_OP_Ready,
            O_Valid  => O_VSpritePipeline_Valid,
            I_Ready  => I_VSpritePipeline_Ready
            );

    --@ Input register for the Y position of the sprite
    I_Y_InputRegister : entity work.PipelineRegister
        generic map(
            G_PipelineStages    => G_PipelineStages,
            G_Width             => G_Y_Width,
            G_RegisterBalancing => "forward"
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => S_CalculatingPipeline_Enable,
            I_Data   => I_VSpritePipeline_OP_Y_Sprite,
            O_Data   => R_Y_Sprite
            );

    --@ Input register for the requested line
    I_YToCheck_InputRegister : entity work.PipelineRegister
        generic map(
            G_PipelineStages    => G_PipelineStages,
            G_Width             => G_Y_Width,
            G_RegisterBalancing => "forward"
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => S_CalculatingPipeline_Enable,
            I_Data   => I_VSpritePipeline_OP_Y_Request,
            O_Data   => R_Y_Request
            );

    --@ Combinational process to calculate the visibility and offset of the sprite.
    P_CalculateVisibility : process (R_Y_Sprite, R_Y_Request)
        variable V_Y_Sprite      : unsigned(R_Y_Sprite'range);
        variable V_Y_Request     : unsigned(R_Y_Request'range);
        variable V_SpriteYBottom : unsigned(R_Y_Sprite'range);
        variable V_OffsetLine    : integer;
        variable V_Offset        : unsigned(C_Offset'range);
    begin
        V_Y_Sprite      := unsigned(R_Y_Sprite);
        V_Y_Request     := unsigned(R_Y_Request);
        -- Last line covered by the sprite: top line + (height - 1).
        -- The sum has the same width as R_Y_Sprite, so it wraps on overflow.
        V_SpriteYBottom := V_Y_Sprite + to_unsigned(G_Sprite_Height - 1, R_Y_Sprite'length);

        -- Visible when the requested line lies within [top, bottom] (both inclusive).
        if V_Y_Request >= V_Y_Sprite and
            V_Y_Request <= V_SpriteYBottom then
            C_IsVisible <= '1';
        else
            C_IsVisible <= '0';
        end if;

        -- Row of the sprite that falls on the requested line (unsigned subtraction, wraps).
        V_OffsetLine := to_integer(V_Y_Request - V_Y_Sprite);
        -- Simulation-only guard (excluded from synthesis by the pragmas): falls back to
        -- row 0 when the row is outside the table, presumably to avoid an index range
        -- error in the simulator when the sprite is not on the requested line.
        -- pragma translate_off
        if V_OffsetLine < 0 or V_OffsetLine >= K_SPRITE_ROW_OFFSETS'length then
            V_OffsetLine := 0;
        end if;
        -- pragma translate_on

        -- Look up the address offset of that row in the table from work.SpriteRom.
        V_Offset := to_unsigned(K_SPRITE_ROW_OFFSETS(V_OffsetLine), C_Offset'length);
        C_Offset <= std_logic_vector(V_Offset);
    end process;

    --@ Output register for the visibility of the sprite
    I_IsVisible_OutputRegister : entity work.PipelineRegister
        generic map(
            G_PipelineStages    => G_PipelineStages,
            G_Width             => 1,
            G_RegisterBalancing => "backward"
            )
        port map(
            I_CLK     => I_CLK,
            I_Enable  => S_CalculatingPipeline_Enable,
            I_Data(0) => C_IsVisible,
            O_Data(0) => O_VSpritePipeline_IsVisible
            );

    --@ Output register for the offset of the sprite
    I_Offset_OutputRegister : entity work.PipelineRegister
        generic map(
            G_PipelineStages    => G_PipelineStages,
            G_Width             => G_Offset_Width,
            G_RegisterBalancing => "backward"
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => S_CalculatingPipeline_Enable,
            I_Data   => C_Offset,
            O_Data   => O_VSpritePipeline_Offset
            );
end architecture;

As you can see, how the computation is split across the pipeline stages is currently left entirely to the synthesis tool. The goal now is to find a performance-optimized breakdown of the logic by hand.

Within the calculation process, we find two relevant calculations and one comparison:

  1. Calculate the bottom edge of the sprite:

    1
    
    V_SpriteYBottom := V_Y_Sprite + to_unsigned(G_Sprite_Height - 1, R_Y_Sprite'length); -- top line + (height - 1)
    
  2. Visibility check:

    1
    2
    
    if V_Y_Request >= V_Y_Sprite and           -- requested line is at or below the sprite's top line
       V_Y_Request <= V_SpriteYBottom then     -- ... and at or above its bottom line
    
  3. Calculate the address offset:

    1
    
    V_OffsetLine := to_integer(V_Y_Request - V_Y_Sprite); -- sprite row that falls on the requested line
    

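Operation 2 depends on the result of operation 1, so we’ll compute the bottom edge in a separate, earlier stage. Operation 3 only needs the two input values, so it can run in parallel with operation 2 in the following stage.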

To prepare, we’ll first refactor the registers. We’ll rename them and switch from dynamically to statically defined pipeline stages.

We modify the pipeline controller as follows:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
    --@ Pipeline controller: drives the enable shared by all pipeline registers and
    --@ connects the valid/ready handshake ports of the entity.
    INST_VSpritePipeline_Ctrl : entity work.PipelineController
        generic map(
            G_PipelineStages => 3       -- TODO
            )
        port map(
            I_CLK    => I_CLK,
            I_CE     => I_CE,
            O_Enable => O_VSpritePipeline_Ctrl_Enable,
            I_Valid  => I_VSpritePipeline_OP_Valid,
            O_Ready  => O_VSpritePipeline_OP_Ready,
            O_Valid  => O_VSpritePipeline_Valid,
            I_Ready  => I_VSpritePipeline_Ready
            );

We also reduce the input registers to a single stage:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
    --@ Stage 0: input register for the sprite Y position
    INST0_VSpritePipeline_Y_Sprite : entity work.PipelineRegister
        generic map(
            G_Width => G_Y_Width
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => O_VSpritePipeline_Ctrl_Enable,
            I_Data   => I_VSpritePipeline_OP_Y_Sprite,
            O_Data   => R0_Y_Sprite
            );

    --@ Stage 0: input register for the requested line
    INST0_VSpritePipeline_Y_Request : entity work.PipelineRegister
        generic map(
            G_Width => G_Y_Width
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => O_VSpritePipeline_Ctrl_Enable,
            I_Data   => I_VSpritePipeline_OP_Y_Request,
            O_Data   => R0_Y_Request
            );

We introduce new combinational and registered signals C_Y_Bottom_Sprite and R_Y_Bottom_Sprite and calculate the first stage using a concurrent signal assignment to avoid embedding this logic in a process:

1
2
3
4
    --@ Calculate the bottom Y position of the sprite from the stage-0 registered
    --@ sprite position: top line + (height - 1)
    C_Y_Bottom_Sprite <= std_logic_vector(
        unsigned(R0_Y_Sprite) + to_unsigned(G_Sprite_Height - 1, G_Y_Width)
        );

And we need a register for the result:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
    --@ Stage 1: register for the calculated bottom Y position of the sprite
    INST_VSpritePipeline_Y_Bottom_Sprite : entity work.PipelineRegister
        generic map(
            G_Width => G_Y_Width
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => O_VSpritePipeline_Ctrl_Enable,
            I_Data   => C_Y_Bottom_Sprite,
            O_Data   => R_Y_Bottom_Sprite
            );

To carry forward base values, we introduce additional registers:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
    --@ Stage 1: carries the sprite Y position forward alongside the bottom-edge register
    INST1_VSpritePipeline_Y_Sprite : entity work.PipelineRegister
        generic map(
            G_Width => G_Y_Width
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => O_VSpritePipeline_Ctrl_Enable,
            I_Data   => R0_Y_Sprite,
            O_Data   => R1_Y_Sprite
            );

    --@ Stage 1: carries the requested line forward alongside the bottom-edge register
    INST1_VSpritePipeline_Y_Request : entity work.PipelineRegister
        generic map(
            G_Width => G_Y_Width
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => O_VSpritePipeline_Ctrl_Enable,
            I_Data   => R0_Y_Request,
            O_Data   => R1_Y_Request
            );

With the registered signals R1_Y_Sprite, R1_Y_Request, and R_Y_Bottom_Sprite in the next stage, we can now perform operations 2 and 3 in parallel:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
    --@ Calculate the visibility of the sprite: the requested line must lie within
    --@ [top, bottom] of the sprite (both inclusive)
    C_IsVisible <= '1' when (
        (unsigned(R1_Y_Request) >= unsigned(R1_Y_Sprite)) and
        (unsigned(R1_Y_Request) <= unsigned(R_Y_Bottom_Sprite))
        ) else '0';

    --@ Calculate the offset address of the sprite.
    --@ Runs in parallel with the visibility check; it only needs the stage-1 copies
    --@ of the requested line and the sprite position.
    P_CalculateOffset : process (R1_Y_Request, R1_Y_Sprite)
        variable V_OffsetLine : integer;
    begin
        -- Sprite row that falls on the requested line (unsigned subtraction, wraps).
        V_OffsetLine := to_integer(unsigned(R1_Y_Request) - unsigned(R1_Y_Sprite));
        -- Simulation-only guard, same as in the original process: when the sprite is
        -- not on the requested line the row lies outside the table and indexing it
        -- would raise a range error in the simulator.
        -- pragma translate_off
        if V_OffsetLine < 0 or V_OffsetLine >= K_SPRITE_ROW_OFFSETS'length then
            V_OffsetLine := 0;
        end if;
        -- pragma translate_on
        C_Offset <= std_logic_vector(
            to_unsigned(K_SPRITE_ROW_OFFSETS(V_OffsetLine), C_Offset'length)
            );
    end process;

Finally, we store these values in the output registers:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
    --@ Stage 2: output register for the visibility flag
    INST_IsVisible_OutputRegister : entity work.PipelineRegister
        generic map(
            G_Width => 1
            )
        port map(
            I_CLK     => I_CLK,
            I_Enable  => O_VSpritePipeline_Ctrl_Enable,
            I_Data(0) => C_IsVisible,
            O_Data(0) => O_VSpritePipeline_IsVisible
            );

    --@ Stage 2: output register for the sprite offset address
    INST_Offset_OutputRegister : entity work.PipelineRegister
        generic map(
            G_Width => G_Offset_Width
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => O_VSpritePipeline_Ctrl_Enable,
            I_Data   => C_Offset,
            O_Data   => O_VSpritePipeline_Offset
            );

The only remaining change is setting the pipeline controller’s G_PipelineStages generic to 3, which we’ve already done at the top. We can now remove the -- TODO comment.

The code now looks like this:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use ieee.math_real.all;
use work.SpriteRom.all;

entity VerticalSpritePipeline is
    generic (
        --@ Width of the Y position (line) registers in bits
        G_Y_Width        : integer := 10;
        --@ The height of the sprite in pixels
        G_Sprite_Height  : integer := 16;
        --@ Width of the sprite offset (line address) register in bits
        G_Offset_Width   : integer := 8;
        --@ Pipeline stage count.
        --@ NOTE(review): the Rtl architecture in this listing does not reference this generic
        --@ (the controller is instantiated with a fixed value of 3); presumably it is kept so
        --@ that existing instantiations still compile - confirm.
        G_PipelineStages : integer := 2
        );
    port (
        --@ Clock signal; (**Rising edge** triggered)
        I_CLK : in std_logic := '0';
        --@ Clock enable signal (**Active high**)
        I_CE  : in std_logic := '1';

        --@ @virtualbus VSpritePipeline-OP @dir In Vertical sprite pipeline operation interface
        --@ AXI like ready; (**Synchronous**, **Active high**)
        O_VSpritePipeline_OP_Ready     : out std_logic                                := '0';
        --@ AXI like valid; (**Synchronous**, **Active high**)
        I_VSpritePipeline_OP_Valid     : in  std_logic                                := '0';
        --@ The requested line for which the sprite's visibility is checked.
        I_VSpritePipeline_OP_Y_Request : in  std_logic_vector(G_Y_Width - 1 downto 0) := (others => '0');
        --@ The Y position of the sprite that is compared against the requested line.
        I_VSpritePipeline_OP_Y_Sprite  : in  std_logic_vector(G_Y_Width - 1 downto 0) := (others => '0');
        --@ @end

        --@ @virtualbus VSpritePipeline-Result @dir Out Vertical sprite pipeline result interface
        --@ AXI like ready; (**Synchronous**, **Active high**)
        I_VSpritePipeline_Ready     : in  std_logic                                     := '0';
        --@ AXI like valid; (**Synchronous**, **Active high**)
        O_VSpritePipeline_Valid     : out std_logic                                     := '0';
        --@ Indicates whether the sprite is visible in the requested line.
        O_VSpritePipeline_IsVisible : out std_logic                                     := '0';
        --@ The calculated offset address of the sprite.
        O_VSpritePipeline_Offset    : out std_logic_vector(G_Offset_Width - 1 downto 0) := (others => '0')
     --@ @end
        );
end entity VerticalSpritePipeline;

architecture Rtl of VerticalSpritePipeline is
    --@ Hand-split three-register pipeline:
    --@   R0 registers : capture the inputs
    --@   stage 1      : bottom edge = sprite Y + (height - 1); inputs are carried into R1
    --@   stage 2      : visibility compare and offset lookup in parallel, then output registers

    --@ Requested line: input register (R0) and its copy carried into stage 2 (R1)
    signal R0_Y_Request : std_logic_vector(G_Y_Width - 1 downto 0) := (others => '0');
    signal R1_Y_Request : std_logic_vector(G_Y_Width - 1 downto 0) := (others => '0');
    --@ Sprite Y position: input register (R0) and its copy carried into stage 2 (R1)
    signal R0_Y_Sprite  : std_logic_vector(G_Y_Width - 1 downto 0) := (others => '0');
    signal R1_Y_Sprite  : std_logic_vector(G_Y_Width - 1 downto 0) := (others => '0');

    --@ The bottom Y position of the sprite (combinational value and its register)
    signal C_Y_Bottom_Sprite : std_logic_vector(G_Y_Width - 1 downto 0) := (others => '0');
    signal R_Y_Bottom_Sprite : std_logic_vector(G_Y_Width - 1 downto 0) := (others => '0');

    --@ Calculated visibility signal (combinational)
    signal C_IsVisible : std_logic                                     := '0';
    --@ The calculated offset address of the sprite (combinational)
    signal C_Offset    : std_logic_vector(G_Offset_Width - 1 downto 0) := (others => '0');

    --@ Pipeline enable signal, driven by the controller and shared by all pipeline registers
    signal O_VSpritePipeline_Ctrl_Enable : std_logic := '0';
begin
    --@ Pipeline controller; 3 stages = input registers, stage-1 registers, output registers
    INST_VSpritePipeline_Ctrl : entity work.PipelineController
        generic map(
            G_PipelineStages => 3
            )
        port map(
            I_CLK    => I_CLK,
            I_CE     => I_CE,
            O_Enable => O_VSpritePipeline_Ctrl_Enable,
            I_Valid  => I_VSpritePipeline_OP_Valid,
            O_Ready  => O_VSpritePipeline_OP_Ready,
            O_Valid  => O_VSpritePipeline_Valid,
            I_Ready  => I_VSpritePipeline_Ready
            );

    --@ Stage 0: input register for the sprite Y position
    INST0_VSpritePipeline_Y_Sprite : entity work.PipelineRegister
        generic map(
            G_Width => G_Y_Width
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => O_VSpritePipeline_Ctrl_Enable,
            I_Data   => I_VSpritePipeline_OP_Y_Sprite,
            O_Data   => R0_Y_Sprite
            );

    --@ Stage 0: input register for the requested line
    INST0_VSpritePipeline_Y_Request : entity work.PipelineRegister
        generic map(
            G_Width => G_Y_Width
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => O_VSpritePipeline_Ctrl_Enable,
            I_Data   => I_VSpritePipeline_OP_Y_Request,
            O_Data   => R0_Y_Request
            );

    --@ Calculate the bottom Y position of the sprite: top line + (height - 1)
    C_Y_Bottom_Sprite <= std_logic_vector(
        unsigned(R0_Y_Sprite) + to_unsigned(G_Sprite_Height - 1, G_Y_Width)
        );

    --@ Stage 1: register for the calculated bottom Y position
    INST_VSpritePipeline_Y_Bottom_Sprite : entity work.PipelineRegister
        generic map(
            G_Width => G_Y_Width
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => O_VSpritePipeline_Ctrl_Enable,
            I_Data   => C_Y_Bottom_Sprite,
            O_Data   => R_Y_Bottom_Sprite
            );

    --@ Stage 1: carries the sprite Y position forward alongside the bottom-edge register
    INST1_VSpritePipeline_Y_Sprite : entity work.PipelineRegister
        generic map(
            G_Width => G_Y_Width
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => O_VSpritePipeline_Ctrl_Enable,
            I_Data   => R0_Y_Sprite,
            O_Data   => R1_Y_Sprite
            );

    --@ Stage 1: carries the requested line forward alongside the bottom-edge register
    INST1_VSpritePipeline_Y_Request : entity work.PipelineRegister
        generic map(
            G_Width => G_Y_Width
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => O_VSpritePipeline_Ctrl_Enable,
            I_Data   => R0_Y_Request,
            O_Data   => R1_Y_Request
            );

    --@ Calculate the visibility of the sprite: the requested line must lie within
    --@ [top, bottom] of the sprite (both inclusive)
    C_IsVisible <= '1' when (
        (unsigned(R1_Y_Request) >= unsigned(R1_Y_Sprite)) and
        (unsigned(R1_Y_Request) <= unsigned(R_Y_Bottom_Sprite))
        ) else '0';

    --@ Calculate the offset address of the sprite.
    --@ Runs in parallel with the visibility check; it only needs the stage-1 copies
    --@ of the requested line and the sprite position.
    P_CalculateOffset : process (R1_Y_Request, R1_Y_Sprite)
        variable V_OffsetLine : integer;
    begin
        -- Sprite row that falls on the requested line (unsigned subtraction, wraps).
        V_OffsetLine := to_integer(unsigned(R1_Y_Request) - unsigned(R1_Y_Sprite));
        -- Simulation-only guard: when the sprite is not on the requested line the row
        -- lies outside the table and indexing it would raise a range error in the
        -- simulator. Excluded from synthesis by the pragmas.
        -- pragma translate_off
        if V_OffsetLine < 0 or V_OffsetLine >= K_SPRITE_ROW_OFFSETS'length then
            V_OffsetLine := 0;
        end if;
        -- pragma translate_on
        C_Offset <= std_logic_vector(
            to_unsigned(K_SPRITE_ROW_OFFSETS(V_OffsetLine), C_Offset'length)
            );
    end process;

    --@ Stage 2: output register for the visibility flag
    INST_IsVisible_OutputRegister : entity work.PipelineRegister
        generic map(
            G_Width => 1
            )
        port map(
            I_CLK     => I_CLK,
            I_Enable  => O_VSpritePipeline_Ctrl_Enable,
            I_Data(0) => C_IsVisible,
            O_Data(0) => O_VSpritePipeline_IsVisible
            );

    --@ Stage 2: output register for the sprite offset address
    INST_Offset_OutputRegister : entity work.PipelineRegister
        generic map(
            G_Width => G_Offset_Width
            )
        port map(
            I_CLK    => I_CLK,
            I_Enable => O_VSpritePipeline_Ctrl_Enable,
            I_Data   => C_Offset,
            O_Data   => O_VSpritePipeline_Offset
            );
end architecture;

Compared to the previous version, we now have a pre-processing calculation (stage 1) and have parallelized the visibility and offset calculations (stage 2). This should provide a slight performance improvement. In my overall design, the pipeline has gone from being a bottleneck to a timing-irrelevant component.

During synthesis (pre place & route), we achieved a significant improvement on the target Spartan-3 device:

Before: Minimum period: 5.739ns (Max Frequency: 174.246MHz)
After: Minimum period: 4.857ns (Max Frequency: 205.888MHz)

Thus, we’ve successfully accelerated the pipeline and simplified the code.

Built with Hugo
Theme Stack designed by Jimmy