[Lancelot] Slow lower order byte on fse_d?

I've spent quite some time now trying to understand why I couldn't run 
the SRAM as fast as I expected, and have reached the unavoidable 
conclusion that the lower-order byte on fse_d is slower than the rest (= 
has a higher capacitive load?).  Incidentally, the only thing that's 
different about this byte is that it's also connected to the AMD flash 
RAM, unlike the other bytes.

It would be nice to know if anyone else has seen this.

I have a little test design to illustrate (below).  When run with 
WITH_READ_WAIT=1 it succeeds (all LEDs lit); with WITH_READ_WAIT=0 it 
fails, and from the debugging output we find that it saw ..ca8601 instead 
of the expected ..ca8643.  Note that this is with a 50 MHz clock (20 ns 
period), so we have ample margin over the SRAM's 10 ns fastest read cycle.

I've run many experiments, and they all agree: only the lower byte lags 
behind.  AFAICT, I've correctly disabled the flash RAM and the Ethernet.  
I'm at a loss.


`timescale 1ns/10ps
// SRAM write/read-back self test.
//
// Writes N words of an LFSR sequence (starting from SEED) to addresses
// 0..N-1, then reads them back and compares each word against the same
// sequence.  On success `debug` is driven to all ones; on a mismatch
// `debug` holds the word that was actually read.
//
// NOTE(review): despite its name, `reset_n` puts the state machine into
// S_START while it is HIGH.  Both instantiations in this post drive it
// that way, so the polarity is kept as-is.
module tester(input wire clk20ns,  // 50MHz - 20ns period
               input wire reset_n,

               output reg  [17:0] sram_a,
               inout  wire [31:0] sram_d,
               output reg         sram_cs_n,
               output reg   [3:0] sram_be_n,
               output reg         sram_oe_n,
               output reg         sram_we_n,
               output reg  [31:0] debug);

    // The data bus is driven only while sram_d_out_en is set; otherwise it
    // is released (high-Z) so the SRAM can drive it during reads.
    reg [31:0]        sram_d_out    = 32'd0;
    reg               sram_d_out_en = 1'b0;
    assign            sram_d        = sram_d_out_en ? sram_d_out : 32'hZZZZZZZZ;

    parameter   N = 3*4;
    parameter   SEED = 33'h87654321;
    parameter   WITH_READ_WAIT = 0;  // Flip this to insert a wait state for reading.

    parameter   S_START        = 4'd0;
    parameter   S_PRE_WRITE    = 4'd1;
    parameter   S_WRITING      = 4'd2;
    parameter   S_WROTE        = 4'd3;
    parameter   S_READ         = 4'd4;
    parameter   S_SUCCEED      = 4'd5;
    parameter   S_FAILED       = 4'd6;
    parameter   S_READ_WAIT    = 4'd7; // Wait state for reading

    reg  [ 3:0] state = S_START;

    reg [17:0]  addr;
    reg  [32:0] lfsr;
    wire [32:0] lfsr_next = {lfsr[31:0], ~lfsr[32] ^ lfsr[19]};

    reg  [31:0] data_latched, data_expected;

    always @(posedge clk20ns) if (reset_n)
      state <= S_START;
    else case (state)
            S_START: begin
               sram_d_out_en    <= 1'b1;
               sram_oe_n        <= 1'b1;
               sram_cs_n        <= 1'b0; // Select SRAM
               sram_be_n        <= 4'b0;

               data_latched     <= 0;
               data_expected    <= 0;
               lfsr             <= SEED;
               addr             <= 18'd0;

               // S_WROTE is the state that presents address/data, so the
               // write loop is entered there.
               state            <= S_WROTE;
            end

            /* Writing the SRAM. */

            S_PRE_WRITE: begin                  /* This phase presents the data. */
               sram_we_n        <= 1'b0;        /* Only transition allowed is we. */
               state            <= S_WRITING;
            end

            S_WRITING: begin                    /* This phase writes the data. */
               sram_we_n        <= 1'b1;        /* Only transition allowed is we. */
               state            <= S_WROTE;
            end

            S_WROTE: begin                      /* This phase holds the data. */
               if (addr != N) begin
                  sram_a        <= addr;
                  sram_d_out    <= lfsr[31:0];

                  addr          <= addr + 18'd1;
                  lfsr          <= lfsr_next;

                  state         <= S_PRE_WRITE;
               end else begin
                  // All words written: release the bus, enable the SRAM
                  // outputs and restart address/LFSR for the read pass.
                  sram_a        <= 18'd0;
                  sram_oe_n     <= 1'b0;
                  sram_d_out_en <= 1'b0;
                  sram_d_out    <= 32'h0;

                  addr          <= 18'd1;
                  lfsr          <= SEED;

//                 state         <= WITH_READ_WAIT ? S_READ_WAIT : S_READ;
                  state         <= S_READ_WAIT;
               end
            end

            /* Verifying the SRAM. */

            S_READ_WAIT: begin
                  state         <= S_READ;
            end

            S_READ: begin
               // The comparison checks the word latched on the previous
               // visit; the first visit compares the 0/0 values set up in
               // S_START.
               if (data_latched != data_expected)
                 state          <= S_FAILED;
               else if (sram_a == N)
                 state          <= S_SUCCEED;
               else begin
                  sram_a        <= addr;
                  data_latched  <= sram_d;
                  data_expected <= lfsr[31:0];

                  addr          <= addr + 18'd1;
                  lfsr          <= lfsr_next;

                  state         <= WITH_READ_WAIT ? S_READ_WAIT : S_READ;
               end
            end

            S_SUCCEED: begin
               debug            <= ~0;
               $display("Test passed!");
            end

            S_FAILED: begin
               debug           <= data_latched;
               // sram_a was already advanced past the word that mismatched,
               // so report the previous address together with the values
               // that were actually compared.
               $display("Failed at address %x: got %x, but expected %x",
                        sram_a - 18'd1, data_latched, data_expected);
            end
         endcase
endmodule

// Simulation testbench: connects `tester` to two 16-bit idt71v416s10 SRAM
// models that together form the 32-bit data bus.
//
// NOTE(review): this module shares the name `main` with the board top
// level shown later in the post; they are presumably meant to live in
// separate files - confirm before compiling both together.
module main();
    reg clk = 1;
    reg reset_n = 1;

    // NOTE(review): toggling every 5 time units gives a 10 ns period under
    // `timescale 1ns, while the tester's clock port is named clk20ns -
    // confirm which period is intended.
    always #5 clk = ~clk;

    wire [31:0] sram_d;
    wire [17:0] sram_a;
    wire [ 3:0] sram_be_n;
    wire        sram_we_n, sram_oe_n, sram_cs_n;
    wire [31:0] debug;

    tester tester(clk, reset_n,
                  sram_a, sram_d, sram_cs_n, sram_be_n, sram_oe_n, sram_we_n,
                  debug);

    idt71v416s10 u35(sram_d[15: 0] , sram_a,
                     sram_we_n, sram_oe_n, sram_cs_n,
                     sram_be_n[0], sram_be_n[1]); // Yep, strange order...

    idt71v416s10 u36(sram_d[31:16], sram_a,
                     sram_we_n, sram_oe_n, sram_cs_n,
                     sram_be_n[2], sram_be_n[3]);

    initial begin
       $monitor($time, " %x %x", sram_a, sram_d);
       // The tester is held in S_START while reset_n is high; dropping it
       // lets the test sequence run.
       #321 reset_n = 0;
    end
endmodule
// Board top level: holds the flash and Ethernet devices deselected, runs
// the SRAM `tester` on the shared fse bus and shows its `debug` word on
// the LEDs and the two seven-segment displays.
module main(input  wire         clkin,
             input  wire         reset_n,

             // Flash-SRAM-Ethernet bus
             output wire         flash_cs_n,   // Flash ROM CS#
             output wire         flash_oe_n,

             output wire  [22:0] fse_a,
             inout  wire  [31:0] fse_d,
             output wire         sram_cs_n,
             output wire   [3:0] sram_be_n,
             output wire         sram_oe_n,
             output wire         sram_we_n,
             output wire   [7:0] led,
             output wire   [7:0] s7_0,
             output wire   [7:0] s7_1,

             output wire         enet_aen,    // Ethernet Access Enable
             output wire   [3:0] enet_be_n    // Ethernet byte enables
             );

    // Start-up delay counter: loaded while the external reset_n is low,
    // then counts down to zero and stops there.
    reg  [31:0]      counter;
    wire             my_reset     = counter == 0;

    wire [31:0]      debug;
    // LEDs show debug[23:16] once the counter has expired, and the fixed
    // pattern F0 while it is still counting.
    assign           led          = my_reset ? debug[23:16] : 8'hF0;
    assign           {s7_1,s7_0}  = ~debug[15:0];
    // The tester drives only fse_a[19:2]; the remaining address bits are
    // tied low.
    assign           fse_a[1:0]   = 2'd0;
    assign           fse_a[22:20] = 3'd0;

    // Drive the other devices' control lines to constant 1
    // (presumably their inactive level - confirm against the board docs).
    assign           flash_cs_n   = 1'b1;
    assign           flash_oe_n   = 1'b1;
    assign           enet_aen     = 1'b1;
    assign           enet_be_n    = 4'b1111;

    always @(posedge clkin) begin
       if (~reset_n)
         counter <= 100000000;
       else if (counter)
         counter <= counter - 1;
    end

    // The tester treats a HIGH level on its reset input as reset, so it is
    // held in its start state until the counter reaches zero.
    tester tester(clkin, ~my_reset,
                  fse_a[19:2], fse_d, sram_cs_n, sram_be_n, sram_oe_n, sram_we_n,
                  debug);
endmodule
       0000          0000
      5    1        5    1
      5    1        5    1
       6666          6666
      4    2        4    2
      4    2        4    2
       3333   77     3333   77

       s7_1          s7_0


