[GCC] RISC-V: Avoid zero/sign extend for volatile loads.

Posted on 13 February 202121 July 2021 by [email protected]

This bug was initially raised by one of the developers on GCC Bugzilla

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97417

Where for a simple volatile load/store like:

#include <stdbool.h>

extern volatile bool active;

int foo(void)
{
  if (!active) {
    return 42;
  } else {
    return -42;
  }
}

gcc will generate:

foo:
        lui     a5,%hi(active)
        lbu     a5,%lo(active)(a5)
        li      a0,42
        andi    a5,a5,0xff
        beq     a5,zero,.L1
        li      a0,-42
.L1:
        ret

However, the definition of lbu from RISC-V spec mentioned:

” The LW instruction loads a 32-bit value from memory and sign-extends this to 64 bits before storing it in register rd for RV64I. The LWU instruction, on the other hand, zero-extends the 32-bit value from memory for RV64I. LH and LHU are defined analogously for 16-bit values, as are LB and LBU for 8-bit values. The SD, SW, SH, and SB instructions store 64-bit, 32-bit, 16-bit, and 8-bit values from the low bits of register rs2 to memory respectively. “

Therefore that “andi a5,a5,0xff” is obvious needless.

Jim replied:

Comparing with the ARM port, I see that in the ARM port, the movqi pattern emits
(insn 8 7 9 2 (set (reg:SI 117)
(zero_extend:SI (mem/v/c:QI (reg/f:SI 115) [1 active+0 S1 A8]))) “tmp.c\ “:7:7 -1
(nil))
(insn 9 8 10 2 (set (reg:QI 116)
(subreg:QI (reg:SI 117) 0)) “tmp.c”:7:7 -1
(nil))
and then later it combines the subreg operation with the following zero_extend and cancels them out.

Whereas in the RISC-V port, the movqi pattern emits
(insn 9 7 10 2 (set (reg:QI 76)
(mem/v/c:QI (lo_sum:DI (reg:DI 74)
(symbol_ref:DI (“active”) [flags 0xc4] <var_decl 0x7f9f0310312\ 0 active>)) [1 active+0 S1 A8])) “tmp.c”:7:7 -1
(nil))
and then combine refuses to combine the following zero-extend with this insn as the memory operation is volatile. So it seems we need to rewrite the RISC-V port to make movqi and movhi zero extend to si/di mode and then subreg. That probably will require cascading changes to avoid code size and performance regressions.

Looks like a tractable small to medium size project, but will need to wait for a volunteer to work on it.

So the solution looks simple, all we need is change how it moves in gcc/config/riscv/riscv.c

--- a/gcc/config/riscv/riscv.c
+++ b/gcc/config/riscv/riscv.c
@@ -1524,6 +1524,28 @@ riscv_legitimize_const_move (machine_mode mode, rtx dest, rtx src)
 bool
 riscv_legitimize_move (machine_mode mode, rtx dest, rtx src)
 {
+  /* Expand 
+       (set (reg:QI target) (mem:QI (address))) 
+     to
+       (set (reg:DI temp) (zero_extend:DI (mem:QI (address))))
+       (set (reg:QI target) (subreg:QI (reg:DI temp) 0))
+     with auto-sign/zero extend.  */
+  if (GET_MODE_CLASS (mode) == MODE_INT
+      && GET_MODE_SIZE (mode) < UNITS_PER_WORD
+      && can_create_pseudo_p ()
+      && MEM_P (src))
+    {
+      rtx temp_reg;
+      int zero_extend_p;
+
+      temp_reg = gen_reg_rtx (word_mode);
+      zero_extend_p = (LOAD_EXTEND_OP (mode) == ZERO_EXTEND);
+      emit_insn (gen_extend_insn (temp_reg, src, word_mode, mode, 
+                                 zero_extend_p));
+      riscv_emit_move (dest, gen_lowpart (mode, temp_reg));
+      return true;
+    }
+
   if (!register_operand (dest, mode) && !reg_or_0_operand (src, mode))
     {
       rtx reg;

now it appears that we successfully fix this issue, but some of the tests case with the -Os flag fails.

Given following program:

int load1r (int *array)
{
  int a = 0;
  a += array[200];
  a += array[201];
  a += array[202];
  a += array[203];
  return a;
}

with pass_shorten_memrefs defined in gcc/config/riscv/riscv-passes.def, when compiled with

gcc main.c -S -Os

supposedly we should get

load1r:
	addi	a5,a0,768
	lw	a4,36(a5)
	lw	a0,32(a5)
	addw	a0,a0,a4
	lw	a4,40(a5)
	addw	a4,a4,a0
	lw	a0,44(a5)
	addw	a0,a0,a4
	ret

the option -Os should be able to recognize that 4 sequential access to ram and make it in a form of Base+Offset (768+32) (768+36)… to reduce code size. But instead, we get:

load1r:
	lwu	a4,800(a0)
	lwu	a5,804(a0)
	addw	a5,a5,a4
	lwu	a4,808(a0)
	lwu	a0,812(a0)
	addw	a5,a5,a4
	addw	a0,a5,a0
	ret

So it looks like to me that shorten_memrefs pass fails to recognize the pattern with zero/sign extend, additional change to that pass needs to be done:

--- a/gcc/config/riscv/riscv-shorten-memrefs.c
+++ b/gcc/config/riscv/riscv-shorten-memrefs.c
@@ -75,12 +75,19 @@ private:
 
   regno_map * analyze (basic_block bb);
   void transform (regno_map *m, basic_block bb);
-  bool get_si_mem_base_reg (rtx mem, rtx *addr);
+  bool get_si_mem_base_reg (rtx mem, rtx *addr, bool *extend);
 }; // class pass_shorten_memrefs
 
 bool
-pass_shorten_memrefs::get_si_mem_base_reg (rtx mem, rtx *addr)
+pass_shorten_memrefs::get_si_mem_base_reg (rtx mem, rtx *addr, bool *extend)
 {
+  /* Whether it's sign/zero extended.  */
+  if (GET_CODE (mem) == ZERO_EXTEND || GET_CODE (mem) == SIGN_EXTEND)
+    {
+      *extend = true;
+      mem = XEXP (mem, 0);
+    }
+
   if (!MEM_P (mem) || GET_MODE (mem) != SImode)
     return false;
   *addr = XEXP (mem, 0);
@@ -110,7 +117,8 @@ pass_shorten_memrefs::analyze (basic_block bb)
        {
          rtx mem = XEXP (pat, i);
          rtx addr;
-         if (get_si_mem_base_reg (mem, &addr))
+         bool extend = false;
+         if (get_si_mem_base_reg (mem, &addr, &extend))
            {
              HOST_WIDE_INT regno = REGNO (XEXP (addr, 0));
              /* Do not count store zero as these cannot be compressed.  */
@@ -150,7 +158,8 @@ pass_shorten_memrefs::transform (regno_map *m, basic_block bb)
        {
          rtx mem = XEXP (pat, i);
          rtx addr;
-         if (get_si_mem_base_reg (mem, &addr))
+         bool extend = false;
+         if (get_si_mem_base_reg (mem, &addr, &extend))
            {
              HOST_WIDE_INT regno = REGNO (XEXP (addr, 0));
              /* Do not transform store zero as these cannot be compressed.  */
@@ -161,9 +170,20 @@ pass_shorten_memrefs::transform (regno_map *m, basic_block bb)
                }
              if (m->get_or_insert (regno) > 3)
                {
-                 addr
-                   = targetm.legitimize_address (addr, addr, GET_MODE (mem));
-                 XEXP (pat, i) = replace_equiv_address (mem, addr);
+                 if (extend)
+                   {
+                     addr
+                       = targetm.legitimize_address (addr, addr,
+                                                     GET_MODE (XEXP (mem, 0)));
+                     XEXP (XEXP (pat, i), 0)
+                       = replace_equiv_address (XEXP (mem, 0), addr);
+                   }
+                 else
+                   {
+                     addr = targetm.legitimize_address (addr, addr,
+                                                        GET_MODE (mem));
+                     XEXP (pat, i) = replace_equiv_address (mem, addr);
+                   }
                  df_insn_rescan (insn);
                }
            }

However there are still side effects in other passes of gcc backend. The combine pass refuses to do a 2-2 insn combine due to the extra zero/sign extend. My solution is to change that pass:

--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -2635,6 +2635,22 @@ is_just_move (rtx x)
   return (GET_CODE (x) == SET && general_operand (SET_SRC (x), VOIDmode));
 }
 
+/* Return whether X is just a single set, with the source
+   a sign/zero extended general_operand.  */
+static bool
+is_just_extended_move (rtx x)
+{
+  if (INSN_P (x))
+    x = PATTERN (x);
+
+  if(GET_CODE (x) == SET)
+      if (GET_CODE (SET_SRC (x)) == ZERO_EXTEND 
+	      || GET_CODE (SET_SRC (x)) == SIGN_EXTEND)
+		return (general_operand ((XEXP (SET_SRC (x), 0)), VOIDmode));
+
+  return (GET_CODE (x) == SET && general_operand (SET_SRC (x), VOIDmode));
+}
+
 /* Callback function to count autoincs.  */
 
 static int
@@ -3103,8 +3119,8 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
     }
 
   /* Record whether i2 and i3 are trivial moves.  */
-  i2_was_move = is_just_move (i2);
-  i3_was_move = is_just_move (i3);
+  i2_was_move = is_just_move (i2) || is_just_extended_move(i2);
+  i3_was_move = is_just_move (i3) || is_just_extended_move(i3);
 
   /* Record whether I2DEST is used in I2SRC and similarly for the other
      cases.  Knowing this will help in register status updating below.  */

But it will affect all other backend, So Jim proposed the solution:

The combine change is inconvenient. We can’t do that in stage3, and it means we need to make sure that this doesn’t break other targets.

If the combine change is a good idea, then I think you can just modify is_just_move rather than add a new function. Just skip over a ZERO_EXTEND or SIGN_EXTEND before the the general_operand check. We might need a mode check against UNITS_PER_WORD since extending past the word size is not necessarily a simple move.

So the problem stems from the code in combine that is willing to do a 2->2 split if neither original instruction is a simple move. When we add a SIGN_EXTEND or ZERO_EXTEND they aren’t considered simple moves anymore.

There is still the question of why the instruction cost allows the change. First I noticed that riscv_address_cost has a hook to check for shorten_memrefs but that riscv_rtx_costs isn’t calling it. It uses riscv_address_insns instead. So it seems like adding a shorten_memref check to the MEM case riscv_rtx_costs might solve the problem. But riscv_compressed_lw_address_p has a !reload_completed check, and combine runs before reload, so that returns the same result for both the new and old insns. I think that reload_completed check isn’t quite right. If we have a pseudo-reg, then we should allow that, but if we have an offset that is clearly not compressible, then we should reject it before reload. So I think the reload_completed check should be moved down to where it checks for a compressible register. With those two changes, I can make the testcase work without a combine change. Though since I changed how shorten_memrefs works we need a check to make sure this patch doens’t change code size. I haven’t tried to do that yet.

With my changes, in the combine dump, I see Successfully matched this instruction:
(set (reg/f:DI 92)
(plus:DI (reg:DI 96)
(const_int 768 [0x300])))
Successfully matched this instruction:
(set (reg:DI 82 [ MEM[(intD.1 *)array_5(D) + 800B] ])
(sign_extend:DI (mem:SI (plus:DI (reg:DI 96)
(const_int 800 [0x320])) [1 MEM[(intD.1 *)array_5(D) + 800B]+0 S4 A32])))
rejecting combination of insns 27 and 6 original costs 4 + 16 = 20 replacement costs 4 + 20 = 24

So Jim’s change:

--- a/gcc/config/riscv/riscv.c
+++ b/gcc/config/riscv/riscv.c
@@ -891,17 +891,13 @@ riscv_compressed_lw_address_p (rtx x)
   bool result = riscv_classify_address (&addr, x, GET_MODE (x),
                                        reload_completed);
 
-  /* Before reload, assuming all load/stores of valid addresses get compressed
-     gives better code size than checking if the address is reg + small_offset
-     early on.  */
-  if (result && !reload_completed)
-    return true;
-
   /* Return false if address is not compressed_reg + small_offset.  */
   if (!result
       || addr.type != ADDRESS_REG
-      || (!riscv_compressed_reg_p (REGNO (addr.reg))
-           && addr.reg != stack_pointer_rtx)
+      /* Before reload, assume all registers are OK.  */
+      || (reload_completed
+         && !riscv_compressed_reg_p (REGNO (addr.reg))
+         && addr.reg != stack_pointer_rtx)
       || !riscv_compressed_lw_offset_p (addr.offset))
     return false;
 
@@ -1708,6 +1704,13 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
         instructions it needs.  */
       if ((cost = riscv_address_insns (XEXP (x, 0), mode, true)) > 0)
        {
+         /* When optimizing for size, make uncompressible 32-bit addresses
+            more expensive so that compressible 32-bit addresses are
+            preferred.  */
+         if (TARGET_RVC && !speed && riscv_mshorten_memrefs && mode == SImode
+             && !riscv_compressed_lw_address_p (XEXP (x, 0)))
+           cost++;
+
          *total = COSTS_N_INSNS (cost + tune_param->memory_cost);
          return true;
        }

Here are 2 final commits to gcc master branch:

https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=a4953810bac524e19126a2745c75fed58db962c2

https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=18fabc35f47f0abf4ec14d147098ec4e0734d2a3

Leave a Reply Cancel reply