This bug was initially raised by one of the developers on GCC Bugzilla
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97417
Where for a simple volatile load/store like:
#include <stdbool.h>
extern volatile bool active;
int foo(void)
{
if (!active) {
return 42;
} else {
return -42;
}
}
gcc will generate:
foo:
lui a5,%hi(active)
lbu a5,%lo(active)(a5)
li a0,42
andi a5,a5,0xff
beq a5,zero,.L1
li a0,-42
.L1:
ret
However, the definition of lbu from RISC-V spec mentioned:
” The LW instruction loads a 32-bit value from memory and sign-extends this to 64 bits before storing it in register rd for RV64I. The LWU instruction, on the other hand, zero-extends the 32-bit value from memory for RV64I. LH and LHU are defined analogously for 16-bit values, as are LB and LBU for 8-bit values. The SD, SW, SH, and SB instructions store 64-bit, 32-bit, 16-bit, and 8-bit values from the low bits of register rs2 to memory respectively. “
Therefore that “andi a5,a5,0xff” is obvious needless.
Jim replied:
Comparing with the ARM port, I see that in the ARM port, the movqi pattern emits
(insn 8 7 9 2 (set (reg:SI 117)
(zero_extend:SI (mem/v/c:QI (reg/f:SI 115) [1 active+0 S1 A8]))) “tmp.c\ “:7:7 -1
(nil))
(insn 9 8 10 2 (set (reg:QI 116)
(subreg:QI (reg:SI 117) 0)) “tmp.c”:7:7 -1
(nil))
and then later it combines the subreg operation with the following zero_extend and cancels them out.
Whereas in the RISC-V port, the movqi pattern emits
(insn 9 7 10 2 (set (reg:QI 76)
(mem/v/c:QI (lo_sum:DI (reg:DI 74)
(symbol_ref:DI (“active”) [flags 0xc4] <var_decl 0x7f9f0310312\ 0 active>)) [1 active+0 S1 A8])) “tmp.c”:7:7 -1
(nil))
and then combine refuses to combine the following zero-extend with this insn as the memory operation is volatile. So it seems we need to rewrite the RISC-V port to make movqi and movhi zero extend to si/di mode and then subreg. That probably will require cascading changes to avoid code size and performance regressions.
Looks like a tractable small to medium size project, but will need to wait for a volunteer to work on it.
So the solution looks simple, all we need is change how it moves in gcc/config/riscv/riscv.c
--- a/gcc/config/riscv/riscv.c
+++ b/gcc/config/riscv/riscv.c
@@ -1524,6 +1524,28 @@ riscv_legitimize_const_move (machine_mode mode, rtx dest, rtx src)
bool
riscv_legitimize_move (machine_mode mode, rtx dest, rtx src)
{
+ /* Expand
+ (set (reg:QI target) (mem:QI (address)))
+ to
+ (set (reg:DI temp) (zero_extend:DI (mem:QI (address))))
+ (set (reg:QI target) (subreg:QI (reg:DI temp) 0))
+ with auto-sign/zero extend. */
+ if (GET_MODE_CLASS (mode) == MODE_INT
+ && GET_MODE_SIZE (mode) < UNITS_PER_WORD
+ && can_create_pseudo_p ()
+ && MEM_P (src))
+ {
+ rtx temp_reg;
+ int zero_extend_p;
+
+ temp_reg = gen_reg_rtx (word_mode);
+ zero_extend_p = (LOAD_EXTEND_OP (mode) == ZERO_EXTEND);
+ emit_insn (gen_extend_insn (temp_reg, src, word_mode, mode,
+ zero_extend_p));
+ riscv_emit_move (dest, gen_lowpart (mode, temp_reg));
+ return true;
+ }
+
if (!register_operand (dest, mode) && !reg_or_0_operand (src, mode))
{
rtx reg;
now it appears that we successfully fix this issue, but some of the tests case with the -Os flag fails.
Given following program:
int load1r (int *array)
{
int a = 0;
a += array[200];
a += array[201];
a += array[202];
a += array[203];
return a;
}
with pass_shorten_memrefs defined in gcc/config/riscv/riscv-passes.def, when compiled with
gcc main.c -S -Os
supposedly we should get
load1r: addi a5,a0,768 lw a4,36(a5) lw a0,32(a5) addw a0,a0,a4 lw a4,40(a5) addw a4,a4,a0 lw a0,44(a5) addw a0,a0,a4 ret
the option -Os should be able to recognize that 4 sequential access to ram and make it in a form of Base+Offset (768+32) (768+36)… to reduce code size. But instead, we get:
load1r: lwu a4,800(a0) lwu a5,804(a0) addw a5,a5,a4 lwu a4,808(a0) lwu a0,812(a0) addw a5,a5,a4 addw a0,a5,a0 ret
So it looks like to me that shorten_memrefs pass fails to recognize the pattern with zero/sign extend, additional change to that pass needs to be done:
--- a/gcc/config/riscv/riscv-shorten-memrefs.c
+++ b/gcc/config/riscv/riscv-shorten-memrefs.c
@@ -75,12 +75,19 @@ private:
regno_map * analyze (basic_block bb);
void transform (regno_map *m, basic_block bb);
- bool get_si_mem_base_reg (rtx mem, rtx *addr);
+ bool get_si_mem_base_reg (rtx mem, rtx *addr, bool *extend);
}; // class pass_shorten_memrefs
bool
-pass_shorten_memrefs::get_si_mem_base_reg (rtx mem, rtx *addr)
+pass_shorten_memrefs::get_si_mem_base_reg (rtx mem, rtx *addr, bool *extend)
{
+ /* Whether it's sign/zero extended. */
+ if (GET_CODE (mem) == ZERO_EXTEND || GET_CODE (mem) == SIGN_EXTEND)
+ {
+ *extend = true;
+ mem = XEXP (mem, 0);
+ }
+
if (!MEM_P (mem) || GET_MODE (mem) != SImode)
return false;
*addr = XEXP (mem, 0);
@@ -110,7 +117,8 @@ pass_shorten_memrefs::analyze (basic_block bb)
{
rtx mem = XEXP (pat, i);
rtx addr;
- if (get_si_mem_base_reg (mem, &addr))
+ bool extend = false;
+ if (get_si_mem_base_reg (mem, &addr, &extend))
{
HOST_WIDE_INT regno = REGNO (XEXP (addr, 0));
/* Do not count store zero as these cannot be compressed. */
@@ -150,7 +158,8 @@ pass_shorten_memrefs::transform (regno_map *m, basic_block bb)
{
rtx mem = XEXP (pat, i);
rtx addr;
- if (get_si_mem_base_reg (mem, &addr))
+ bool extend = false;
+ if (get_si_mem_base_reg (mem, &addr, &extend))
{
HOST_WIDE_INT regno = REGNO (XEXP (addr, 0));
/* Do not transform store zero as these cannot be compressed. */
@@ -161,9 +170,20 @@ pass_shorten_memrefs::transform (regno_map *m, basic_block bb)
}
if (m->get_or_insert (regno) > 3)
{
- addr
- = targetm.legitimize_address (addr, addr, GET_MODE (mem));
- XEXP (pat, i) = replace_equiv_address (mem, addr);
+ if (extend)
+ {
+ addr
+ = targetm.legitimize_address (addr, addr,
+ GET_MODE (XEXP (mem, 0)));
+ XEXP (XEXP (pat, i), 0)
+ = replace_equiv_address (XEXP (mem, 0), addr);
+ }
+ else
+ {
+ addr = targetm.legitimize_address (addr, addr,
+ GET_MODE (mem));
+ XEXP (pat, i) = replace_equiv_address (mem, addr);
+ }
df_insn_rescan (insn);
}
}
However there are still side effects in other passes of gcc backend. The combine pass refuses to do a 2-2 insn combine due to the extra zero/sign extend. My solution is to change that pass:
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -2635,6 +2635,22 @@ is_just_move (rtx x)
return (GET_CODE (x) == SET && general_operand (SET_SRC (x), VOIDmode));
}
+/* Return whether X is just a single set, with the source
+ a sign/zero extended general_operand. */
+static bool
+is_just_extended_move (rtx x)
+{
+ if (INSN_P (x))
+ x = PATTERN (x);
+
+ if(GET_CODE (x) == SET)
+ if (GET_CODE (SET_SRC (x)) == ZERO_EXTEND
+ || GET_CODE (SET_SRC (x)) == SIGN_EXTEND)
+ return (general_operand ((XEXP (SET_SRC (x), 0)), VOIDmode));
+
+ return (GET_CODE (x) == SET && general_operand (SET_SRC (x), VOIDmode));
+}
+
/* Callback function to count autoincs. */
static int
@@ -3103,8 +3119,8 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
}
/* Record whether i2 and i3 are trivial moves. */
- i2_was_move = is_just_move (i2);
- i3_was_move = is_just_move (i3);
+ i2_was_move = is_just_move (i2) || is_just_extended_move(i2);
+ i3_was_move = is_just_move (i3) || is_just_extended_move(i3);
/* Record whether I2DEST is used in I2SRC and similarly for the other
cases. Knowing this will help in register status updating below. */
But it will affect all other backend, So Jim proposed the solution:
The combine change is inconvenient. We can’t do that in stage3, and it means we need to make sure that this doesn’t break other targets.
If the combine change is a good idea, then I think you can just modify is_just_move rather than add a new function. Just skip over a ZERO_EXTEND or SIGN_EXTEND before the the general_operand check. We might need a mode check against UNITS_PER_WORD since extending past the word size is not necessarily a simple move.
So the problem stems from the code in combine that is willing to do a 2->2 split if neither original instruction is a simple move. When we add a SIGN_EXTEND or ZERO_EXTEND they aren’t considered simple moves anymore.
There is still the question of why the instruction cost allows the change. First I noticed that riscv_address_cost has a hook to check for shorten_memrefs but that riscv_rtx_costs isn’t calling it. It uses riscv_address_insns instead. So it seems like adding a shorten_memref check to the MEM case riscv_rtx_costs might solve the problem. But riscv_compressed_lw_address_p has a !reload_completed check, and combine runs before reload, so that returns the same result for both the new and old insns. I think that reload_completed check isn’t quite right. If we have a pseudo-reg, then we should allow that, but if we have an offset that is clearly not compressible, then we should reject it before reload. So I think the reload_completed check should be moved down to where it checks for a compressible register. With those two changes, I can make the testcase work without a combine change. Though since I changed how shorten_memrefs works we need a check to make sure this patch doens’t change code size. I haven’t tried to do that yet.
With my changes, in the combine dump, I see Successfully matched this instruction:
(set (reg/f:DI 92)
(plus:DI (reg:DI 96)
(const_int 768 [0x300])))
Successfully matched this instruction:
(set (reg:DI 82 [ MEM[(intD.1 *)array_5(D) + 800B] ])
(sign_extend:DI (mem:SI (plus:DI (reg:DI 96)
(const_int 800 [0x320])) [1 MEM[(intD.1 *)array_5(D) + 800B]+0 S4 A32])))
rejecting combination of insns 27 and 6 original costs 4 + 16 = 20 replacement costs 4 + 20 = 24
So Jim’s change:
--- a/gcc/config/riscv/riscv.c
+++ b/gcc/config/riscv/riscv.c
@@ -891,17 +891,13 @@ riscv_compressed_lw_address_p (rtx x)
bool result = riscv_classify_address (&addr, x, GET_MODE (x),
reload_completed);
- /* Before reload, assuming all load/stores of valid addresses get compressed
- gives better code size than checking if the address is reg + small_offset
- early on. */
- if (result && !reload_completed)
- return true;
-
/* Return false if address is not compressed_reg + small_offset. */
if (!result
|| addr.type != ADDRESS_REG
- || (!riscv_compressed_reg_p (REGNO (addr.reg))
- && addr.reg != stack_pointer_rtx)
+ /* Before reload, assume all registers are OK. */
+ || (reload_completed
+ && !riscv_compressed_reg_p (REGNO (addr.reg))
+ && addr.reg != stack_pointer_rtx)
|| !riscv_compressed_lw_offset_p (addr.offset))
return false;
@@ -1708,6 +1704,13 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
instructions it needs. */
if ((cost = riscv_address_insns (XEXP (x, 0), mode, true)) > 0)
{
+ /* When optimizing for size, make uncompressible 32-bit addresses
+ more expensive so that compressible 32-bit addresses are
+ preferred. */
+ if (TARGET_RVC && !speed && riscv_mshorten_memrefs && mode == SImode
+ && !riscv_compressed_lw_address_p (XEXP (x, 0)))
+ cost++;
+
*total = COSTS_N_INSNS (cost + tune_param->memory_cost);
return true;
}
Here are 2 final commits to gcc master branch:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=a4953810bac524e19126a2745c75fed58db962c2
https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=18fabc35f47f0abf4ec14d147098ec4e0734d2a3
