--- /dev/null
+From fd37da0d586c331b0008fbfd653a9659344fe76f Mon Sep 17 00:00:00 2001
+From: Mike Pall <mike>
+Date: Wed, 26 Jul 2017 09:52:19 +0200
+Subject: [PATCH] PPC: Add soft-float support to interpreter.
+
+Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+Sponsored by Cisco Systems, Inc.
+---
+ src/host/buildvm_asm.c |    2 +-
+ src/lj_arch.h          |   29 +-
+ src/lj_ccall.c         |   38 +-
+ src/lj_ccall.h         |    4 +-
+ src/lj_ccallback.c     |   30 +-
+ src/lj_frame.h         |    2 +-
+ src/lj_ircall.h        |    2 +-
+ src/vm_ppc.dasc        | 1249 +++++++++++++++++++++++++++++++++-------
+ 8 files changed, 1101 insertions(+), 255 deletions(-)
+
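+The interpreter changes below follow one pattern: with LJ_ABI_SOFTFP, doubles
+are kept in GPR pairs instead of FPRs, and FP arithmetic/comparisons are routed
+to the libgcc soft-float helpers that the lj_ircall.h hunk maps for PPC (see
+the softfp_add/__adddf3 mapping and the "blex __ledf2" calls in vm_ppc.dasc).
+A minimal C sketch of the helper contract follows; num_le is a hypothetical
+name used only for illustration and is not part of this patch:
+
+  extern double __adddf3(double, double);  /* a + b on soft-float targets */
+  extern double __muldf3(double, double);  /* a * b */
+  extern int    __ledf2(double, double);   /* <= 0 iff a <= b and no NaN */
+
+  static int num_le(double a, double b)
+  {
+    /* Same test the VM performs with "blex __ledf2; cmpwi CRET1, 0". */
+    return __ledf2(a, b) <= 0;
+  }
+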
+--- a/src/host/buildvm_asm.c
++++ b/src/host/buildvm_asm.c
+@@ -338,7 +338,7 @@ void emit_asm(BuildCtx *ctx)
+ #if !(LJ_TARGET_PS3 || LJ_TARGET_PSVITA)
+ fprintf(ctx->fp, "\t.section .note.GNU-stack,\"\"," ELFASM_PX "progbits\n");
+ #endif
+-#if LJ_TARGET_PPC && !LJ_TARGET_PS3
++#if LJ_TARGET_PPC && !LJ_TARGET_PS3 && !LJ_ABI_SOFTFP
+ /* Hard-float ABI. */
+ fprintf(ctx->fp, "\t.gnu_attribute 4, 1\n");
+ #endif
+--- a/src/lj_arch.h
++++ b/src/lj_arch.h
+@@ -254,6 +254,29 @@
+ #else
+ #define LJ_ARCH_BITS 32
+ #define LJ_ARCH_NAME "ppc"
++
++#if !defined(LJ_ARCH_HASFPU)
++#if defined(_SOFT_FLOAT) || defined(_SOFT_DOUBLE)
++#define LJ_ARCH_HASFPU 0
++#else
++#define LJ_ARCH_HASFPU 1
++#endif
++#endif
++
++#if !defined(LJ_ABI_SOFTFP)
++#if defined(_SOFT_FLOAT) || defined(_SOFT_DOUBLE)
++#define LJ_ABI_SOFTFP 1
++#else
++#define LJ_ABI_SOFTFP 0
++#endif
++#endif
++#endif
++
++#if LJ_ABI_SOFTFP
++#define LJ_ARCH_NOJIT 1 /* NYI */
++#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL
++#else
++#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL_SINGLE
+ #endif
+
+ #define LJ_TARGET_PPC 1
+@@ -262,7 +285,6 @@
+ #define LJ_TARGET_MASKSHIFT 0
+ #define LJ_TARGET_MASKROT 1
+ #define LJ_TARGET_UNIFYROT 1 /* Want only IR_BROL. */
+-#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL_SINGLE
+
+ #if LJ_TARGET_CONSOLE
+ #define LJ_ARCH_PPC32ON64 1
+@@ -415,16 +437,13 @@
+ #error "No support for ILP32 model on ARM64"
+ #endif
+ #elif LJ_TARGET_PPC
+-#if defined(_SOFT_FLOAT) || defined(_SOFT_DOUBLE)
+-#error "No support for PowerPC CPUs without double-precision FPU"
+-#endif
+ #if !LJ_ARCH_PPC64 && LJ_ARCH_ENDIAN == LUAJIT_LE
+ #error "No support for little-endian PPC32"
+ #endif
+ #if LJ_ARCH_PPC64
+ #error "No support for PowerPC 64 bit mode (yet)"
+ #endif
+-#ifdef __NO_FPRS__
++#if defined(__NO_FPRS__) && !defined(_SOFT_FLOAT)
+ #error "No support for PPC/e500 anymore (use LuaJIT 2.0)"
+ #endif
+ #elif LJ_TARGET_MIPS32
+--- a/src/lj_ccall.c
++++ b/src/lj_ccall.c
+@@ -387,6 +387,24 @@
+ #define CCALL_HANDLE_COMPLEXARG \
+ /* Pass complex by value in 2 or 4 GPRs. */
+
++#define CCALL_HANDLE_GPR \
++ /* Try to pass argument in GPRs. */ \
++ if (n > 1) { \
++ lua_assert(n == 2 || n == 4); /* int64_t or complex (float). */ \
++ if (ctype_isinteger(d->info) || ctype_isfp(d->info)) \
++ ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \
++ else if (ngpr + n > maxgpr) \
++ ngpr = maxgpr; /* Prevent reordering. */ \
++ } \
++ if (ngpr + n <= maxgpr) { \
++ dp = &cc->gpr[ngpr]; \
++ ngpr += n; \
++ goto done; \
++ } \
++
++#if LJ_ABI_SOFTFP
++#define CCALL_HANDLE_REGARG CCALL_HANDLE_GPR
++#else
+ #define CCALL_HANDLE_REGARG \
+ if (isfp) { /* Try to pass argument in FPRs. */ \
+ if (nfpr + 1 <= CCALL_NARG_FPR) { \
+@@ -395,24 +413,16 @@
+ d = ctype_get(cts, CTID_DOUBLE); /* FPRs always hold doubles. */ \
+ goto done; \
+ } \
+- } else { /* Try to pass argument in GPRs. */ \
+- if (n > 1) { \
+- lua_assert(n == 2 || n == 4); /* int64_t or complex (float). */ \
+- if (ctype_isinteger(d->info)) \
+- ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \
+- else if (ngpr + n > maxgpr) \
+- ngpr = maxgpr; /* Prevent reordering. */ \
+- } \
+- if (ngpr + n <= maxgpr) { \
+- dp = &cc->gpr[ngpr]; \
+- ngpr += n; \
+- goto done; \
+- } \
++ } else { \
++ CCALL_HANDLE_GPR \
+ }
++#endif
+
++#if !LJ_ABI_SOFTFP
+ #define CCALL_HANDLE_RET \
+ if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \
+ ctr = ctype_get(cts, CTID_DOUBLE); /* FPRs always hold doubles. */
++#endif
+
+ #elif LJ_TARGET_MIPS32
+ /* -- MIPS o32 calling conventions ---------------------------------------- */
+@@ -1080,7 +1090,7 @@ static int ccall_set_args(lua_State *L,
+ }
+ if (fid) lj_err_caller(L, LJ_ERR_FFI_NUMARG); /* Too few arguments. */
+
+-#if LJ_TARGET_X64 || LJ_TARGET_PPC
++#if LJ_TARGET_X64 || (LJ_TARGET_PPC && !LJ_ABI_SOFTFP)
+ cc->nfpr = nfpr; /* Required for vararg functions. */
+ #endif
+ cc->nsp = nsp;
+--- a/src/lj_ccall.h
++++ b/src/lj_ccall.h
+@@ -86,9 +86,9 @@ typedef union FPRArg {
+ #elif LJ_TARGET_PPC
+
+ #define CCALL_NARG_GPR 8
+-#define CCALL_NARG_FPR 8
++#define CCALL_NARG_FPR (LJ_ABI_SOFTFP ? 0 : 8)
+ #define CCALL_NRET_GPR 4 /* For complex double. */
+-#define CCALL_NRET_FPR 1
++#define CCALL_NRET_FPR (LJ_ABI_SOFTFP ? 0 : 1)
+ #define CCALL_SPS_EXTRA 4
+ #define CCALL_SPS_FREE 0
+
+--- a/src/lj_ccallback.c
++++ b/src/lj_ccallback.c
+@@ -419,6 +419,23 @@ void lj_ccallback_mcode_free(CTState *ct
+
+ #elif LJ_TARGET_PPC
+
++#define CALLBACK_HANDLE_GPR \
++ if (n > 1) { \
++ lua_assert(((LJ_ABI_SOFTFP && ctype_isnum(cta->info)) || /* double. */ \
++ ctype_isinteger(cta->info)) && n == 2); /* int64_t. */ \
++ ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \
++ } \
++ if (ngpr + n <= maxgpr) { \
++ sp = &cts->cb.gpr[ngpr]; \
++ ngpr += n; \
++ goto done; \
++ }
++
++#if LJ_ABI_SOFTFP
++#define CALLBACK_HANDLE_REGARG \
++ CALLBACK_HANDLE_GPR \
++ UNUSED(isfp);
++#else
+ #define CALLBACK_HANDLE_REGARG \
+ if (isfp) { \
+ if (nfpr + 1 <= CCALL_NARG_FPR) { \
+@@ -427,20 +444,15 @@ void lj_ccallback_mcode_free(CTState *ct
+ goto done; \
+ } \
+ } else { /* Try to pass argument in GPRs. */ \
+- if (n > 1) { \
+- lua_assert(ctype_isinteger(cta->info) && n == 2); /* int64_t. */ \
+- ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \
+- } \
+- if (ngpr + n <= maxgpr) { \
+- sp = &cts->cb.gpr[ngpr]; \
+- ngpr += n; \
+- goto done; \
+- } \
++ CALLBACK_HANDLE_GPR \
+ }
++#endif
+
++#if !LJ_ABI_SOFTFP
+ #define CALLBACK_HANDLE_RET \
+ if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \
+ *(double *)dp = *(float *)dp; /* FPRs always hold doubles. */
++#endif
+
+ #elif LJ_TARGET_MIPS32
+
+--- a/src/lj_frame.h
++++ b/src/lj_frame.h
+@@ -226,7 +226,7 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CAL
+ #define CFRAME_OFS_L 36
+ #define CFRAME_OFS_PC 32
+ #define CFRAME_OFS_MULTRES 28
+-#define CFRAME_SIZE 272
++#define CFRAME_SIZE (LJ_ARCH_HASFPU ? 272 : 128)
+ #define CFRAME_SHIFT_MULTRES 3
+ #endif
+ #elif LJ_TARGET_MIPS32
+--- a/src/lj_ircall.h
++++ b/src/lj_ircall.h
+@@ -272,7 +272,7 @@ LJ_DATA const CCallInfo lj_ir_callinfo[I
+ #define fp64_f2l __aeabi_f2lz
+ #define fp64_f2ul __aeabi_f2ulz
+ #endif
+-#elif LJ_TARGET_MIPS
++#elif LJ_TARGET_MIPS || LJ_TARGET_PPC
+ #define softfp_add __adddf3
+ #define softfp_sub __subdf3
+ #define softfp_mul __muldf3
+--- a/src/vm_ppc.dasc
++++ b/src/vm_ppc.dasc
+@@ -103,6 +103,18 @@
+ |// Fixed register assignments for the interpreter.
+ |// Don't use: r1 = sp, r2 and r13 = reserved (TOC, TLS or SDATA)
+ |
++|.macro .FPU, a, b
++|.if FPU
++| a, b
++|.endif
++|.endmacro
++|
++|.macro .FPU, a, b, c
++|.if FPU
++| a, b, c
++|.endif
++|.endmacro
++|
+ |// The following must be C callee-save (but BASE is often refetched).
+ |.define BASE, r14 // Base of current Lua stack frame.
+ |.define KBASE, r15 // Constants of current Lua function.
+@@ -116,8 +128,10 @@
+ |.define TISNUM, r22
+ |.define TISNIL, r23
+ |.define ZERO, r24
++|.if FPU
+ |.define TOBIT, f30 // 2^52 + 2^51.
+ |.define TONUM, f31 // 2^52 + 2^51 + 2^31.
++|.endif
+ |
+ |// The following temporaries are not saved across C calls, except for RA.
+ |.define RA, r20 // Callee-save.
+@@ -133,6 +147,7 @@
+ |
+ |// Saved temporaries.
+ |.define SAVE0, r21
++|.define SAVE1, r25
+ |
+ |// Calling conventions.
+ |.define CARG1, r3
+@@ -141,8 +156,10 @@
+ |.define CARG4, r6 // Overlaps TMP3.
+ |.define CARG5, r7 // Overlaps INS.
+ |
++|.if FPU
+ |.define FARG1, f1
+ |.define FARG2, f2
++|.endif
+ |
+ |.define CRET1, r3
+ |.define CRET2, r4
+@@ -213,10 +230,16 @@
+ |.endif
+ |.else
+ |
++|.if FPU
+ |.define SAVE_LR, 276(sp)
+ |.define CFRAME_SPACE, 272 // Delta for sp.
+ |// Back chain for sp: 272(sp) <-- sp entering interpreter
+ |.define SAVE_FPR_, 128 // .. 128+18*8: 64 bit FPR saves.
++|.else
++|.define SAVE_LR, 132(sp)
++|.define CFRAME_SPACE, 128 // Delta for sp.
++|// Back chain for sp: 128(sp) <-- sp entering interpreter
++|.endif
+ |.define SAVE_GPR_, 56 // .. 56+18*4: 32 bit GPR saves.
+ |.define SAVE_CR, 52(sp) // 32 bit CR save.
+ |.define SAVE_ERRF, 48(sp) // 32 bit C frame info.
+@@ -226,16 +249,25 @@
+ |.define SAVE_PC, 32(sp)
+ |.define SAVE_MULTRES, 28(sp)
+ |.define UNUSED1, 24(sp)
++|.if FPU
+ |.define TMPD_LO, 20(sp)
+ |.define TMPD_HI, 16(sp)
+ |.define TONUM_LO, 12(sp)
+ |.define TONUM_HI, 8(sp)
++|.else
++|.define SFSAVE_4, 20(sp)
++|.define SFSAVE_3, 16(sp)
++|.define SFSAVE_2, 12(sp)
++|.define SFSAVE_1, 8(sp)
++|.endif
+ |// Next frame lr: 4(sp)
+ |// Back chain for sp: 0(sp) <-- sp while in interpreter
+ |
++|.if FPU
+ |.define TMPD_BLO, 23(sp)
+ |.define TMPD, TMPD_HI
+ |.define TONUM_D, TONUM_HI
++|.endif
+ |
+ |.endif
+ |
+@@ -245,7 +277,7 @@
+ |.else
+ | stw r..reg, SAVE_GPR_+(reg-14)*4(sp)
+ |.endif
+-| stfd f..reg, SAVE_FPR_+(reg-14)*8(sp)
++| .FPU stfd f..reg, SAVE_FPR_+(reg-14)*8(sp)
+ |.endmacro
+ |.macro rest_, reg
+ |.if GPR64
+@@ -253,7 +285,7 @@
+ |.else
+ | lwz r..reg, SAVE_GPR_+(reg-14)*4(sp)
+ |.endif
+-| lfd f..reg, SAVE_FPR_+(reg-14)*8(sp)
++| .FPU lfd f..reg, SAVE_FPR_+(reg-14)*8(sp)
+ |.endmacro
+ |
+ |.macro saveregs
+@@ -323,6 +355,7 @@
+ |// Trap for not-yet-implemented parts.
+ |.macro NYI; tw 4, sp, sp; .endmacro
+ |
++|.if FPU
+ |// int/FP conversions.
+ |.macro tonum_i, freg, reg
+ | xoris reg, reg, 0x8000
+@@ -346,6 +379,7 @@
+ |.macro toint, reg, freg
+ | toint reg, freg, freg
+ |.endmacro
++|.endif
+ |
+ |//-----------------------------------------------------------------------
+ |
+@@ -533,9 +567,19 @@ static void build_subroutines(BuildCtx *
+ | beq >2
+ |1:
+ | addic. TMP1, TMP1, -8
++ |.if FPU
+ | lfd f0, 0(RA)
++ |.else
++ | lwz CARG1, 0(RA)
++ | lwz CARG2, 4(RA)
++ |.endif
+ | addi RA, RA, 8
++ |.if FPU
+ | stfd f0, 0(BASE)
++ |.else
++ | stw CARG1, 0(BASE)
++ | stw CARG2, 4(BASE)
++ |.endif
+ | addi BASE, BASE, 8
+ | bney <1
+ |
+@@ -613,23 +657,23 @@ static void build_subroutines(BuildCtx *
+ | .toc ld TOCREG, SAVE_TOC
+ | li TISNUM, LJ_TISNUM // Setup type comparison constants.
+ | lp BASE, L->base
+- | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
++ | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
+ | lwz DISPATCH, L->glref // Setup pointer to dispatch table.
+ | li ZERO, 0
+- | stw TMP3, TMPD
++ | .FPU stw TMP3, TMPD
+ | li TMP1, LJ_TFALSE
+- | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float).
++ | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float).
+ | li TISNIL, LJ_TNIL
+ | li_vmstate INTERP
+- | lfs TOBIT, TMPD
++ | .FPU lfs TOBIT, TMPD
+ | lwz PC, FRAME_PC(BASE) // Fetch PC of previous frame.
+ | la RA, -8(BASE) // Results start at BASE-8.
+- | stw TMP3, TMPD
++ | .FPU stw TMP3, TMPD
+ | addi DISPATCH, DISPATCH, GG_G2DISP
+ | stw TMP1, 0(RA) // Prepend false to error message.
+ | li RD, 16 // 2 results: false + error message.
+ | st_vmstate
+- | lfs TONUM, TMPD
++ | .FPU lfs TONUM, TMPD
+ | b ->vm_returnc
+ |
+ |//-----------------------------------------------------------------------
+@@ -690,22 +734,22 @@ static void build_subroutines(BuildCtx *
+ | li TISNUM, LJ_TISNUM // Setup type comparison constants.
+ | lp TMP1, L->top
+ | lwz PC, FRAME_PC(BASE)
+- | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
++ | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
+ | stb CARG3, L->status
+- | stw TMP3, TMPD
+- | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float).
+- | lfs TOBIT, TMPD
++ | .FPU stw TMP3, TMPD
++ | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float).
++ | .FPU lfs TOBIT, TMPD
+ | sub RD, TMP1, BASE
+- | stw TMP3, TMPD
+- | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double)
++ | .FPU stw TMP3, TMPD
++ | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double)
+ | addi RD, RD, 8
+- | stw TMP0, TONUM_HI
++ | .FPU stw TMP0, TONUM_HI
+ | li_vmstate INTERP
+ | li ZERO, 0
+ | st_vmstate
+ | andix. TMP0, PC, FRAME_TYPE
+ | mr MULTRES, RD
+- | lfs TONUM, TMPD
++ | .FPU lfs TONUM, TMPD
+ | li TISNIL, LJ_TNIL
+ | beq ->BC_RET_Z
+ | b ->vm_return
+@@ -739,19 +783,19 @@ static void build_subroutines(BuildCtx *
+ | lp TMP2, L->base // TMP2 = old base (used in vmeta_call).
+ | li TISNUM, LJ_TISNUM // Setup type comparison constants.
+ | lp TMP1, L->top
+- | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
++ | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
+ | add PC, PC, BASE
+- | stw TMP3, TMPD
++ | .FPU stw TMP3, TMPD
+ | li ZERO, 0
+- | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float).
+- | lfs TOBIT, TMPD
++ | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float).
++ | .FPU lfs TOBIT, TMPD
+ | sub PC, PC, TMP2 // PC = frame delta + frame type
+- | stw TMP3, TMPD
+- | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double)
++ | .FPU stw TMP3, TMPD
++ | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double)
+ | sub NARGS8:RC, TMP1, BASE
+- | stw TMP0, TONUM_HI
++ | .FPU stw TMP0, TONUM_HI
+ | li_vmstate INTERP
+- | lfs TONUM, TMPD
++ | .FPU lfs TONUM, TMPD
+ | li TISNIL, LJ_TNIL
+ | st_vmstate
+ |
+@@ -839,15 +883,30 @@ static void build_subroutines(BuildCtx *
+ | lwz INS, -4(PC)
+ | subi CARG2, RB, 16
+ | decode_RB8 SAVE0, INS
++ |.if FPU
+ | lfd f0, 0(RA)
++ |.else
++ | lwz TMP2, 0(RA)
++ | lwz TMP3, 4(RA)
++ |.endif
+ | add TMP1, BASE, SAVE0
+ | stp BASE, L->base
+ | cmplw TMP1, CARG2
+ | sub CARG3, CARG2, TMP1
+ | decode_RA8 RA, INS
++ |.if FPU
+ | stfd f0, 0(CARG2)
++ |.else
++ | stw TMP2, 0(CARG2)
++ | stw TMP3, 4(CARG2)
++ |.endif
+ | bney ->BC_CAT_Z
++ |.if FPU
+ | stfdx f0, BASE, RA
++ |.else
++ | stwux TMP2, RA, BASE
++ | stw TMP3, 4(RA)
++ |.endif
+ | b ->cont_nop
+ |
+ |//-- Table indexing metamethods -----------------------------------------
+@@ -900,9 +959,19 @@ static void build_subroutines(BuildCtx *
+ | // Returns TValue * (finished) or NULL (metamethod).
+ | cmplwi CRET1, 0
+ | beq >3
++ |.if FPU
+ | lfd f0, 0(CRET1)
++ |.else
++ | lwz TMP0, 0(CRET1)
++ | lwz TMP1, 4(CRET1)
++ |.endif
+ | ins_next1
++ |.if FPU
+ | stfdx f0, BASE, RA
++ |.else
++ | stwux TMP0, RA, BASE
++ | stw TMP1, 4(RA)
++ |.endif
+ | ins_next2
+ |
+ |3: // Call __index metamethod.
+@@ -920,7 +989,12 @@ static void build_subroutines(BuildCtx *
+ | // Returns cTValue * or NULL.
+ | cmplwi CRET1, 0
+ | beq >1
++ |.if FPU
+ | lfd f14, 0(CRET1)
++ |.else
++ | lwz SAVE0, 0(CRET1)
++ | lwz SAVE1, 4(CRET1)
++ |.endif
+ | b ->BC_TGETR_Z
+ |1:
+ | stwx TISNIL, BASE, RA
+@@ -975,11 +1049,21 @@ static void build_subroutines(BuildCtx *
+ | bl extern lj_meta_tset // (lua_State *L, TValue *o, TValue *k)
+ | // Returns TValue * (finished) or NULL (metamethod).
+ | cmplwi CRET1, 0
++ |.if FPU
+ | lfdx f0, BASE, RA
++ |.else
++ | lwzux TMP2, RA, BASE
++ | lwz TMP3, 4(RA)
++ |.endif
+ | beq >3
+ | // NOBARRIER: lj_meta_tset ensures the table is not black.
+ | ins_next1
++ |.if FPU
+ | stfd f0, 0(CRET1)
++ |.else
++ | stw TMP2, 0(CRET1)
++ | stw TMP3, 4(CRET1)
++ |.endif
+ | ins_next2
+ |
+ |3: // Call __newindex metamethod.
+@@ -990,7 +1074,12 @@ static void build_subroutines(BuildCtx *
+ | add PC, TMP1, BASE
+ | lwz LFUNC:RB, FRAME_FUNC(BASE) // Guaranteed to be a function here.
+ | li NARGS8:RC, 24 // 3 args for func(t, k, v)
++ |.if FPU
+ | stfd f0, 16(BASE) // Copy value to third argument.
++ |.else
++ | stw TMP2, 16(BASE)
++ | stw TMP3, 20(BASE)
++ |.endif
+ | b ->vm_call_dispatch_f
+ |
+ |->vmeta_tsetr:
+@@ -998,7 +1087,12 @@ static void build_subroutines(BuildCtx *
+ | stw PC, SAVE_PC
+ | bl extern lj_tab_setinth // (lua_State *L, GCtab *t, int32_t key)
+ | // Returns TValue *.
++ |.if FPU
+ | stfd f14, 0(CRET1)
++ |.else
++ | stw SAVE0, 0(CRET1)
++ | stw SAVE1, 4(CRET1)
++ |.endif
+ | b ->cont_nop
+ |
+ |//-- Comparison metamethods ---------------------------------------------
+@@ -1037,9 +1131,19 @@ static void build_subroutines(BuildCtx *
+ |
+ |->cont_ra: // RA = resultptr
+ | lwz INS, -4(PC)
++ |.if FPU
+ | lfd f0, 0(RA)
++ |.else
++ | lwz CARG1, 0(RA)
++ | lwz CARG2, 4(RA)
++ |.endif
+ | decode_RA8 TMP1, INS
++ |.if FPU
+ | stfdx f0, BASE, TMP1
++ |.else
++ | stwux CARG1, TMP1, BASE
++ | stw CARG2, 4(TMP1)
++ |.endif
+ | b ->cont_nop
+ |
+ |->cont_condt: // RA = resultptr
+@@ -1245,22 +1349,32 @@ static void build_subroutines(BuildCtx *
+ |.macro .ffunc_n, name
+ |->ff_ .. name:
+ | cmplwi NARGS8:RC, 8
+- | lwz CARG3, 0(BASE)
++ | lwz CARG1, 0(BASE)
++ |.if FPU
+ | lfd FARG1, 0(BASE)
++ |.else
++ | lwz CARG2, 4(BASE)
++ |.endif
+ | blt ->fff_fallback
+- | checknum CARG3; bge ->fff_fallback
++ | checknum CARG1; bge ->fff_fallback
+ |.endmacro
+ |
+ |.macro .ffunc_nn, name
+ |->ff_ .. name:
+ | cmplwi NARGS8:RC, 16
+- | lwz CARG3, 0(BASE)
++ | lwz CARG1, 0(BASE)
++ |.if FPU
+ | lfd FARG1, 0(BASE)
+- | lwz CARG4, 8(BASE)
++ | lwz CARG3, 8(BASE)
+ | lfd FARG2, 8(BASE)
++ |.else
++ | lwz CARG2, 4(BASE)
++ | lwz CARG3, 8(BASE)
++ | lwz CARG4, 12(BASE)
++ |.endif
+ | blt ->fff_fallback
++ | checknum CARG1; bge ->fff_fallback
+ | checknum CARG3; bge ->fff_fallback
+- | checknum CARG4; bge ->fff_fallback
+ |.endmacro
+ |
+ |// Inlined GC threshold check. Caveat: uses TMP0 and TMP1.
+@@ -1281,14 +1395,21 @@ static void build_subroutines(BuildCtx *
+ | bge cr1, ->fff_fallback
+ | stw CARG3, 0(RA)
+ | addi RD, NARGS8:RC, 8 // Compute (nresults+1)*8.
++ | addi TMP1, BASE, 8
++ | add TMP2, RA, NARGS8:RC
+ | stw CARG1, 4(RA)
+ | beq ->fff_res // Done if exactly 1 argument.
+- | li TMP1, 8
+- | subi RC, RC, 8
+ |1:
+- | cmplw TMP1, RC
+- | lfdx f0, BASE, TMP1
+- | stfdx f0, RA, TMP1
++ | cmplw TMP1, TMP2
++ |.if FPU
++ | lfd f0, 0(TMP1)
++ | stfd f0, 0(TMP1)
++ |.else
++ | lwz CARG1, 0(TMP1)
++ | lwz CARG2, 4(TMP1)
++ | stw CARG1, -8(TMP1)
++ | stw CARG2, -4(TMP1)
++ |.endif
+ | addi TMP1, TMP1, 8
+ | bney <1
+ | b ->fff_res
+@@ -1303,8 +1424,14 @@ static void build_subroutines(BuildCtx *
+ | orc TMP1, TMP2, TMP0
+ | addi TMP1, TMP1, ~LJ_TISNUM+1
+ | slwi TMP1, TMP1, 3
++ |.if FPU
+ | la TMP2, CFUNC:RB->upvalue
+ | lfdx FARG1, TMP2, TMP1
++ |.else
++ | add TMP1, CFUNC:RB, TMP1
++ | lwz CARG1, CFUNC:TMP1->upvalue[0].u32.hi
++ | lwz CARG2, CFUNC:TMP1->upvalue[0].u32.lo
++ |.endif
+ | b ->fff_resn
+ |
+ |//-- Base library: getters and setters ---------------------------------
+@@ -1382,7 +1509,12 @@ static void build_subroutines(BuildCtx *
+ | mr CARG1, L
+ | bl extern lj_tab_get // (lua_State *L, GCtab *t, cTValue *key)
+ | // Returns cTValue *.
++ |.if FPU
+ | lfd FARG1, 0(CRET1)
++ |.else
++ | lwz CARG2, 4(CRET1)
++ | lwz CARG1, 0(CRET1) // Caveat: CARG1 == CRET1.
++ |.endif
+ | b ->fff_resn
+ |
+ |//-- Base library: conversions ------------------------------------------
+@@ -1391,7 +1523,11 @@ static void build_subroutines(BuildCtx *
+ | // Only handles the number case inline (without a base argument).
+ | cmplwi NARGS8:RC, 8
+ | lwz CARG1, 0(BASE)
++ |.if FPU
+ | lfd FARG1, 0(BASE)
++ |.else
++ | lwz CARG2, 4(BASE)
++ |.endif
+ | bne ->fff_fallback // Exactly one argument.
+ | checknum CARG1; bgt ->fff_fallback
+ | b ->fff_resn
+@@ -1442,12 +1578,23 @@ static void build_subroutines(BuildCtx *
+ | cmplwi CRET1, 0
+ | li CARG3, LJ_TNIL
+ | beq ->fff_restv // End of traversal: return nil.
+- | lfd f0, 8(BASE) // Copy key and value to results.
+ | la RA, -8(BASE)
++ |.if FPU
++ | lfd f0, 8(BASE) // Copy key and value to results.
+ | lfd f1, 16(BASE)
+ | stfd f0, 0(RA)
+- | li RD, (2+1)*8
+ | stfd f1, 8(RA)
++ |.else
++ | lwz CARG1, 8(BASE)
++ | lwz CARG2, 12(BASE)
++ | lwz CARG3, 16(BASE)
++ | lwz CARG4, 20(BASE)
++ | stw CARG1, 0(RA)
++ | stw CARG2, 4(RA)
++ | stw CARG3, 8(RA)
++ | stw CARG4, 12(RA)
++ |.endif
++ | li RD, (2+1)*8
+ | b ->fff_res
+ |
+ |.ffunc_1 pairs
+@@ -1456,17 +1603,32 @@ static void build_subroutines(BuildCtx *
+ | bne ->fff_fallback
+ #if LJ_52
+ | lwz TAB:TMP2, TAB:CARG1->metatable
++ |.if FPU
+ | lfd f0, CFUNC:RB->upvalue[0]
++ |.else
++ | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi
++ | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo
++ |.endif
+ | cmplwi TAB:TMP2, 0
+ | la RA, -8(BASE)
+ | bne ->fff_fallback
+ #else
++ |.if FPU
+ | lfd f0, CFUNC:RB->upvalue[0]
++ |.else
++ | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi
++ | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo
++ |.endif
+ | la RA, -8(BASE)
+ #endif
+ | stw TISNIL, 8(BASE)
+ | li RD, (3+1)*8
++ |.if FPU
+ | stfd f0, 0(RA)
++ |.else
++ | stw TMP0, 0(RA)
++ | stw TMP1, 4(RA)
++ |.endif
+ | b ->fff_res
+ |
+ |.ffunc ipairs_aux
+@@ -1512,14 +1674,24 @@ static void build_subroutines(BuildCtx *
+ | stfd FARG2, 0(RA)
+ |.endif
+ | ble >2 // Not in array part?
++ |.if FPU
+ | lwzx TMP2, TMP1, TMP3
+ | lfdx f0, TMP1, TMP3
++ |.else
++ | lwzux TMP2, TMP1, TMP3
++ | lwz TMP3, 4(TMP1)
++ |.endif
+ |1:
+ | checknil TMP2
+ | li RD, (0+1)*8
+ | beq ->fff_res // End of iteration, return 0 results.
+ | li RD, (2+1)*8
++ |.if FPU
+ | stfd f0, 8(RA)
++ |.else
++ | stw TMP2, 8(RA)
++ | stw TMP3, 12(RA)
++ |.endif
+ | b ->fff_res
+ |2: // Check for empty hash part first. Otherwise call C function.
+ | lwz TMP0, TAB:CARG1->hmask
+@@ -1533,7 +1705,11 @@ static void build_subroutines(BuildCtx *
+ | li RD, (0+1)*8
+ | beq ->fff_res
+ | lwz TMP2, 0(CRET1)
++ |.if FPU
+ | lfd f0, 0(CRET1)
++ |.else
++ | lwz TMP3, 4(CRET1)
++ |.endif
+ | b <1
+ |
+ |.ffunc_1 ipairs
+@@ -1542,12 +1718,22 @@ static void build_subroutines(BuildCtx *
+ | bne ->fff_fallback
+ #if LJ_52
+ | lwz TAB:TMP2, TAB:CARG1->metatable
++ |.if FPU
+ | lfd f0, CFUNC:RB->upvalue[0]
++ |.else
++ | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi
++ | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo
++ |.endif
+ | cmplwi TAB:TMP2, 0
+ | la RA, -8(BASE)
+ | bne ->fff_fallback
+ #else
++ |.if FPU
+ | lfd f0, CFUNC:RB->upvalue[0]
++ |.else
++ | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi
++ | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo
++ |.endif
+ | la RA, -8(BASE)
+ #endif
+ |.if DUALNUM
+@@ -1557,7 +1743,12 @@ static void build_subroutines(BuildCtx *
+ |.endif
+ | stw ZERO, 12(BASE)
+ | li RD, (3+1)*8
++ |.if FPU
+ | stfd f0, 0(RA)
++ |.else
++ | stw TMP0, 0(RA)
++ | stw TMP1, 4(RA)
++ |.endif
+ | b ->fff_res
+ |
+ |//-- Base library: catch errors ----------------------------------------
+@@ -1576,19 +1767,32 @@ static void build_subroutines(BuildCtx *
+ |
+ |.ffunc xpcall
+ | cmplwi NARGS8:RC, 16
+- | lwz CARG4, 8(BASE)
++ | lwz CARG3, 8(BASE)
++ |.if FPU
+ | lfd FARG2, 8(BASE)
+ | lfd FARG1, 0(BASE)
++ |.else
++ | lwz CARG1, 0(BASE)
++ | lwz CARG2, 4(BASE)
++ | lwz CARG4, 12(BASE)
++ |.endif
+ | blt ->fff_fallback
+ | lbz TMP1, DISPATCH_GL(hookmask)(DISPATCH)
+ | mr TMP2, BASE
+- | checkfunc CARG4; bne ->fff_fallback // Traceback must be a function.
++ | checkfunc CARG3; bne ->fff_fallback // Traceback must be a function.
+ | la BASE, 16(BASE)
+ | // Remember active hook before pcall.
+ | rlwinm TMP1, TMP1, 32-HOOK_ACTIVE_SHIFT, 31, 31
++ |.if FPU
+ | stfd FARG2, 0(TMP2) // Swap function and traceback.
+- | subi NARGS8:RC, NARGS8:RC, 16
+ | stfd FARG1, 8(TMP2)
++ |.else
++ | stw CARG3, 0(TMP2)
++ | stw CARG4, 4(TMP2)
++ | stw CARG1, 8(TMP2)
++ | stw CARG2, 12(TMP2)
++ |.endif
++ | subi NARGS8:RC, NARGS8:RC, 16
+ | addi PC, TMP1, 16+FRAME_PCALL
+ | b ->vm_call_dispatch
+ |
+@@ -1631,9 +1835,21 @@ static void build_subroutines(BuildCtx *
+ | stp BASE, L->top
+ |2: // Move args to coroutine.
+ | cmpw TMP1, NARGS8:RC
++ |.if FPU
+ | lfdx f0, BASE, TMP1
++ |.else
++ | add CARG3, BASE, TMP1
++ | lwz TMP2, 0(CARG3)
++ | lwz TMP3, 4(CARG3)
++ |.endif
+ | beq >3
++ |.if FPU
+ | stfdx f0, CARG2, TMP1
++ |.else
++ | add CARG3, CARG2, TMP1
++ | stw TMP2, 0(CARG3)
++ | stw TMP3, 4(CARG3)
++ |.endif
+ | addi TMP1, TMP1, 8
+ | b <2
+ |3:
+@@ -1664,8 +1880,17 @@ static void build_subroutines(BuildCtx *
+ | stp TMP2, L:SAVE0->top // Clear coroutine stack.
+ |5: // Move results from coroutine.
+ | cmplw TMP1, TMP3
++ |.if FPU
+ | lfdx f0, TMP2, TMP1
+ | stfdx f0, BASE, TMP1
++ |.else
++ | add CARG3, TMP2, TMP1
++ | lwz CARG1, 0(CARG3)
++ | lwz CARG2, 4(CARG3)
++ | add CARG3, BASE, TMP1
++ | stw CARG1, 0(CARG3)
++ | stw CARG2, 4(CARG3)
++ |.endif
+ | addi TMP1, TMP1, 8
+ | bne <5
+ |6:
+@@ -1690,12 +1915,22 @@ static void build_subroutines(BuildCtx *
+ | andix. TMP0, PC, FRAME_TYPE
+ | la TMP3, -8(TMP3)
+ | li TMP1, LJ_TFALSE
++ |.if FPU
+ | lfd f0, 0(TMP3)
++ |.else
++ | lwz CARG1, 0(TMP3)
++ | lwz CARG2, 4(TMP3)
++ |.endif
+ | stp TMP3, L:SAVE0->top // Remove error from coroutine stack.
+ | li RD, (2+1)*8
+ | stw TMP1, -8(BASE) // Prepend false to results.
+ | la RA, -8(BASE)
++ |.if FPU
+ | stfd f0, 0(BASE) // Copy error message.
++ |.else
++ | stw CARG1, 0(BASE) // Copy error message.
++ | stw CARG2, 4(BASE)
++ |.endif
+ | b <7
+ |.else
+ | mr CARG1, L
+@@ -1874,7 +2109,12 @@ static void build_subroutines(BuildCtx *
+ | lus CARG1, 0x8000 // -(2^31).
+ | beqy ->fff_resi
+ |5:
++ |.if FPU
+ | lfd FARG1, 0(BASE)
++ |.else
++ | lwz CARG1, 0(BASE)
++ | lwz CARG2, 4(BASE)
++ |.endif
+ | blex func
+ | b ->fff_resn
+ |.endmacro
+@@ -1898,10 +2138,14 @@ static void build_subroutines(BuildCtx *
+ |
+ |.ffunc math_log
+ | cmplwi NARGS8:RC, 8
+- | lwz CARG3, 0(BASE)
+- | lfd FARG1, 0(BASE)
++ | lwz CARG1, 0(BASE)
+ | bne ->fff_fallback // Need exactly 1 argument.
+- | checknum CARG3; bge ->fff_fallback
++ | checknum CARG1; bge ->fff_fallback
++ |.if FPU
++ | lfd FARG1, 0(BASE)
++ |.else
++ | lwz CARG2, 4(BASE)
++ |.endif
+ | blex log
+ | b ->fff_resn
+ |
+@@ -1923,17 +2167,24 @@ static void build_subroutines(BuildCtx *
+ |.if DUALNUM
+ |.ffunc math_ldexp
+ | cmplwi NARGS8:RC, 16
+- | lwz CARG3, 0(BASE)
++ | lwz TMP0, 0(BASE)
++ |.if FPU
+ | lfd FARG1, 0(BASE)
+- | lwz CARG4, 8(BASE)
++ |.else
++ | lwz CARG1, 0(BASE)
++ | lwz CARG2, 4(BASE)
++ |.endif
++ | lwz TMP1, 8(BASE)
+ |.if GPR64
+ | lwz CARG2, 12(BASE)
+- |.else
++ |.elif FPU
+ | lwz CARG1, 12(BASE)
++ |.else
++ | lwz CARG3, 12(BASE)
+ |.endif
+ | blt ->fff_fallback
+- | checknum CARG3; bge ->fff_fallback
+- | checknum CARG4; bne ->fff_fallback
++ | checknum TMP0; bge ->fff_fallback
++ | checknum TMP1; bne ->fff_fallback
+ |.else
+ |.ffunc_nn math_ldexp
+ |.if GPR64
+@@ -1948,8 +2199,10 @@ static void build_subroutines(BuildCtx *
+ |.ffunc_n math_frexp
+ |.if GPR64
+ | la CARG2, DISPATCH_GL(tmptv)(DISPATCH)
+- |.else
++ |.elif FPU
+ | la CARG1, DISPATCH_GL(tmptv)(DISPATCH)
++ |.else
++ | la CARG3, DISPATCH_GL(tmptv)(DISPATCH)
+ |.endif
+ | lwz PC, FRAME_PC(BASE)
+ | blex frexp
+@@ -1958,7 +2211,12 @@ static void build_subroutines(BuildCtx *
+ |.if not DUALNUM
+ | tonum_i FARG2, TMP1
+ |.endif
++ |.if FPU
+ | stfd FARG1, 0(RA)
++ |.else
++ | stw CRET1, 0(RA)
++ | stw CRET2, 4(RA)
++ |.endif
+ | li RD, (2+1)*8
+ |.if DUALNUM
+ | stw TISNUM, 8(RA)
+@@ -1971,13 +2229,20 @@ static void build_subroutines(BuildCtx *
+ |.ffunc_n math_modf
+ |.if GPR64
+ | la CARG2, -8(BASE)
+- |.else
++ |.elif FPU
+ | la CARG1, -8(BASE)
++ |.else
++ | la CARG3, -8(BASE)
+ |.endif
+ | lwz PC, FRAME_PC(BASE)
+ | blex modf
+ | la RA, -8(BASE)
++ |.if FPU
+ | stfd FARG1, 0(BASE)
++ |.else
++ | stw CRET1, 0(BASE)
++ | stw CRET2, 4(BASE)
++ |.endif
+ | li RD, (2+1)*8
+ | b ->fff_res
+ |
+@@ -1985,13 +2250,13 @@ static void build_subroutines(BuildCtx *
+ |.if DUALNUM
+ | .ffunc_1 name
+ | checknum CARG3
+- | addi TMP1, BASE, 8
+- | add TMP2, BASE, NARGS8:RC
++ | addi SAVE0, BASE, 8
++ | add SAVE1, BASE, NARGS8:RC
+ | bne >4
+ |1: // Handle integers.
+- | lwz CARG4, 0(TMP1)
+- | cmplw cr1, TMP1, TMP2
+- | lwz CARG2, 4(TMP1)
++ | lwz CARG4, 0(SAVE0)
++ | cmplw cr1, SAVE0, SAVE1
++ | lwz CARG2, 4(SAVE0)
+ | bge cr1, ->fff_resi
+ | checknum CARG4
+ | xoris TMP0, CARG1, 0x8000
+@@ -2008,36 +2273,76 @@ static void build_subroutines(BuildCtx *
+ |.if GPR64
+ | rldicl CARG1, CARG1, 0, 32
+ |.endif
+- | addi TMP1, TMP1, 8
++ | addi SAVE0, SAVE0, 8
+ | b <1
+ |3:
+ | bge ->fff_fallback
+ | // Convert intermediate result to number and continue below.
++ |.if FPU
+ | tonum_i FARG1, CARG1
+- | lfd FARG2, 0(TMP1)
++ | lfd FARG2, 0(SAVE0)
++ |.else
++ | mr CARG2, CARG1
++ | bl ->vm_sfi2d_1
++ | lwz CARG3, 0(SAVE0)
++ | lwz CARG4, 4(SAVE0)
++ |.endif
+ | b >6
+ |4:
++ |.if FPU
+ | lfd FARG1, 0(BASE)
++ |.else
++ | lwz CARG1, 0(BASE)
++ | lwz CARG2, 4(BASE)
++ |.endif
+ | bge ->fff_fallback
+ |5: // Handle numbers.
+- | lwz CARG4, 0(TMP1)
+- | cmplw cr1, TMP1, TMP2
+- | lfd FARG2, 0(TMP1)
++ | lwz CARG3, 0(SAVE0)
++ | cmplw cr1, SAVE0, SAVE1
++ |.if FPU
++ | lfd FARG2, 0(SAVE0)
++ |.else
++ | lwz CARG4, 4(SAVE0)
++ |.endif
+ | bge cr1, ->fff_resn
+- | checknum CARG4; bge >7
++ | checknum CARG3; bge >7
+ |6:
++ | addi SAVE0, SAVE0, 8
++ |.if FPU
+ | fsub f0, FARG1, FARG2
+- | addi TMP1, TMP1, 8
+ |.if ismax
+ | fsel FARG1, f0, FARG1, FARG2
+ |.else
+ | fsel FARG1, f0, FARG2, FARG1
+ |.endif
++ |.else
++ | stw CARG1, SFSAVE_1
++ | stw CARG2, SFSAVE_2
++ | stw CARG3, SFSAVE_3
++ | stw CARG4, SFSAVE_4
++ | blex __ledf2
++ | cmpwi CRET1, 0
++ |.if ismax
++ | blt >8
++ |.else
++ | bge >8
++ |.endif
++ | lwz CARG1, SFSAVE_1
++ | lwz CARG2, SFSAVE_2
++ | b <5
++ |8:
++ | lwz CARG1, SFSAVE_3
++ | lwz CARG2, SFSAVE_4
++ |.endif
+ | b <5
+ |7: // Convert integer to number and continue above.
+- | lwz CARG2, 4(TMP1)
++ | lwz CARG3, 4(SAVE0)
+ | bne ->fff_fallback
+- | tonum_i FARG2, CARG2
++ |.if FPU
++ | tonum_i FARG2, CARG3
++ |.else
++ | bl ->vm_sfi2d_2
++ |.endif
+ | b <6
+ |.else
+ | .ffunc_n name
+@@ -2237,28 +2542,37 @@ static void build_subroutines(BuildCtx *
+ |
+ |.macro .ffunc_bit_op, name, ins
+ | .ffunc_bit name
+- | addi TMP1, BASE, 8
+- | add TMP2, BASE, NARGS8:RC
++ | addi SAVE0, BASE, 8
++ | add SAVE1, BASE, NARGS8:RC
+ |1:
+- | lwz CARG4, 0(TMP1)
+- | cmplw cr1, TMP1, TMP2
++ | lwz CARG4, 0(SAVE0)
++ | cmplw cr1, SAVE0, SAVE1
+ |.if DUALNUM
+- | lwz CARG2, 4(TMP1)
++ | lwz CARG2, 4(SAVE0)
+ |.else
+- | lfd FARG1, 0(TMP1)
++ | lfd FARG1, 0(SAVE0)
+ |.endif
+ | bgey cr1, ->fff_resi
+ | checknum CARG4
+ |.if DUALNUM
++ |.if FPU
+ | bnel ->fff_bitop_fb
+ |.else
++ | beq >3
++ | stw CARG1, SFSAVE_1
++ | bl ->fff_bitop_fb
++ | mr CARG2, CARG1
++ | lwz CARG1, SFSAVE_1
++ |3:
++ |.endif
++ |.else
+ | fadd FARG1, FARG1, TOBIT
+ | bge ->fff_fallback
+ | stfd FARG1, TMPD
+ | lwz CARG2, TMPD_LO
+ |.endif
+ | ins CARG1, CARG1, CARG2
+- | addi TMP1, TMP1, 8
++ | addi SAVE0, SAVE0, 8
+ | b <1
+ |.endmacro
+ |
+@@ -2280,7 +2594,14 @@ static void build_subroutines(BuildCtx *
+ |.macro .ffunc_bit_sh, name, ins, shmod
+ |.if DUALNUM
+ | .ffunc_2 bit_..name
++ |.if FPU
+ | checknum CARG3; bnel ->fff_tobit_fb
++ |.else
++ | checknum CARG3; beq >1
++ | bl ->fff_tobit_fb
++ | lwz CARG2, 12(BASE) // Conversion polluted CARG2.
++ |1:
++ |.endif
+ | // Note: no inline conversion from number for 2nd argument!
+ | checknum CARG4; bne ->fff_fallback
+ |.else
+@@ -2317,27 +2638,77 @@ static void build_subroutines(BuildCtx *
+ |->fff_resn:
+ | lwz PC, FRAME_PC(BASE)
+ | la RA, -8(BASE)
++ |.if FPU
+ | stfd FARG1, -8(BASE)
++ |.else
++ | stw CARG1, -8(BASE)
++ | stw CARG2, -4(BASE)
++ |.endif
+ | b ->fff_res1
+ |
+ |// Fallback FP number to bit conversion.
+ |->fff_tobit_fb:
+ |.if DUALNUM
++ |.if FPU
+ | lfd FARG1, 0(BASE)
+ | bgt ->fff_fallback
+ | fadd FARG1, FARG1, TOBIT
+ | stfd FARG1, TMPD
+ | lwz CARG1, TMPD_LO
+ | blr
++ |.else
++ | bgt ->fff_fallback
++ | mr CARG2, CARG1
++ | mr CARG1, CARG3
++ |// Modifies: CARG1, CARG2, TMP0, TMP1, TMP2.
++ |->vm_tobit:
++ | slwi TMP2, CARG1, 1
++ | addis TMP2, TMP2, 0x0020
++ | cmpwi TMP2, 0
++ | bge >2
++ | li TMP1, 0x3e0
++ | srawi TMP2, TMP2, 21
++ | not TMP1, TMP1
++ | sub. TMP2, TMP1, TMP2
++ | cmpwi cr7, CARG1, 0
++ | blt >1
++ | slwi TMP1, CARG1, 11
++ | srwi TMP0, CARG2, 21
++ | oris TMP1, TMP1, 0x8000
++ | or TMP1, TMP1, TMP0
++ | srw CARG1, TMP1, TMP2
++ | bclr 4, 28 // Return if cr7[lt] == 0, no hint.
++ | neg CARG1, CARG1
++ | blr
++ |1:
++ | addi TMP2, TMP2, 21
++ | srw TMP1, CARG2, TMP2
++ | slwi CARG2, CARG1, 12
++ | subfic TMP2, TMP2, 20
++ | slw TMP0, CARG2, TMP2
++ | or CARG1, TMP1, TMP0
++ | bclr 4, 28 // Return if cr7[lt] == 0, no hint.
++ | neg CARG1, CARG1
++ | blr
++ |2:
++ | li CARG1, 0
++ | blr
++ |.endif
+ |.endif
+ |->fff_bitop_fb:
+ |.if DUALNUM
+- | lfd FARG1, 0(TMP1)
++ |.if FPU
++ | lfd FARG1, 0(SAVE0)
+ | bgt ->fff_fallback
+ | fadd FARG1, FARG1, TOBIT
+ | stfd FARG1, TMPD
+ | lwz CARG2, TMPD_LO
+ | blr
++ |.else
++ | bgt ->fff_fallback
++ | mr CARG1, CARG4
++ | b ->vm_tobit
++ |.endif
+ |.endif
+ |
+ |//-----------------------------------------------------------------------
+@@ -2530,10 +2901,21 @@ static void build_subroutines(BuildCtx *
+ | decode_RA8 RC, INS // Call base.
+ | beq >2
+ |1: // Move results down.
++ |.if FPU
+ | lfd f0, 0(RA)
++ |.else
++ | lwz CARG1, 0(RA)
++ | lwz CARG2, 4(RA)
++ |.endif
+ | addic. TMP1, TMP1, -8
+ | addi RA, RA, 8
++ |.if FPU
+ | stfdx f0, BASE, RC
++ |.else
++ | add CARG3, BASE, RC
++ | stw CARG1, 0(CARG3)
++ | stw CARG2, 4(CARG3)
++ |.endif
+ | addi RC, RC, 8
+ | bne <1
+ |2:
+@@ -2586,10 +2968,12 @@ static void build_subroutines(BuildCtx *
+ |//-----------------------------------------------------------------------
+ |
+ |.macro savex_, a, b, c, d
++ |.if FPU
+ | stfd f..a, 16+a*8(sp)
+ | stfd f..b, 16+b*8(sp)
+ | stfd f..c, 16+c*8(sp)
+ | stfd f..d, 16+d*8(sp)
++ |.endif
+ |.endmacro
+ |
+ |->vm_exit_handler:
+@@ -2661,16 +3045,16 @@ static void build_subroutines(BuildCtx *
+ | lwz KBASE, PC2PROTO(k)(TMP1)
+ | // Setup type comparison constants.
+ | li TISNUM, LJ_TISNUM
+- | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
+- | stw TMP3, TMPD
++ | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
++ | .FPU stw TMP3, TMPD
+ | li ZERO, 0
+- | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float).
+- | lfs TOBIT, TMPD
+- | stw TMP3, TMPD
+- | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double)
++ | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float).
++ | .FPU lfs TOBIT, TMPD
++ | .FPU stw TMP3, TMPD
++ | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double)
+ | li TISNIL, LJ_TNIL
+- | stw TMP0, TONUM_HI
+- | lfs TONUM, TMPD
++ | .FPU stw TMP0, TONUM_HI
++ | .FPU lfs TONUM, TMPD
+ | // Modified copy of ins_next which handles function header dispatch, too.
+ | lwz INS, 0(PC)
+ | addi PC, PC, 4
+@@ -2715,7 +3099,35 @@ static void build_subroutines(BuildCtx *
+ |//-- Math helper functions ----------------------------------------------
+ |//-----------------------------------------------------------------------
+ |
+- |// NYI: Use internal implementations of floor, ceil, trunc.
++ |// NYI: Use internal implementations of floor, ceil, trunc, sfcmp.
++ |
++ |.macro sfi2d, AHI, ALO
++ |.if not FPU
++ | mr. AHI, ALO
++ | bclr 12, 2 // Handle zero first.
++ | srawi TMP0, ALO, 31
++ | xor TMP1, ALO, TMP0
++ | sub TMP1, TMP1, TMP0 // Absolute value in TMP1.
++ | cntlzw AHI, TMP1
++ | andix. TMP0, TMP0, 0x800 // Mask sign bit.
++ | slw TMP1, TMP1, AHI // Align mantissa left with leading 1.
++ | subfic AHI, AHI, 0x3ff+31-1 // Exponent -1 in AHI.
++ | slwi ALO, TMP1, 21
++ | or AHI, AHI, TMP0 // Sign | Exponent.
++ | srwi TMP1, TMP1, 11
++ | slwi AHI, AHI, 20 // Align left.
++ | add AHI, AHI, TMP1 // Add mantissa, increment exponent.
++ | blr
++ |.endif
++ |.endmacro
++ |
++ |// Input: CARG2. Output: CARG1, CARG2. Temporaries: TMP0, TMP1.
++ |->vm_sfi2d_1:
++ | sfi2d CARG1, CARG2
++ |
++ |// Input: CARG4. Output: CARG3, CARG4. Temporaries: TMP0, TMP1.
++ |->vm_sfi2d_2:
++ | sfi2d CARG3, CARG4
+ |
+ |->vm_modi:
+ | divwo. TMP0, CARG1, CARG2
+@@ -2783,21 +3195,21 @@ static void build_subroutines(BuildCtx *
+ | addi DISPATCH, r12, GG_G2DISP
+ | stw r11, CTSTATE->cb.slot
+ | stw r3, CTSTATE->cb.gpr[0]
+- | stfd f1, CTSTATE->cb.fpr[0]
++ | .FPU stfd f1, CTSTATE->cb.fpr[0]
+ | stw r4, CTSTATE->cb.gpr[1]
+- | stfd f2, CTSTATE->cb.fpr[1]
++ | .FPU stfd f2, CTSTATE->cb.fpr[1]
+ | stw r5, CTSTATE->cb.gpr[2]
+- | stfd f3, CTSTATE->cb.fpr[2]
++ | .FPU stfd f3, CTSTATE->cb.fpr[2]
+ | stw r6, CTSTATE->cb.gpr[3]
+- | stfd f4, CTSTATE->cb.fpr[3]
++ | .FPU stfd f4, CTSTATE->cb.fpr[3]
+ | stw r7, CTSTATE->cb.gpr[4]
+- | stfd f5, CTSTATE->cb.fpr[4]
++ | .FPU stfd f5, CTSTATE->cb.fpr[4]
+ | stw r8, CTSTATE->cb.gpr[5]
+- | stfd f6, CTSTATE->cb.fpr[5]
++ | .FPU stfd f6, CTSTATE->cb.fpr[5]
+ | stw r9, CTSTATE->cb.gpr[6]
+- | stfd f7, CTSTATE->cb.fpr[6]
++ | .FPU stfd f7, CTSTATE->cb.fpr[6]
+ | stw r10, CTSTATE->cb.gpr[7]
+- | stfd f8, CTSTATE->cb.fpr[7]
++ | .FPU stfd f8, CTSTATE->cb.fpr[7]
+ | addi TMP0, sp, CFRAME_SPACE+8
+ | stw TMP0, CTSTATE->cb.stack
+ | mr CARG1, CTSTATE
+@@ -2808,21 +3220,21 @@ static void build_subroutines(BuildCtx *
+ | lp BASE, L:CRET1->base
+ | li TISNUM, LJ_TISNUM // Setup type comparison constants.
+ | lp RC, L:CRET1->top
+- | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
++ | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
+ | li ZERO, 0
+ | mr L, CRET1
+- | stw TMP3, TMPD
+- | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double)
++ | .FPU stw TMP3, TMPD
++ | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double)
+ | lwz LFUNC:RB, FRAME_FUNC(BASE)
+- | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float).
+- | stw TMP0, TONUM_HI
++ | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float).
++ | .FPU stw TMP0, TONUM_HI
+ | li TISNIL, LJ_TNIL
+ | li_vmstate INTERP
+- | lfs TOBIT, TMPD
+- | stw TMP3, TMPD
++ | .FPU lfs TOBIT, TMPD
++ | .FPU stw TMP3, TMPD
+ | sub RC, RC, BASE
+ | st_vmstate
+- | lfs TONUM, TMPD
++ | .FPU lfs TONUM, TMPD
+ | ins_callt
+ |.endif
+ |
+@@ -2836,7 +3248,7 @@ static void build_subroutines(BuildCtx *
+ | mr CARG2, RA
+ | bl extern lj_ccallback_leave // (CTState *cts, TValue *o)
+ | lwz CRET1, CTSTATE->cb.gpr[0]
+- | lfd FARG1, CTSTATE->cb.fpr[0]
++ | .FPU lfd FARG1, CTSTATE->cb.fpr[0]
+ | lwz CRET2, CTSTATE->cb.gpr[1]
+ | b ->vm_leave_unw
+ |.endif
+@@ -2870,14 +3282,14 @@ static void build_subroutines(BuildCtx *
+ | bge <1
+ |2:
+ | bney cr1, >3
+- | lfd f1, CCSTATE->fpr[0]
+- | lfd f2, CCSTATE->fpr[1]
+- | lfd f3, CCSTATE->fpr[2]
+- | lfd f4, CCSTATE->fpr[3]
+- | lfd f5, CCSTATE->fpr[4]
+- | lfd f6, CCSTATE->fpr[5]
+- | lfd f7, CCSTATE->fpr[6]
+- | lfd f8, CCSTATE->fpr[7]
++ | .FPU lfd f1, CCSTATE->fpr[0]
++ | .FPU lfd f2, CCSTATE->fpr[1]
++ | .FPU lfd f3, CCSTATE->fpr[2]
++ | .FPU lfd f4, CCSTATE->fpr[3]
++ | .FPU lfd f5, CCSTATE->fpr[4]
++ | .FPU lfd f6, CCSTATE->fpr[5]
++ | .FPU lfd f7, CCSTATE->fpr[6]
++ | .FPU lfd f8, CCSTATE->fpr[7]
+ |3:
+ | lp TMP0, CCSTATE->func
+ | lwz CARG2, CCSTATE->gpr[1]
+@@ -2894,7 +3306,7 @@ static void build_subroutines(BuildCtx *
+ | lwz TMP2, -4(r14)
+ | lwz TMP0, 4(r14)
+ | stw CARG1, CCSTATE:TMP1->gpr[0]
+- | stfd FARG1, CCSTATE:TMP1->fpr[0]
++ | .FPU stfd FARG1, CCSTATE:TMP1->fpr[0]
+ | stw CARG2, CCSTATE:TMP1->gpr[1]
+ | mtlr TMP0
+ | stw CARG3, CCSTATE:TMP1->gpr[2]
+@@ -2923,19 +3335,19 @@ static void build_ins(BuildCtx *ctx, BCO
+ case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT:
+ | // RA = src1*8, RD = src2*8, JMP with RD = target
+ |.if DUALNUM
+- | lwzux TMP0, RA, BASE
++ | lwzux CARG1, RA, BASE
+ | addi PC, PC, 4
+ | lwz CARG2, 4(RA)
+- | lwzux TMP1, RD, BASE
++ | lwzux CARG3, RD, BASE
+ | lwz TMP2, -4(PC)
+- | checknum cr0, TMP0
+- | lwz CARG3, 4(RD)
++ | checknum cr0, CARG1
++ | lwz CARG4, 4(RD)
+ | decode_RD4 TMP2, TMP2
+- | checknum cr1, TMP1
+- | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16)
++ | checknum cr1, CARG3
++ | addis SAVE0, TMP2, -(BCBIAS_J*4 >> 16)
+ | bne cr0, >7
+ | bne cr1, >8
+- | cmpw CARG2, CARG3
++ | cmpw CARG2, CARG4
+ if (op == BC_ISLT) {
+ | bge >2
+ } else if (op == BC_ISGE) {
+@@ -2946,28 +3358,41 @@ static void build_ins(BuildCtx *ctx, BCO
+ | ble >2
+ }
+ |1:
+- | add PC, PC, TMP2
++ | add PC, PC, SAVE0
+ |2:
+ | ins_next
+ |
+ |7: // RA is not an integer.
+ | bgt cr0, ->vmeta_comp
+ | // RA is a number.
+- | lfd f0, 0(RA)
++ | .FPU lfd f0, 0(RA)
+ | bgt cr1, ->vmeta_comp
+ | blt cr1, >4
+ | // RA is a number, RD is an integer.
+- | tonum_i f1, CARG3
++ |.if FPU
++ | tonum_i f1, CARG4
++ |.else
++ | bl ->vm_sfi2d_2
++ |.endif
+ | b >5
+ |
+ |8: // RA is an integer, RD is not an integer.
+ | bgt cr1, ->vmeta_comp
+ | // RA is an integer, RD is a number.
++ |.if FPU
+ | tonum_i f0, CARG2
++ |.else
++ | bl ->vm_sfi2d_1
++ |.endif
+ |4:
+- | lfd f1, 0(RD)
++ | .FPU lfd f1, 0(RD)
+ |5:
++ |.if FPU
+ | fcmpu cr0, f0, f1
++ |.else
++ | blex __ledf2
++ | cmpwi CRET1, 0
++ |.endif
+ if (op == BC_ISLT) {
+ | bge <2
+ } else if (op == BC_ISGE) {
+@@ -3015,42 +3440,42 @@ static void build_ins(BuildCtx *ctx, BCO
+ vk = op == BC_ISEQV;
+ | // RA = src1*8, RD = src2*8, JMP with RD = target
+ |.if DUALNUM
+- | lwzux TMP0, RA, BASE
++ | lwzux CARG1, RA, BASE
+ | addi PC, PC, 4
+ | lwz CARG2, 4(RA)
+- | lwzux TMP1, RD, BASE
+- | checknum cr0, TMP0
+- | lwz TMP2, -4(PC)
+- | checknum cr1, TMP1
+- | decode_RD4 TMP2, TMP2
+- | lwz CARG3, 4(RD)
++ | lwzux CARG3, RD, BASE
++ | checknum cr0, CARG1
++ | lwz SAVE0, -4(PC)
++ | checknum cr1, CARG3
++ | decode_RD4 SAVE0, SAVE0
++ | lwz CARG4, 4(RD)
+ | cror 4*cr7+gt, 4*cr0+gt, 4*cr1+gt
+- | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16)
++ | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16)
+ if (vk) {
+ | ble cr7, ->BC_ISEQN_Z
+ } else {
+ | ble cr7, ->BC_ISNEN_Z
+ }
+ |.else
+- | lwzux TMP0, RA, BASE
+- | lwz TMP2, 0(PC)
++ | lwzux CARG1, RA, BASE
++ | lwz SAVE0, 0(PC)
+ | lfd f0, 0(RA)
+ | addi PC, PC, 4
+- | lwzux TMP1, RD, BASE
+- | checknum cr0, TMP0
+- | decode_RD4 TMP2, TMP2
++ | lwzux CARG3, RD, BASE
++ | checknum cr0, CARG1
++ | decode_RD4 SAVE0, SAVE0
+ | lfd f1, 0(RD)
+- | checknum cr1, TMP1
+- | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16)
++ | checknum cr1, CARG3
++ | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16)
+ | bge cr0, >5
+ | bge cr1, >5
+ | fcmpu cr0, f0, f1
+ if (vk) {
+ | bne >1
+- | add PC, PC, TMP2
++ | add PC, PC, SAVE0
+ } else {
+ | beq >1
+- | add PC, PC, TMP2
++ | add PC, PC, SAVE0
+ }
+ |1:
+ | ins_next
+@@ -3058,36 +3483,36 @@ static void build_ins(BuildCtx *ctx, BCO
+ |5: // Either or both types are not numbers.
+ |.if not DUALNUM
+ | lwz CARG2, 4(RA)
+- | lwz CARG3, 4(RD)
++ | lwz CARG4, 4(RD)
+ |.endif
+ |.if FFI
+- | cmpwi cr7, TMP0, LJ_TCDATA
+- | cmpwi cr5, TMP1, LJ_TCDATA
++ | cmpwi cr7, CARG1, LJ_TCDATA
++ | cmpwi cr5, CARG3, LJ_TCDATA
+ |.endif
+- | not TMP3, TMP0
+- | cmplw TMP0, TMP1
+- | cmplwi cr1, TMP3, ~LJ_TISPRI // Primitive?
++ | not TMP2, CARG1
++ | cmplw CARG1, CARG3
++ | cmplwi cr1, TMP2, ~LJ_TISPRI // Primitive?
+ |.if FFI
+ | cror 4*cr7+eq, 4*cr7+eq, 4*cr5+eq
+ |.endif
+- | cmplwi cr6, TMP3, ~LJ_TISTABUD // Table or userdata?
++ | cmplwi cr6, TMP2, ~LJ_TISTABUD // Table or userdata?
+ |.if FFI
+ | beq cr7, ->vmeta_equal_cd
+ |.endif
+- | cmplw cr5, CARG2, CARG3
++ | cmplw cr5, CARG2, CARG4
+ | crandc 4*cr0+gt, 4*cr0+eq, 4*cr1+gt // 2: Same type and primitive.
+ | crorc 4*cr0+lt, 4*cr5+eq, 4*cr0+eq // 1: Same tv or different type.
+ | crand 4*cr0+eq, 4*cr0+eq, 4*cr5+eq // 0: Same type and same tv.
+- | mr SAVE0, PC
++ | mr SAVE1, PC
+ | cror 4*cr0+eq, 4*cr0+eq, 4*cr0+gt // 0 or 2.
+ | cror 4*cr0+lt, 4*cr0+lt, 4*cr0+gt // 1 or 2.
+ if (vk) {
+ | bne cr0, >6
+- | add PC, PC, TMP2
++ | add PC, PC, SAVE0
+ |6:
+ } else {
+ | beq cr0, >6
+- | add PC, PC, TMP2
++ | add PC, PC, SAVE0
+ |6:
+ }
+ |.if DUALNUM
+@@ -3102,6 +3527,7 @@ static void build_ins(BuildCtx *ctx, BCO
+ |
+ | // Different tables or userdatas. Need to check __eq metamethod.
+ | // Field metatable must be at same offset for GCtab and GCudata!
++ | mr CARG3, CARG4
+ | lwz TAB:TMP2, TAB:CARG2->metatable
+ | li CARG4, 1-vk // ne = 0 or 1.
+ | cmplwi TAB:TMP2, 0
+@@ -3109,7 +3535,7 @@ static void build_ins(BuildCtx *ctx, BCO
+ | lbz TMP2, TAB:TMP2->nomm
+ | andix. TMP2, TMP2, 1<<MM_eq
+ | bne <1 // Or 'no __eq' flag set?
+- | mr PC, SAVE0 // Restore old PC.
++ | mr PC, SAVE1 // Restore old PC.
+ | b ->vmeta_equal // Handle __eq metamethod.
+ break;
+
+@@ -3150,16 +3576,16 @@ static void build_ins(BuildCtx *ctx, BCO
+ vk = op == BC_ISEQN;
+ | // RA = src*8, RD = num_const*8, JMP with RD = target
+ |.if DUALNUM
+- | lwzux TMP0, RA, BASE
++ | lwzux CARG1, RA, BASE
+ | addi PC, PC, 4
+ | lwz CARG2, 4(RA)
+- | lwzux TMP1, RD, KBASE
+- | checknum cr0, TMP0
+- | lwz TMP2, -4(PC)
+- | checknum cr1, TMP1
+- | decode_RD4 TMP2, TMP2
+- | lwz CARG3, 4(RD)
+- | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16)
++ | lwzux CARG3, RD, KBASE
++ | checknum cr0, CARG1
++ | lwz SAVE0, -4(PC)
++ | checknum cr1, CARG3
++ | decode_RD4 SAVE0, SAVE0
++ | lwz CARG4, 4(RD)
++ | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16)
+ if (vk) {
+ |->BC_ISEQN_Z:
+ } else {
+@@ -3167,7 +3593,7 @@ static void build_ins(BuildCtx *ctx, BCO
+ }
+ | bne cr0, >7
+ | bne cr1, >8
+- | cmpw CARG2, CARG3
++ | cmpw CARG2, CARG4
+ |4:
+ |.else
+ if (vk) {
+@@ -3175,20 +3601,20 @@ static void build_ins(BuildCtx *ctx, BCO
+ } else {
+ |->BC_ISNEN_Z: // Dummy label.
+ }
+- | lwzx TMP0, BASE, RA
++ | lwzx CARG1, BASE, RA
+ | addi PC, PC, 4
+ | lfdx f0, BASE, RA
+- | lwz TMP2, -4(PC)
++ | lwz SAVE0, -4(PC)
+ | lfdx f1, KBASE, RD
+- | decode_RD4 TMP2, TMP2
+- | checknum TMP0
+- | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16)
++ | decode_RD4 SAVE0, SAVE0
++ | checknum CARG1
++ | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16)
+ | bge >3
+ | fcmpu cr0, f0, f1
+ |.endif
+ if (vk) {
+ | bne >1
+- | add PC, PC, TMP2
++ | add PC, PC, SAVE0
+ |1:
+ |.if not FFI
+ |3:
+@@ -3199,13 +3625,13 @@ static void build_ins(BuildCtx *ctx, BCO
+ |.if not FFI
+ |3:
+ |.endif
+- | add PC, PC, TMP2
++ | add PC, PC, SAVE0
+ |2:
+ }
+ | ins_next
+ |.if FFI
+ |3:
+- | cmpwi TMP0, LJ_TCDATA
++ | cmpwi CARG1, LJ_TCDATA
+ | beq ->vmeta_equal_cd
+ | b <1
+ |.endif
+@@ -3213,18 +3639,31 @@ static void build_ins(BuildCtx *ctx, BCO
+ |7: // RA is not an integer.
+ | bge cr0, <3
+ | // RA is a number.
+- | lfd f0, 0(RA)
++ | .FPU lfd f0, 0(RA)
+ | blt cr1, >1
+ | // RA is a number, RD is an integer.
+- | tonum_i f1, CARG3
++ |.if FPU
++ | tonum_i f1, CARG4
++ |.else
++ | bl ->vm_sfi2d_2
++ |.endif
+ | b >2
+ |
+ |8: // RA is an integer, RD is a number.
++ |.if FPU
+ | tonum_i f0, CARG2
++ |.else
++ | bl ->vm_sfi2d_1
++ |.endif
+ |1:
+- | lfd f1, 0(RD)
++ | .FPU lfd f1, 0(RD)
+ |2:
++ |.if FPU
+ | fcmpu cr0, f0, f1
++ |.else
++ | blex __ledf2
++ | cmpwi CRET1, 0
++ |.endif
+ | b <4
+ |.endif
+ break;
+@@ -3279,7 +3718,12 @@ static void build_ins(BuildCtx *ctx, BCO
+ | add PC, PC, TMP2
+ } else {
+ | li TMP1, LJ_TFALSE
++ |.if FPU
+ | lfdx f0, BASE, RD
++ |.else
++ | lwzux CARG1, RD, BASE
++ | lwz CARG2, 4(RD)
++ |.endif
+ | cmplw TMP0, TMP1
+ if (op == BC_ISTC) {
+ | bge >1
+@@ -3288,7 +3732,12 @@ static void build_ins(BuildCtx *ctx, BCO
+ }
+ | addis PC, PC, -(BCBIAS_J*4 >> 16)
+ | decode_RD4 TMP2, INS
++ |.if FPU
+ | stfdx f0, BASE, RA
++ |.else
++ | stwux CARG1, RA, BASE
++ | stw CARG2, 4(RA)
++ |.endif
+ | add PC, PC, TMP2
+ |1:
+ }
+@@ -3323,8 +3772,15 @@ static void build_ins(BuildCtx *ctx, BCO
+ case BC_MOV:
+ | // RA = dst*8, RD = src*8
+ | ins_next1
++ |.if FPU
+ | lfdx f0, BASE, RD
+ | stfdx f0, BASE, RA
++ |.else
++ | lwzux TMP0, RD, BASE
++ | lwz TMP1, 4(RD)
++ | stwux TMP0, RA, BASE
++ | stw TMP1, 4(RA)
++ |.endif
+ | ins_next2
+ break;
+ case BC_NOT:
+@@ -3426,44 +3882,65 @@ static void build_ins(BuildCtx *ctx, BCO
+ ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
+ ||switch (vk) {
+ ||case 0:
+- | lwzx TMP1, BASE, RB
++ | lwzx CARG1, BASE, RB
+ | .if DUALNUM
+- | lwzx TMP2, KBASE, RC
++ | lwzx CARG3, KBASE, RC
+ | .endif
++ | .if FPU
+ | lfdx f14, BASE, RB
+ | lfdx f15, KBASE, RC
++ | .else
++ | add TMP1, BASE, RB
++ | add TMP2, KBASE, RC
++ | lwz CARG2, 4(TMP1)
++ | lwz CARG4, 4(TMP2)
++ | .endif
+ | .if DUALNUM
+- | checknum cr0, TMP1
+- | checknum cr1, TMP2
++ | checknum cr0, CARG1
++ | checknum cr1, CARG3
+ | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt
+ | bge ->vmeta_arith_vn
+ | .else
+- | checknum TMP1; bge ->vmeta_arith_vn
++ | checknum CARG1; bge ->vmeta_arith_vn
+ | .endif
+ || break;
+ ||case 1:
+- | lwzx TMP1, BASE, RB
++ | lwzx CARG1, BASE, RB
+ | .if DUALNUM
+- | lwzx TMP2, KBASE, RC
++ | lwzx CARG3, KBASE, RC
+ | .endif
++ | .if FPU
+ | lfdx f15, BASE, RB
+ | lfdx f14, KBASE, RC
++ | .else
++ | add TMP1, BASE, RB
++ | add TMP2, KBASE, RC
++ | lwz CARG2, 4(TMP1)
++ | lwz CARG4, 4(TMP2)
++ | .endif
+ | .if DUALNUM
+- | checknum cr0, TMP1
+- | checknum cr1, TMP2
++ | checknum cr0, CARG1
++ | checknum cr1, CARG3
+ | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt
+ | bge ->vmeta_arith_nv
+ | .else
+- | checknum TMP1; bge ->vmeta_arith_nv
++ | checknum CARG1; bge ->vmeta_arith_nv
+ | .endif
+ || break;
+ ||default:
+- | lwzx TMP1, BASE, RB
+- | lwzx TMP2, BASE, RC
++ | lwzx CARG1, BASE, RB
++ | lwzx CARG3, BASE, RC
++ | .if FPU
+ | lfdx f14, BASE, RB
+ | lfdx f15, BASE, RC
+- | checknum cr0, TMP1
+- | checknum cr1, TMP2
++ | .else
++ | add TMP1, BASE, RB
++ | add TMP2, BASE, RC
++ | lwz CARG2, 4(TMP1)
++ | lwz CARG4, 4(TMP2)
++ | .endif
++ | checknum cr0, CARG1
++ | checknum cr1, CARG3
+ | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt
+ | bge ->vmeta_arith_vv
+ || break;
+@@ -3497,48 +3974,78 @@ static void build_ins(BuildCtx *ctx, BCO
+ | fsub a, b, a // b - floor(b/c)*c
+ |.endmacro
+ |
++ |.macro sfpmod
++ |->BC_MODVN_Z:
++ | stw CARG1, SFSAVE_1
++ | stw CARG2, SFSAVE_2
++ | mr SAVE0, CARG3
++ | mr SAVE1, CARG4
++ | blex __divdf3
++ | blex floor
++ | mr CARG3, SAVE0
++ | mr CARG4, SAVE1
++ | blex __muldf3
++ | mr CARG3, CRET1
++ | mr CARG4, CRET2
++ | lwz CARG1, SFSAVE_1
++ | lwz CARG2, SFSAVE_2
++ | blex __subdf3
++ |.endmacro
++ |
+ |.macro ins_arithfp, fpins
+ | ins_arithpre
+ |.if "fpins" == "fpmod_"
+ | b ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway.
+- |.else
++ |.elif FPU
+ | fpins f0, f14, f15
+ | ins_next1
+ | stfdx f0, BASE, RA
+ | ins_next2
++ |.else
++ | blex __divdf3 // Only soft-float div uses this macro.
++ | ins_next1
++ | stwux CRET1, RA, BASE
++ | stw CRET2, 4(RA)
++ | ins_next2
+ |.endif
+ |.endmacro
+ |
+- |.macro ins_arithdn, intins, fpins
++ |.macro ins_arithdn, intins, fpins, fpcall
+ | // RA = dst*8, RB = src1*8, RC = src2*8 | num_const*8
+ ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
+ ||switch (vk) {
+ ||case 0:
+- | lwzux TMP1, RB, BASE
+- | lwzux TMP2, RC, KBASE
+- | lwz CARG1, 4(RB)
+- | checknum cr0, TMP1
+- | lwz CARG2, 4(RC)
++ | lwzux CARG1, RB, BASE
++ | lwzux CARG3, RC, KBASE
++ | lwz CARG2, 4(RB)
++ | checknum cr0, CARG1
++ | lwz CARG4, 4(RC)
++ | checknum cr1, CARG3
+ || break;
+ ||case 1:
+- | lwzux TMP1, RB, BASE
+- | lwzux TMP2, RC, KBASE
+- | lwz CARG2, 4(RB)
+- | checknum cr0, TMP1
+- | lwz CARG1, 4(RC)
++ | lwzux CARG3, RB, BASE
++ | lwzux CARG1, RC, KBASE
++ | lwz CARG4, 4(RB)
++ | checknum cr0, CARG3
++ | lwz CARG2, 4(RC)
++ | checknum cr1, CARG1
+ || break;
+ ||default:
+- | lwzux TMP1, RB, BASE
+- | lwzux TMP2, RC, BASE
+- | lwz CARG1, 4(RB)
+- | checknum cr0, TMP1
+- | lwz CARG2, 4(RC)
++ | lwzux CARG1, RB, BASE
++ | lwzux CARG3, RC, BASE
++ | lwz CARG2, 4(RB)
++ | checknum cr0, CARG1
++ | lwz CARG4, 4(RC)
++ | checknum cr1, CARG3
+ || break;
+ ||}
+- | checknum cr1, TMP2
+ | bne >5
+ | bne cr1, >5
+- | intins CARG1, CARG1, CARG2
++ |.if "intins" == "intmod"
++ | mr CARG1, CARG2
++ | mr CARG2, CARG4
++ |.endif
++ | intins CARG1, CARG2, CARG4
+ | bso >4
+ |1:
+ | ins_next1
+@@ -3550,29 +4057,40 @@ static void build_ins(BuildCtx *ctx, BCO
+ | checkov TMP0, <1 // Ignore unrelated overflow.
+ | ins_arithfallback b
+ |5: // FP variant.
++ |.if FPU
+ ||if (vk == 1) {
+ | lfd f15, 0(RB)
+- | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt
+ | lfd f14, 0(RC)
+ ||} else {
+ | lfd f14, 0(RB)
+- | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt
+ | lfd f15, 0(RC)
+ ||}
++ |.endif
++ | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt
+ | ins_arithfallback bge
+ |.if "fpins" == "fpmod_"
+ | b ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway.
+ |.else
++ |.if FPU
+ | fpins f0, f14, f15
+- | ins_next1
+ | stfdx f0, BASE, RA
++ |.else
++ |.if "fpcall" == "sfpmod"
++ | sfpmod
++ |.else
++ | blex fpcall
++ |.endif
++ | stwux CRET1, RA, BASE
++ | stw CRET2, 4(RA)
++ |.endif
++ | ins_next1
+ | b <2
+ |.endif
+ |.endmacro
+ |
+- |.macro ins_arith, intins, fpins
++ |.macro ins_arith, intins, fpins, fpcall
+ |.if DUALNUM
+- | ins_arithdn intins, fpins
++ | ins_arithdn intins, fpins, fpcall
+ |.else
+ | ins_arithfp fpins
+ |.endif
+@@ -3587,9 +4105,9 @@ static void build_ins(BuildCtx *ctx, BCO
+ | addo. TMP0, TMP0, TMP3
+ | add y, a, b
+ |.endmacro
+- | ins_arith addo32., fadd
++ | ins_arith addo32., fadd, __adddf3
+ |.else
+- | ins_arith addo., fadd
++ | ins_arith addo., fadd, __adddf3
+ |.endif
+ break;
+ case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
+@@ -3601,36 +4119,48 @@ static void build_ins(BuildCtx *ctx, BCO
+ | subo. TMP0, TMP0, TMP3
+ | sub y, a, b
+ |.endmacro
+- | ins_arith subo32., fsub
++ | ins_arith subo32., fsub, __subdf3
+ |.else
+- | ins_arith subo., fsub
++ | ins_arith subo., fsub, __subdf3
+ |.endif
+ break;
+ case BC_MULVN: case BC_MULNV: case BC_MULVV:
+- | ins_arith mullwo., fmul
++ | ins_arith mullwo., fmul, __muldf3
+ break;
+ case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
+ | ins_arithfp fdiv
+ break;
+ case BC_MODVN:
+- | ins_arith intmod, fpmod
++ | ins_arith intmod, fpmod, sfpmod
+ break;
+ case BC_MODNV: case BC_MODVV:
+- | ins_arith intmod, fpmod_
++ | ins_arith intmod, fpmod_, sfpmod
+ break;
+ case BC_POW:
+ | // NYI: (partial) integer arithmetic.
+- | lwzx TMP1, BASE, RB
++ | lwzx CARG1, BASE, RB
++ | lwzx CARG3, BASE, RC
++ |.if FPU
+ | lfdx FARG1, BASE, RB
+- | lwzx TMP2, BASE, RC
+ | lfdx FARG2, BASE, RC
+- | checknum cr0, TMP1
+- | checknum cr1, TMP2
++ |.else
++ | add TMP1, BASE, RB
++ | add TMP2, BASE, RC
++ | lwz CARG2, 4(TMP1)
++ | lwz CARG4, 4(TMP2)
++ |.endif
++ | checknum cr0, CARG1
++ | checknum cr1, CARG3
+ | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt
+ | bge ->vmeta_arith_vv
+ | blex pow
+ | ins_next1
++ |.if FPU
+ | stfdx FARG1, BASE, RA
++ |.else
++ | stwux CARG1, RA, BASE
++ | stw CARG2, 4(RA)
++ |.endif
+ | ins_next2
+ break;
+
+@@ -3650,8 +4180,15 @@ static void build_ins(BuildCtx *ctx, BCO
+ | lp BASE, L->base
+ | bne ->vmeta_binop
+ | ins_next1
++ |.if FPU
+ | lfdx f0, BASE, SAVE0 // Copy result from RB to RA.
+ | stfdx f0, BASE, RA
++ |.else
++ | lwzux TMP0, SAVE0, BASE
++ | lwz TMP1, 4(SAVE0)
++ | stwux TMP0, RA, BASE
++ | stw TMP1, 4(RA)
++ |.endif
+ | ins_next2
+ break;
+
+@@ -3714,8 +4251,15 @@ static void build_ins(BuildCtx *ctx, BCO
+ case BC_KNUM:
+ | // RA = dst*8, RD = num_const*8
+ | ins_next1
++ |.if FPU
+ | lfdx f0, KBASE, RD
+ | stfdx f0, BASE, RA
++ |.else
++ | lwzux TMP0, RD, KBASE
++ | lwz TMP1, 4(RD)
++ | stwux TMP0, RA, BASE
++ | stw TMP1, 4(RA)
++ |.endif
+ | ins_next2
+ break;
+ case BC_KPRI:
+@@ -3748,8 +4292,15 @@ static void build_ins(BuildCtx *ctx, BCO
+ | lwzx UPVAL:RB, LFUNC:RB, RD
+ | ins_next1
+ | lwz TMP1, UPVAL:RB->v
++ |.if FPU
+ | lfd f0, 0(TMP1)
+ | stfdx f0, BASE, RA
++ |.else
++ | lwz TMP2, 0(TMP1)
++ | lwz TMP3, 4(TMP1)
++ | stwux TMP2, RA, BASE
++ | stw TMP3, 4(RA)
++ |.endif
+ | ins_next2
+ break;
+ case BC_USETV:
+@@ -3757,14 +4308,24 @@ static void build_ins(BuildCtx *ctx, BCO
+ | lwz LFUNC:RB, FRAME_FUNC(BASE)
+ | srwi RA, RA, 1
+ | addi RA, RA, offsetof(GCfuncL, uvptr)
++ |.if FPU
+ | lfdux f0, RD, BASE
++ |.else
++ | lwzux CARG1, RD, BASE
++ | lwz CARG3, 4(RD)
++ |.endif
+ | lwzx UPVAL:RB, LFUNC:RB, RA
+ | lbz TMP3, UPVAL:RB->marked
+ | lwz CARG2, UPVAL:RB->v
+ | andix. TMP3, TMP3, LJ_GC_BLACK // isblack(uv)
+ | lbz TMP0, UPVAL:RB->closed
+ | lwz TMP2, 0(RD)
++ |.if FPU
+ | stfd f0, 0(CARG2)
++ |.else
++ | stw CARG1, 0(CARG2)
++ | stw CARG3, 4(CARG2)
++ |.endif
+ | cmplwi cr1, TMP0, 0
+ | lwz TMP1, 4(RD)
+ | cror 4*cr0+eq, 4*cr0+eq, 4*cr1+eq
+@@ -3820,11 +4381,21 @@ static void build_ins(BuildCtx *ctx, BCO
+ | lwz LFUNC:RB, FRAME_FUNC(BASE)
+ | srwi RA, RA, 1
+ | addi RA, RA, offsetof(GCfuncL, uvptr)
++ |.if FPU
+ | lfdx f0, KBASE, RD
++ |.else
++ | lwzux TMP2, RD, KBASE
++ | lwz TMP3, 4(RD)
++ |.endif
+ | lwzx UPVAL:RB, LFUNC:RB, RA
+ | ins_next1
+ | lwz TMP1, UPVAL:RB->v
++ |.if FPU
+ | stfd f0, 0(TMP1)
++ |.else
++ | stw TMP2, 0(TMP1)
++ | stw TMP3, 4(TMP1)
++ |.endif
+ | ins_next2
+ break;
+ case BC_USETP:
+@@ -3972,11 +4543,21 @@ static void build_ins(BuildCtx *ctx, BCO
+ |.endif
+ | ble ->vmeta_tgetv // Integer key and in array part?
+ | lwzx TMP0, TMP1, TMP2
++ |.if FPU
+ | lfdx f14, TMP1, TMP2
++ |.else
++ | lwzux SAVE0, TMP1, TMP2
++ | lwz SAVE1, 4(TMP1)
++ |.endif
+ | checknil TMP0; beq >2
+ |1:
+ | ins_next1
++ |.if FPU
+ | stfdx f14, BASE, RA
++ |.else
++ | stwux SAVE0, RA, BASE
++ | stw SAVE1, 4(RA)
++ |.endif
+ | ins_next2
+ |
+ |2: // Check for __index if table value is nil.
+@@ -4052,12 +4633,22 @@ static void build_ins(BuildCtx *ctx, BCO
+ | lwz TMP1, TAB:RB->asize
+ | lwz TMP2, TAB:RB->array
+ | cmplw TMP0, TMP1; bge ->vmeta_tgetb
++ |.if FPU
+ | lwzx TMP1, TMP2, RC
+ | lfdx f0, TMP2, RC
++ |.else
++ | lwzux TMP1, TMP2, RC
++ | lwz TMP3, 4(TMP2)
++ |.endif
+ | checknil TMP1; beq >5
+ |1:
+ | ins_next1
++ |.if FPU
+ | stfdx f0, BASE, RA
++ |.else
++ | stwux TMP1, RA, BASE
++ | stw TMP3, 4(RA)
++ |.endif
+ | ins_next2
+ |
+ |5: // Check for __index if table value is nil.
+@@ -4087,10 +4678,20 @@ static void build_ins(BuildCtx *ctx, BCO
+ | cmplw TMP0, CARG2
+ | slwi TMP2, CARG2, 3
+ | ble ->vmeta_tgetr // In array part?
++ |.if FPU
+ | lfdx f14, TMP1, TMP2
++ |.else
++ | lwzux SAVE0, TMP2, TMP1
++ | lwz SAVE1, 4(TMP2)
++ |.endif
+ |->BC_TGETR_Z:
+ | ins_next1
++ |.if FPU
+ | stfdx f14, BASE, RA
++ |.else
++ | stwux SAVE0, RA, BASE
++ | stw SAVE1, 4(RA)
++ |.endif
+ | ins_next2
+ break;
+
+@@ -4131,11 +4732,22 @@ static void build_ins(BuildCtx *ctx, BCO
+ | ble ->vmeta_tsetv // Integer key and in array part?
+ | lwzx TMP2, TMP1, TMP0
+ | lbz TMP3, TAB:RB->marked
++ |.if FPU
+ | lfdx f14, BASE, RA
++ |.else
++ | add SAVE1, BASE, RA
++ | lwz SAVE0, 0(SAVE1)
++ | lwz SAVE1, 4(SAVE1)
++ |.endif
+ | checknil TMP2; beq >3
+ |1:
+ | andix. TMP2, TMP3, LJ_GC_BLACK // isblack(table)
++ |.if FPU
+ | stfdx f14, TMP1, TMP0
++ |.else
++ | stwux SAVE0, TMP1, TMP0
++ | stw SAVE1, 4(TMP1)
++ |.endif
+ | bne >7
+ |2:
+ | ins_next
+@@ -4176,7 +4788,13 @@ static void build_ins(BuildCtx *ctx, BCO
+ | lwz NODE:TMP2, TAB:RB->node
+ | stb ZERO, TAB:RB->nomm // Clear metamethod cache.
+ | and TMP1, TMP1, TMP0 // idx = str->hash & tab->hmask
++ |.if FPU
+ | lfdx f14, BASE, RA
++ |.else
++ | add CARG2, BASE, RA
++ | lwz SAVE0, 0(CARG2)
++ | lwz SAVE1, 4(CARG2)
++ |.endif
+ | slwi TMP0, TMP1, 5
+ | slwi TMP1, TMP1, 3
+ | sub TMP1, TMP0, TMP1
+@@ -4192,7 +4810,12 @@ static void build_ins(BuildCtx *ctx, BCO
+ | checknil CARG2; beq >4 // Key found, but nil value?
+ |2:
+ | andix. TMP0, TMP3, LJ_GC_BLACK // isblack(table)
++ |.if FPU
+ | stfd f14, NODE:TMP2->val
++ |.else
++ | stw SAVE0, NODE:TMP2->val.u32.hi
++ | stw SAVE1, NODE:TMP2->val.u32.lo
++ |.endif
+ | bne >7
+ |3:
+ | ins_next
+@@ -4231,7 +4854,12 @@ static void build_ins(BuildCtx *ctx, BCO
+ | bl extern lj_tab_newkey // (lua_State *L, GCtab *t, TValue *k)
+ | // Returns TValue *.
+ | lp BASE, L->base
++ |.if FPU
+ | stfd f14, 0(CRET1)
++ |.else
++ | stw SAVE0, 0(CRET1)
++ | stw SAVE1, 4(CRET1)
++ |.endif
+ | b <3 // No 2nd write barrier needed.
+ |
+ |7: // Possible table write barrier for the value. Skip valiswhite check.
+@@ -4248,13 +4876,24 @@ static void build_ins(BuildCtx *ctx, BCO
+ | lwz TMP2, TAB:RB->array
+ | lbz TMP3, TAB:RB->marked
+ | cmplw TMP0, TMP1
++ |.if FPU
+ | lfdx f14, BASE, RA
++ |.else
++ | add CARG2, BASE, RA
++ | lwz SAVE0, 0(CARG2)
++ | lwz SAVE1, 4(CARG2)
++ |.endif
+ | bge ->vmeta_tsetb
+ | lwzx TMP1, TMP2, RC
+ | checknil TMP1; beq >5
+ |1:
+ | andix. TMP0, TMP3, LJ_GC_BLACK // isblack(table)
++ |.if FPU
+ | stfdx f14, TMP2, RC
++ |.else
++ | stwux SAVE0, RC, TMP2
++ | stw SAVE1, 4(RC)
++ |.endif
+ | bne >7
+ |2:
+ | ins_next
+@@ -4294,10 +4933,20 @@ static void build_ins(BuildCtx *ctx, BCO
+ |2:
+ | cmplw TMP0, CARG3
+ | slwi TMP2, CARG3, 3
++ |.if FPU
+ | lfdx f14, BASE, RA
++ |.else
++ | lwzux SAVE0, RA, BASE
++ | lwz SAVE1, 4(RA)
++ |.endif
+ | ble ->vmeta_tsetr // In array part?
+ | ins_next1
++ |.if FPU
+ | stfdx f14, TMP1, TMP2
++ |.else
++ | stwux SAVE0, TMP1, TMP2
++ | stw SAVE1, 4(TMP1)
++ |.endif
+ | ins_next2
+ |
+ |7: // Possible table write barrier for the value. Skip valiswhite check.
+@@ -4327,10 +4976,20 @@ static void build_ins(BuildCtx *ctx, BCO
+ | add TMP1, TMP1, TMP0
+ | andix. TMP0, TMP3, LJ_GC_BLACK // isblack(table)
+ |3: // Copy result slots to table.
++ |.if FPU
+ | lfd f0, 0(RA)
++ |.else
++ | lwz SAVE0, 0(RA)
++ | lwz SAVE1, 4(RA)
++ |.endif
+ | addi RA, RA, 8
+ | cmpw cr1, RA, TMP2
++ |.if FPU
+ | stfd f0, 0(TMP1)
++ |.else
++ | stw SAVE0, 0(TMP1)
++ | stw SAVE1, 4(TMP1)
++ |.endif
+ | addi TMP1, TMP1, 8
+ | blt cr1, <3
+ | bne >7
+@@ -4397,9 +5056,20 @@ static void build_ins(BuildCtx *ctx, BCO
+ | beq cr1, >3
+ |2:
+ | addi TMP3, TMP2, 8
++ |.if FPU
+ | lfdx f0, RA, TMP2
++ |.else
++ | add CARG3, RA, TMP2
++ | lwz CARG1, 0(CARG3)
++ | lwz CARG2, 4(CARG3)
++ |.endif
+ | cmplw cr1, TMP3, NARGS8:RC
++ |.if FPU
+ | stfdx f0, BASE, TMP2
++ |.else
++ | stwux CARG1, TMP2, BASE
++ | stw CARG2, 4(TMP2)
++ |.endif
+ | mr TMP2, TMP3
+ | bne cr1, <2
+ |3:
+@@ -4432,14 +5102,28 @@ static void build_ins(BuildCtx *ctx, BCO
+ | add BASE, BASE, RA
+ | lwz TMP1, -24(BASE)
+ | lwz LFUNC:RB, -20(BASE)
++ |.if FPU
+ | lfd f1, -8(BASE)
+ | lfd f0, -16(BASE)
++ |.else
++ | lwz CARG1, -8(BASE)
++ | lwz CARG2, -4(BASE)
++ | lwz CARG3, -16(BASE)
++ | lwz CARG4, -12(BASE)
++ |.endif
+ | stw TMP1, 0(BASE) // Copy callable.
+ | stw LFUNC:RB, 4(BASE)
+ | checkfunc TMP1
+- | stfd f1, 16(BASE) // Copy control var.
+ | li NARGS8:RC, 16 // Iterators get 2 arguments.
++ |.if FPU
++ | stfd f1, 16(BASE) // Copy control var.
+ | stfdu f0, 8(BASE) // Copy state.
++ |.else
++ | stw CARG1, 16(BASE) // Copy control var.
++ | stw CARG2, 20(BASE)
++ | stwu CARG3, 8(BASE) // Copy state.
++ | stw CARG4, 4(BASE)
++ |.endif
+ | bne ->vmeta_call
+ | ins_call
+ break;
+@@ -4460,7 +5144,12 @@ static void build_ins(BuildCtx *ctx, BCO
+ | slwi TMP3, RC, 3
+ | bge >5 // Index points after array part?
+ | lwzx TMP2, TMP1, TMP3
++ |.if FPU
+ | lfdx f0, TMP1, TMP3
++ |.else
++ | lwzux CARG1, TMP3, TMP1
++ | lwz CARG2, 4(TMP3)
++ |.endif
+ | checknil TMP2
+ | lwz INS, -4(PC)
+ | beq >4
+@@ -4472,7 +5161,12 @@ static void build_ins(BuildCtx *ctx, BCO
+ |.endif
+ | addi RC, RC, 1
+ | addis TMP3, PC, -(BCBIAS_J*4 >> 16)
++ |.if FPU
+ | stfd f0, 8(RA)
++ |.else
++ | stw CARG1, 8(RA)
++ | stw CARG2, 12(RA)
++ |.endif
+ | decode_RD4 TMP1, INS
+ | stw RC, -4(RA) // Update control var.
+ | add PC, TMP1, TMP3
+@@ -4497,17 +5191,38 @@ static void build_ins(BuildCtx *ctx, BCO
+ | slwi RB, RC, 3
+ | sub TMP3, TMP3, RB
+ | lwzx RB, TMP2, TMP3
++ |.if FPU
+ | lfdx f0, TMP2, TMP3
++ |.else
++ | add CARG3, TMP2, TMP3
++ | lwz CARG1, 0(CARG3)
++ | lwz CARG2, 4(CARG3)
++ |.endif
+ | add NODE:TMP3, TMP2, TMP3
+ | checknil RB
+ | lwz INS, -4(PC)
+ | beq >7
++ |.if FPU
+ | lfd f1, NODE:TMP3->key
++ |.else
++ | lwz CARG3, NODE:TMP3->key.u32.hi
++ | lwz CARG4, NODE:TMP3->key.u32.lo
++ |.endif
+ | addis TMP2, PC, -(BCBIAS_J*4 >> 16)
++ |.if FPU
+ | stfd f0, 8(RA)
++ |.else
++ | stw CARG1, 8(RA)
++ | stw CARG2, 12(RA)
++ |.endif
+ | add RC, RC, TMP0
+ | decode_RD4 TMP1, INS
++ |.if FPU
+ | stfd f1, 0(RA)
++ |.else
++ | stw CARG3, 0(RA)
++ | stw CARG4, 4(RA)
++ |.endif
+ | addi RC, RC, 1
+ | add PC, TMP1, TMP2
+ | stw RC, -4(RA) // Update control var.
+@@ -4573,9 +5288,19 @@ static void build_ins(BuildCtx *ctx, BCO
+ | subi TMP2, TMP2, 16
+ | ble >2 // No vararg slots?
+ |1: // Copy vararg slots to destination slots.
++ |.if FPU
+ | lfd f0, 0(RC)
++ |.else
++ | lwz CARG1, 0(RC)
++ | lwz CARG2, 4(RC)
++ |.endif
+ | addi RC, RC, 8
++ |.if FPU
+ | stfd f0, 0(RA)
++ |.else
++ | stw CARG1, 0(RA)
++ | stw CARG2, 4(RA)
++ |.endif
+ | cmplw RA, TMP2
+ | cmplw cr1, RC, TMP3
+ | bge >3 // All destination slots filled?
+@@ -4598,9 +5323,19 @@ static void build_ins(BuildCtx *ctx, BCO
+ | addi MULTRES, TMP1, 8
+ | bgt >7
+ |6:
++ |.if FPU
+ | lfd f0, 0(RC)
++ |.else
++ | lwz CARG1, 0(RC)
++ | lwz CARG2, 4(RC)
++ |.endif
+ | addi RC, RC, 8
++ |.if FPU
+ | stfd f0, 0(RA)
++ |.else
++ | stw CARG1, 0(RA)
++ | stw CARG2, 4(RA)
++ |.endif
+ | cmplw RC, TMP3
+ | addi RA, RA, 8
+ | blt <6 // More vararg slots?
+@@ -4651,14 +5386,38 @@ static void build_ins(BuildCtx *ctx, BCO
+ | li TMP1, 0
+ |2:
+ | addi TMP3, TMP1, 8
++ |.if FPU
+ | lfdx f0, RA, TMP1
++ |.else
++ | add CARG3, RA, TMP1
++ | lwz CARG1, 0(CARG3)
++ | lwz CARG2, 4(CARG3)
++ |.endif
+ | cmpw TMP3, RC
++ |.if FPU
+ | stfdx f0, TMP2, TMP1
++ |.else
++ | add CARG3, TMP2, TMP1
++ | stw CARG1, 0(CARG3)
++ | stw CARG2, 4(CARG3)
++ |.endif
+ | beq >3
+ | addi TMP1, TMP3, 8
++ |.if FPU
+ | lfdx f1, RA, TMP3
++ |.else
++ | add CARG3, RA, TMP3
++ | lwz CARG1, 0(CARG3)
++ | lwz CARG2, 4(CARG3)
++ |.endif
+ | cmpw TMP1, RC
++ |.if FPU
+ | stfdx f1, TMP2, TMP3
++ |.else
++ | add CARG3, TMP2, TMP3
++ | stw CARG1, 0(CARG3)
++ | stw CARG2, 4(CARG3)
++ |.endif
+ | bne <2
+ |3:
+ |5:
+@@ -4700,8 +5459,15 @@ static void build_ins(BuildCtx *ctx, BCO
+ | subi TMP2, BASE, 8
+ | decode_RB8 RB, INS
+ if (op == BC_RET1) {
++ |.if FPU
+ | lfd f0, 0(RA)
+ | stfd f0, 0(TMP2)
++ |.else
++ | lwz CARG1, 0(RA)
++ | lwz CARG2, 4(RA)
++ | stw CARG1, 0(TMP2)
++ | stw CARG2, 4(TMP2)
++ |.endif
+ }
+ |5:
+ | cmplw RB, RD
+@@ -4762,11 +5528,11 @@ static void build_ins(BuildCtx *ctx, BCO
+ |4:
+ | stw CARG1, FORL_IDX*8+4(RA)
+ } else {
+- | lwz TMP3, FORL_STEP*8(RA)
++ | lwz SAVE0, FORL_STEP*8(RA)
+ | lwz CARG3, FORL_STEP*8+4(RA)
+ | lwz TMP2, FORL_STOP*8(RA)
+ | lwz CARG2, FORL_STOP*8+4(RA)
+- | cmplw cr7, TMP3, TISNUM
++ | cmplw cr7, SAVE0, TISNUM
+ | cmplw cr1, TMP2, TISNUM
+ | crand 4*cr0+eq, 4*cr0+eq, 4*cr7+eq
+ | crand 4*cr0+eq, 4*cr0+eq, 4*cr1+eq
+@@ -4809,41 +5575,80 @@ static void build_ins(BuildCtx *ctx, BCO
+ if (vk) {
+ |.if DUALNUM
+ |9: // FP loop.
++ |.if FPU
+ | lfd f1, FORL_IDX*8(RA)
+ |.else
++ | lwz CARG1, FORL_IDX*8(RA)
++ | lwz CARG2, FORL_IDX*8+4(RA)
++ |.endif
++ |.else
+ | lfdux f1, RA, BASE
+ |.endif
++ |.if FPU
+ | lfd f3, FORL_STEP*8(RA)
+ | lfd f2, FORL_STOP*8(RA)
+- | lwz TMP3, FORL_STEP*8(RA)
+ | fadd f1, f1, f3
+ | stfd f1, FORL_IDX*8(RA)
++ |.else
++ | lwz CARG3, FORL_STEP*8(RA)
++ | lwz CARG4, FORL_STEP*8+4(RA)
++ | mr SAVE1, RD
++ | blex __adddf3
++ | mr RD, SAVE1
++ | stw CRET1, FORL_IDX*8(RA)
++ | stw CRET2, FORL_IDX*8+4(RA)
++ | lwz CARG3, FORL_STOP*8(RA)
++ | lwz CARG4, FORL_STOP*8+4(RA)
++ |.endif
++ | lwz SAVE0, FORL_STEP*8(RA)
+ } else {
+ |.if DUALNUM
+ |9: // FP loop.
+ |.else
+ | lwzux TMP1, RA, BASE
+- | lwz TMP3, FORL_STEP*8(RA)
++ | lwz SAVE0, FORL_STEP*8(RA)
+ | lwz TMP2, FORL_STOP*8(RA)
+ | cmplw cr0, TMP1, TISNUM
+- | cmplw cr7, TMP3, TISNUM
++ | cmplw cr7, SAVE0, TISNUM
+ | cmplw cr1, TMP2, TISNUM
+ |.endif
++ |.if FPU
+ | lfd f1, FORL_IDX*8(RA)
++ |.else
++ | lwz CARG1, FORL_IDX*8(RA)
++ | lwz CARG2, FORL_IDX*8+4(RA)
++ |.endif
+ | crand 4*cr0+lt, 4*cr0+lt, 4*cr7+lt
+ | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt
++ |.if FPU
+ | lfd f2, FORL_STOP*8(RA)
++ |.else
++ | lwz CARG3, FORL_STOP*8(RA)
++ | lwz CARG4, FORL_STOP*8+4(RA)
++ |.endif
+ | bge ->vmeta_for
+ }
+- | cmpwi cr6, TMP3, 0
++ | cmpwi cr6, SAVE0, 0
+ if (op != BC_JFORL) {
+ | srwi RD, RD, 1
+ }
++ |.if FPU
+ | stfd f1, FORL_EXT*8(RA)
++ |.else
++ | stw CARG1, FORL_EXT*8(RA)
++ | stw CARG2, FORL_EXT*8+4(RA)
++ |.endif
+ if (op != BC_JFORL) {
+ | add RD, PC, RD
+ }
++ |.if FPU
+ | fcmpu cr0, f1, f2
++ |.else
++ | mr SAVE1, RD
++ | blex __ledf2
++ | cmpwi CRET1, 0
++ | mr RD, SAVE1
++ |.endif
+ if (op == BC_JFORI) {
+ | addis PC, RD, -(BCBIAS_J*4 >> 16)
+ }
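[Editor's note] In the soft-float FORL path above, the FP add and limit check that the FPU build does with fadd/fcmpu go through the libgcc soft-float runtime instead: `blex __adddf3` computes idx + step and `blex __ledf2` compares idx against the stop value, with SAVE0/SAVE1 preserving the step's hi word and RD across the calls; the step's sign is taken from the sign bit of its hi word (cmpwi cr6, SAVE0, 0). A rough C equivalent of one loop step, using the real libgcc entry points but an illustrative wrapper function:

    /* Illustrative wrapper around the libgcc soft-float helpers used above. */
    extern double __adddf3(double a, double b);  /* software a + b */
    extern int __ledf2(double a, double b);      /* <= 0 iff a <= b (no NaN) */

    static int forl_step_softfp(double *idx, double step, double stop)
    {
      *idx = __adddf3(*idx, step);               /* fadd f1, f1, f3 */
      if (step >= 0)                             /* VM tests the hi word's sign bit */
        return __ledf2(*idx, stop) <= 0;         /* keep looping while idx <= stop */
      else
        return __ledf2(stop, *idx) <= 0;         /* keep looping while idx >= stop */
    }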
--- /dev/null
+From 71b7bc88341945f13f3951e2bb5fd247b639ff7a Mon Sep 17 00:00:00 2001
+From: Mike Pall <mike>
+Date: Sun, 3 Sep 2017 23:20:53 +0200
+Subject: [PATCH] PPC: Add soft-float support to JIT compiler backend.
+
+Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+Sponsored by Cisco Systems, Inc.
+---
+ src/lj_arch.h | 1 -
+ src/lj_asm_ppc.h | 321 ++++++++++++++++++++++++++++++++++++++++-------
+ 2 files changed, 278 insertions(+), 44 deletions(-)
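[Editor's note] Both patches lean on the same big-endian PPC32 slot layout: a stack/table slot is 8 bytes, with the type tag (or the hi word of a number) at offset 0 and the low word at offset 4, which is why every lfd/stfd in the FPU build becomes a lwz/stw (or lwzux/stwux) pair in the soft-float build. A sketch of that layout, with names loosely following lj_obj.h but not copied from it:

    #include <stdint.h>

    /* Sketch of the 8-byte slot layout assumed above (big-endian PPC32). */
    typedef union SlotSketch {
      double n;                           /* numbers occupy the full slot */
      struct { uint32_t hi, lo; } u32;    /* the two words moved by lwz/stw */
      struct { uint32_t it, gcr; } tag;   /* tag word overlaps the hi word */
    } SlotSketch;

    /* Without an FPU, copying a slot is just two 32-bit moves. */
    static void copy_slot(SlotSketch *dst, const SlotSketch *src)
    {
      dst->u32.hi = src->u32.hi;
      dst->u32.lo = src->u32.lo;
    }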
+
+--- a/src/lj_arch.h
++++ b/src/lj_arch.h
+@@ -273,7 +273,6 @@
+ #endif
+
+ #if LJ_ABI_SOFTFP
+-#define LJ_ARCH_NOJIT 1 /* NYI */
+ #define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL
+ #else
+ #define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL_SINGLE
+--- a/src/lj_asm_ppc.h
++++ b/src/lj_asm_ppc.h
+@@ -226,6 +226,7 @@ static void asm_fusexrefx(ASMState *as,
+ emit_tab(as, pi, rt, left, right);
+ }
+
++#if !LJ_SOFTFP
+ /* Fuse to multiply-add/sub instruction. */
+ static int asm_fusemadd(ASMState *as, IRIns *ir, PPCIns pi, PPCIns pir)
+ {
+@@ -245,6 +246,7 @@ static int asm_fusemadd(ASMState *as, IR
+ }
+ return 0;
+ }
++#endif
+
+ /* -- Calls --------------------------------------------------------------- */
+
+@@ -253,13 +255,17 @@ static void asm_gencall(ASMState *as, co
+ {
+ uint32_t n, nargs = CCI_XNARGS(ci);
+ int32_t ofs = 8;
+- Reg gpr = REGARG_FIRSTGPR, fpr = REGARG_FIRSTFPR;
++ Reg gpr = REGARG_FIRSTGPR;
++#if !LJ_SOFTFP
++ Reg fpr = REGARG_FIRSTFPR;
++#endif
+ if ((void *)ci->func)
+ emit_call(as, (void *)ci->func);
+ for (n = 0; n < nargs; n++) { /* Setup args. */
+ IRRef ref = args[n];
+ if (ref) {
+ IRIns *ir = IR(ref);
++#if !LJ_SOFTFP
+ if (irt_isfp(ir->t)) {
+ if (fpr <= REGARG_LASTFPR) {
+ lua_assert(rset_test(as->freeset, fpr)); /* Already evicted. */
+@@ -271,7 +277,9 @@ static void asm_gencall(ASMState *as, co
+ emit_spstore(as, ir, r, ofs);
+ ofs += irt_isnum(ir->t) ? 8 : 4;
+ }
+- } else {
++ } else
++#endif
++ {
+ if (gpr <= REGARG_LASTGPR) {
+ lua_assert(rset_test(as->freeset, gpr)); /* Already evicted. */
+ ra_leftov(as, gpr, ref);
+@@ -290,8 +298,10 @@ static void asm_gencall(ASMState *as, co
+ }
+ checkmclim(as);
+ }
++#if !LJ_SOFTFP
+ if ((ci->flags & CCI_VARARG)) /* Vararg calls need to know about FPR use. */
+ emit_tab(as, fpr == REGARG_FIRSTFPR ? PPCI_CRXOR : PPCI_CREQV, 6, 6, 6);
++#endif
+ }
+
+ /* Setup result reg/sp for call. Evict scratch regs. */
+@@ -299,8 +309,10 @@ static void asm_setupresult(ASMState *as
+ {
+ RegSet drop = RSET_SCRATCH;
+ int hiop = ((ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t));
++#if !LJ_SOFTFP
+ if ((ci->flags & CCI_NOFPRCLOBBER))
+ drop &= ~RSET_FPR;
++#endif
+ if (ra_hasreg(ir->r))
+ rset_clear(drop, ir->r); /* Dest reg handled below. */
+ if (hiop && ra_hasreg((ir+1)->r))
+@@ -308,7 +320,7 @@ static void asm_setupresult(ASMState *as
+ ra_evictset(as, drop); /* Evictions must be performed first. */
+ if (ra_used(ir)) {
+ lua_assert(!irt_ispri(ir->t));
+- if (irt_isfp(ir->t)) {
++ if (!LJ_SOFTFP && irt_isfp(ir->t)) {
+ if ((ci->flags & CCI_CASTU64)) {
+ /* Use spill slot or temp slots. */
+ int32_t ofs = ir->s ? sps_scale(ir->s) : SPOFS_TMP;
+@@ -377,6 +389,7 @@ static void asm_retf(ASMState *as, IRIns
+
+ /* -- Type conversions ---------------------------------------------------- */
+
++#if !LJ_SOFTFP
+ static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
+ {
+ RegSet allow = RSET_FPR;
+@@ -409,15 +422,23 @@ static void asm_tobit(ASMState *as, IRIn
+ emit_fai(as, PPCI_STFD, tmp, RID_SP, SPOFS_TMP);
+ emit_fab(as, PPCI_FADD, tmp, left, right);
+ }
++#endif
+
+ static void asm_conv(ASMState *as, IRIns *ir)
+ {
+ IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
++#if !LJ_SOFTFP
+ int stfp = (st == IRT_NUM || st == IRT_FLOAT);
++#endif
+ IRRef lref = ir->op1;
+- lua_assert(irt_type(ir->t) != st);
+ lua_assert(!(irt_isint64(ir->t) ||
+ (st == IRT_I64 || st == IRT_U64))); /* Handled by SPLIT. */
++#if LJ_SOFTFP
++ /* FP conversions are handled by SPLIT. */
++ lua_assert(!irt_isfp(ir->t) && !(st == IRT_NUM || st == IRT_FLOAT));
++ /* Can't check for same types: SPLIT uses CONV int.int + BXOR for sfp NEG. */
++#else
++ lua_assert(irt_type(ir->t) != st);
+ if (irt_isfp(ir->t)) {
+ Reg dest = ra_dest(as, ir, RSET_FPR);
+ if (stfp) { /* FP to FP conversion. */
+@@ -476,7 +497,9 @@ static void asm_conv(ASMState *as, IRIns
+ emit_fb(as, PPCI_FCTIWZ, tmp, left);
+ }
+ }
+- } else {
++ } else
++#endif
++ {
+ Reg dest = ra_dest(as, ir, RSET_GPR);
+ if (st >= IRT_I8 && st <= IRT_U16) { /* Extend to 32 bit integer. */
+ Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
+@@ -496,17 +519,41 @@ static void asm_strto(ASMState *as, IRIn
+ {
+ const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num];
+ IRRef args[2];
+- int32_t ofs;
++ int32_t ofs = SPOFS_TMP;
++#if LJ_SOFTFP
++ ra_evictset(as, RSET_SCRATCH);
++ if (ra_used(ir)) {
++ if (ra_hasspill(ir->s) && ra_hasspill((ir+1)->s) &&
++ (ir->s & 1) == LJ_BE && (ir->s ^ 1) == (ir+1)->s) {
++ int i;
++ for (i = 0; i < 2; i++) {
++ Reg r = (ir+i)->r;
++ if (ra_hasreg(r)) {
++ ra_free(as, r);
++ ra_modified(as, r);
++ emit_spload(as, ir+i, r, sps_scale((ir+i)->s));
++ }
++ }
++ ofs = sps_scale(ir->s & ~1);
++ } else {
++ Reg rhi = ra_dest(as, ir+1, RSET_GPR);
++ Reg rlo = ra_dest(as, ir, rset_exclude(RSET_GPR, rhi));
++ emit_tai(as, PPCI_LWZ, rhi, RID_SP, ofs);
++ emit_tai(as, PPCI_LWZ, rlo, RID_SP, ofs+4);
++ }
++ }
++#else
+ RegSet drop = RSET_SCRATCH;
+ if (ra_hasreg(ir->r)) rset_set(drop, ir->r); /* Spill dest reg (if any). */
+ ra_evictset(as, drop);
++ if (ir->s) ofs = sps_scale(ir->s);
++#endif
+ asm_guardcc(as, CC_EQ);
+ emit_ai(as, PPCI_CMPWI, RID_RET, 0); /* Test return status. */
+ args[0] = ir->op1; /* GCstr *str */
+ args[1] = ASMREF_TMP1; /* TValue *n */
+ asm_gencall(as, ci, args);
+ /* Store the result to the spill slot or temp slots. */
+- ofs = ir->s ? sps_scale(ir->s) : SPOFS_TMP;
+ emit_tai(as, PPCI_ADDI, ra_releasetmp(as, ASMREF_TMP1), RID_SP, ofs);
+ }
+
+@@ -530,7 +577,10 @@ static void asm_tvptr(ASMState *as, Reg
+ Reg src = ra_alloc1(as, ref, allow);
+ emit_setgl(as, src, tmptv.gcr);
+ }
+- type = ra_allock(as, irt_toitype(ir->t), allow);
++ if (LJ_SOFTFP && (ir+1)->o == IR_HIOP)
++ type = ra_alloc1(as, ref+1, allow);
++ else
++ type = ra_allock(as, irt_toitype(ir->t), allow);
+ emit_setgl(as, type, tmptv.it);
+ }
+ }
+@@ -574,11 +624,27 @@ static void asm_href(ASMState *as, IRIns
+ Reg tisnum = RID_NONE, tmpnum = RID_NONE;
+ IRRef refkey = ir->op2;
+ IRIns *irkey = IR(refkey);
++ int isk = irref_isk(refkey);
+ IRType1 kt = irkey->t;
+ uint32_t khash;
+ MCLabel l_end, l_loop, l_next;
+
+ rset_clear(allow, tab);
++#if LJ_SOFTFP
++ if (!isk) {
++ key = ra_alloc1(as, refkey, allow);
++ rset_clear(allow, key);
++ if (irkey[1].o == IR_HIOP) {
++ if (ra_hasreg((irkey+1)->r)) {
++ tmpnum = (irkey+1)->r;
++ ra_noweak(as, tmpnum);
++ } else {
++ tmpnum = ra_allocref(as, refkey+1, allow);
++ }
++ rset_clear(allow, tmpnum);
++ }
++ }
++#else
+ if (irt_isnum(kt)) {
+ key = ra_alloc1(as, refkey, RSET_FPR);
+ tmpnum = ra_scratch(as, rset_exclude(RSET_FPR, key));
+@@ -588,6 +654,7 @@ static void asm_href(ASMState *as, IRIns
+ key = ra_alloc1(as, refkey, allow);
+ rset_clear(allow, key);
+ }
++#endif
+ tmp2 = ra_scratch(as, allow);
+ rset_clear(allow, tmp2);
+
+@@ -610,7 +677,7 @@ static void asm_href(ASMState *as, IRIns
+ asm_guardcc(as, CC_EQ);
+ else
+ emit_condbranch(as, PPCI_BC|PPCF_Y, CC_EQ, l_end);
+- if (irt_isnum(kt)) {
++ if (!LJ_SOFTFP && irt_isnum(kt)) {
+ emit_fab(as, PPCI_FCMPU, 0, tmpnum, key);
+ emit_condbranch(as, PPCI_BC, CC_GE, l_next);
+ emit_ab(as, PPCI_CMPLW, tmp1, tisnum);
+@@ -620,7 +687,10 @@ static void asm_href(ASMState *as, IRIns
+ emit_ab(as, PPCI_CMPW, tmp2, key);
+ emit_condbranch(as, PPCI_BC, CC_NE, l_next);
+ }
+- emit_ai(as, PPCI_CMPWI, tmp1, irt_toitype(irkey->t));
++ if (LJ_SOFTFP && ra_hasreg(tmpnum))
++ emit_ab(as, PPCI_CMPW, tmp1, tmpnum);
++ else
++ emit_ai(as, PPCI_CMPWI, tmp1, irt_toitype(irkey->t));
+ if (!irt_ispri(kt))
+ emit_tai(as, PPCI_LWZ, tmp2, dest, (int32_t)offsetof(Node, key.gcr));
+ }
+@@ -629,19 +699,19 @@ static void asm_href(ASMState *as, IRIns
+ (((char *)as->mcp-(char *)l_loop) & 0xffffu);
+
+ /* Load main position relative to tab->node into dest. */
+- khash = irref_isk(refkey) ? ir_khash(irkey) : 1;
++ khash = isk ? ir_khash(irkey) : 1;
+ if (khash == 0) {
+ emit_tai(as, PPCI_LWZ, dest, tab, (int32_t)offsetof(GCtab, node));
+ } else {
+ Reg tmphash = tmp1;
+- if (irref_isk(refkey))
++ if (isk)
+ tmphash = ra_allock(as, khash, allow);
+ emit_tab(as, PPCI_ADD, dest, dest, tmp1);
+ emit_tai(as, PPCI_MULLI, tmp1, tmp1, sizeof(Node));
+ emit_asb(as, PPCI_AND, tmp1, tmp2, tmphash);
+ emit_tai(as, PPCI_LWZ, dest, tab, (int32_t)offsetof(GCtab, node));
+ emit_tai(as, PPCI_LWZ, tmp2, tab, (int32_t)offsetof(GCtab, hmask));
+- if (irref_isk(refkey)) {
++ if (isk) {
+ /* Nothing to do. */
+ } else if (irt_isstr(kt)) {
+ emit_tai(as, PPCI_LWZ, tmp1, key, (int32_t)offsetof(GCstr, hash));
+@@ -651,13 +721,19 @@ static void asm_href(ASMState *as, IRIns
+ emit_asb(as, PPCI_XOR, tmp1, tmp1, tmp2);
+ emit_rotlwi(as, tmp1, tmp1, (HASH_ROT2+HASH_ROT1)&31);
+ emit_tab(as, PPCI_SUBF, tmp2, dest, tmp2);
+- if (irt_isnum(kt)) {
++ if (LJ_SOFTFP ? (irkey[1].o == IR_HIOP) : irt_isnum(kt)) {
++#if LJ_SOFTFP
++ emit_asb(as, PPCI_XOR, tmp2, key, tmp1);
++ emit_rotlwi(as, dest, tmp1, HASH_ROT1);
++ emit_tab(as, PPCI_ADD, tmp1, tmpnum, tmpnum);
++#else
+ int32_t ofs = ra_spill(as, irkey);
+ emit_asb(as, PPCI_XOR, tmp2, tmp2, tmp1);
+ emit_rotlwi(as, dest, tmp1, HASH_ROT1);
+ emit_tab(as, PPCI_ADD, tmp1, tmp1, tmp1);
+ emit_tai(as, PPCI_LWZ, tmp2, RID_SP, ofs+4);
+ emit_tai(as, PPCI_LWZ, tmp1, RID_SP, ofs);
++#endif
+ } else {
+ emit_asb(as, PPCI_XOR, tmp2, key, tmp1);
+ emit_rotlwi(as, dest, tmp1, HASH_ROT1);
+@@ -784,8 +860,8 @@ static PPCIns asm_fxloadins(IRIns *ir)
+ case IRT_U8: return PPCI_LBZ;
+ case IRT_I16: return PPCI_LHA;
+ case IRT_U16: return PPCI_LHZ;
+- case IRT_NUM: return PPCI_LFD;
+- case IRT_FLOAT: return PPCI_LFS;
++ case IRT_NUM: lua_assert(!LJ_SOFTFP); return PPCI_LFD;
++ case IRT_FLOAT: if (!LJ_SOFTFP) return PPCI_LFS;
+ default: return PPCI_LWZ;
+ }
+ }
+@@ -795,8 +871,8 @@ static PPCIns asm_fxstoreins(IRIns *ir)
+ switch (irt_type(ir->t)) {
+ case IRT_I8: case IRT_U8: return PPCI_STB;
+ case IRT_I16: case IRT_U16: return PPCI_STH;
+- case IRT_NUM: return PPCI_STFD;
+- case IRT_FLOAT: return PPCI_STFS;
++ case IRT_NUM: lua_assert(!LJ_SOFTFP); return PPCI_STFD;
++ case IRT_FLOAT: if (!LJ_SOFTFP) return PPCI_STFS;
+ default: return PPCI_STW;
+ }
+ }
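[Editor's note] Note the deliberate fallthrough in asm_fxloadins/asm_fxstoreins above: under LJ_SOFTFP a single-precision float is just its 32-bit bit pattern in a GPR, so the IRT_FLOAT cases drop through to the plain LWZ/STW default. A small sketch of that equivalence using standard C type punning via memcpy:

    #include <stdint.h>
    #include <string.h>

    /* What a soft-float LWZ/STW of an IRT_FLOAT value amounts to. */
    static uint32_t float_to_bits(float f)
    {
      uint32_t u;
      memcpy(&u, &f, sizeof u);   /* the GPR holds the raw IEEE-754 pattern */
      return u;
    }

    static float bits_to_float(uint32_t u)
    {
      float f;
      memcpy(&f, &u, sizeof f);
      return f;
    }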
+@@ -839,7 +915,8 @@ static void asm_fstore(ASMState *as, IRI
+
+ static void asm_xload(ASMState *as, IRIns *ir)
+ {
+- Reg dest = ra_dest(as, ir, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
++ Reg dest = ra_dest(as, ir,
++ (!LJ_SOFTFP && irt_isfp(ir->t)) ? RSET_FPR : RSET_GPR);
+ lua_assert(!(ir->op2 & IRXLOAD_UNALIGNED));
+ if (irt_isi8(ir->t))
+ emit_as(as, PPCI_EXTSB, dest, dest);
+@@ -857,7 +934,8 @@ static void asm_xstore_(ASMState *as, IR
+ Reg src = ra_alloc1(as, irb->op1, RSET_GPR);
+ asm_fusexrefx(as, PPCI_STWBRX, src, ir->op1, rset_exclude(RSET_GPR, src));
+ } else {
+- Reg src = ra_alloc1(as, ir->op2, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
++ Reg src = ra_alloc1(as, ir->op2,
++ (!LJ_SOFTFP && irt_isfp(ir->t)) ? RSET_FPR : RSET_GPR);
+ asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1,
+ rset_exclude(RSET_GPR, src), ofs);
+ }
+@@ -871,10 +949,19 @@ static void asm_ahuvload(ASMState *as, I
+ Reg dest = RID_NONE, type = RID_TMP, tmp = RID_TMP, idx;
+ RegSet allow = RSET_GPR;
+ int32_t ofs = AHUREF_LSX;
++ if (LJ_SOFTFP && (ir+1)->o == IR_HIOP) {
++ t.irt = IRT_NUM;
++ if (ra_used(ir+1)) {
++ type = ra_dest(as, ir+1, allow);
++ rset_clear(allow, type);
++ }
++ ofs = 0;
++ }
+ if (ra_used(ir)) {
+- lua_assert(irt_isnum(t) || irt_isint(t) || irt_isaddr(t));
+- if (!irt_isnum(t)) ofs = 0;
+- dest = ra_dest(as, ir, irt_isnum(t) ? RSET_FPR : RSET_GPR);
++ lua_assert((LJ_SOFTFP ? 0 : irt_isnum(ir->t)) ||
++ irt_isint(ir->t) || irt_isaddr(ir->t));
++ if (LJ_SOFTFP || !irt_isnum(t)) ofs = 0;
++ dest = ra_dest(as, ir, (!LJ_SOFTFP && irt_isnum(t)) ? RSET_FPR : allow);
+ rset_clear(allow, dest);
+ }
+ idx = asm_fuseahuref(as, ir->op1, &ofs, allow);
+@@ -883,12 +970,13 @@ static void asm_ahuvload(ASMState *as, I
+ asm_guardcc(as, CC_GE);
+ emit_ab(as, PPCI_CMPLW, type, tisnum);
+ if (ra_hasreg(dest)) {
+- if (ofs == AHUREF_LSX) {
++ if (!LJ_SOFTFP && ofs == AHUREF_LSX) {
+ tmp = ra_scratch(as, rset_exclude(rset_exclude(RSET_GPR,
+ (idx&255)), (idx>>8)));
+ emit_fab(as, PPCI_LFDX, dest, (idx&255), tmp);
+ } else {
+- emit_fai(as, PPCI_LFD, dest, idx, ofs);
++ emit_fai(as, LJ_SOFTFP ? PPCI_LWZ : PPCI_LFD, dest, idx,
++ ofs+4*LJ_SOFTFP);
+ }
+ }
+ } else {
+@@ -911,7 +999,7 @@ static void asm_ahustore(ASMState *as, I
+ int32_t ofs = AHUREF_LSX;
+ if (ir->r == RID_SINK)
+ return;
+- if (irt_isnum(ir->t)) {
++ if (!LJ_SOFTFP && irt_isnum(ir->t)) {
+ src = ra_alloc1(as, ir->op2, RSET_FPR);
+ } else {
+ if (!irt_ispri(ir->t)) {
+@@ -919,11 +1007,14 @@ static void asm_ahustore(ASMState *as, I
+ rset_clear(allow, src);
+ ofs = 0;
+ }
+- type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
++ if (LJ_SOFTFP && (ir+1)->o == IR_HIOP)
++ type = ra_alloc1(as, (ir+1)->op2, allow);
++ else
++ type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
+ rset_clear(allow, type);
+ }
+ idx = asm_fuseahuref(as, ir->op1, &ofs, allow);
+- if (irt_isnum(ir->t)) {
++ if (!LJ_SOFTFP && irt_isnum(ir->t)) {
+ if (ofs == AHUREF_LSX) {
+ emit_fab(as, PPCI_STFDX, src, (idx&255), RID_TMP);
+ emit_slwi(as, RID_TMP, (idx>>8), 3);
+@@ -948,21 +1039,33 @@ static void asm_sload(ASMState *as, IRIn
+ IRType1 t = ir->t;
+ Reg dest = RID_NONE, type = RID_NONE, base;
+ RegSet allow = RSET_GPR;
++ int hiop = (LJ_SOFTFP && (ir+1)->o == IR_HIOP);
++ if (hiop)
++ t.irt = IRT_NUM;
+ lua_assert(!(ir->op2 & IRSLOAD_PARENT)); /* Handled by asm_head_side(). */
+- lua_assert(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK));
++ lua_assert(irt_isguard(ir->t) || !(ir->op2 & IRSLOAD_TYPECHECK));
+ lua_assert(LJ_DUALNUM ||
+ !irt_isint(t) || (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME)));
++#if LJ_SOFTFP
++ lua_assert(!(ir->op2 & IRSLOAD_CONVERT)); /* Handled by LJ_SOFTFP SPLIT. */
++ if (hiop && ra_used(ir+1)) {
++ type = ra_dest(as, ir+1, allow);
++ rset_clear(allow, type);
++ }
++#else
+ if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) {
+ dest = ra_scratch(as, RSET_FPR);
+ asm_tointg(as, ir, dest);
+ t.irt = IRT_NUM; /* Continue with a regular number type check. */
+- } else if (ra_used(ir)) {
++ } else
++#endif
++ if (ra_used(ir)) {
+ lua_assert(irt_isnum(t) || irt_isint(t) || irt_isaddr(t));
+- dest = ra_dest(as, ir, irt_isnum(t) ? RSET_FPR : RSET_GPR);
++ dest = ra_dest(as, ir, (!LJ_SOFTFP && irt_isnum(t)) ? RSET_FPR : allow);
+ rset_clear(allow, dest);
+ base = ra_alloc1(as, REF_BASE, allow);
+ rset_clear(allow, base);
+- if ((ir->op2 & IRSLOAD_CONVERT)) {
++ if (!LJ_SOFTFP && (ir->op2 & IRSLOAD_CONVERT)) {
+ if (irt_isint(t)) {
+ emit_tai(as, PPCI_LWZ, dest, RID_SP, SPOFS_TMPLO);
+ dest = ra_scratch(as, RSET_FPR);
+@@ -994,10 +1097,13 @@ dotypecheck:
+ if ((ir->op2 & IRSLOAD_TYPECHECK)) {
+ Reg tisnum = ra_allock(as, (int32_t)LJ_TISNUM, allow);
+ asm_guardcc(as, CC_GE);
+- emit_ab(as, PPCI_CMPLW, RID_TMP, tisnum);
++#if !LJ_SOFTFP
+ type = RID_TMP;
++#endif
++ emit_ab(as, PPCI_CMPLW, type, tisnum);
+ }
+- if (ra_hasreg(dest)) emit_fai(as, PPCI_LFD, dest, base, ofs-4);
++ if (ra_hasreg(dest)) emit_fai(as, LJ_SOFTFP ? PPCI_LWZ : PPCI_LFD, dest,
++ base, ofs-(LJ_SOFTFP?0:4));
+ } else {
+ if ((ir->op2 & IRSLOAD_TYPECHECK)) {
+ asm_guardcc(as, CC_NE);
+@@ -1119,6 +1225,7 @@ static void asm_obar(ASMState *as, IRIns
+
+ /* -- Arithmetic and logic operations ------------------------------------- */
+
++#if !LJ_SOFTFP
+ static void asm_fparith(ASMState *as, IRIns *ir, PPCIns pi)
+ {
+ Reg dest = ra_dest(as, ir, RSET_FPR);
+@@ -1146,13 +1253,17 @@ static void asm_fpmath(ASMState *as, IRI
+ else
+ asm_callid(as, ir, IRCALL_lj_vm_floor + ir->op2);
+ }
++#endif
+
+ static void asm_add(ASMState *as, IRIns *ir)
+ {
++#if !LJ_SOFTFP
+ if (irt_isnum(ir->t)) {
+ if (!asm_fusemadd(as, ir, PPCI_FMADD, PPCI_FMADD))
+ asm_fparith(as, ir, PPCI_FADD);
+- } else {
++ } else
++#endif
++ {
+ Reg dest = ra_dest(as, ir, RSET_GPR);
+ Reg right, left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
+ PPCIns pi;
+@@ -1191,10 +1302,13 @@ static void asm_add(ASMState *as, IRIns
+
+ static void asm_sub(ASMState *as, IRIns *ir)
+ {
++#if !LJ_SOFTFP
+ if (irt_isnum(ir->t)) {
+ if (!asm_fusemadd(as, ir, PPCI_FMSUB, PPCI_FNMSUB))
+ asm_fparith(as, ir, PPCI_FSUB);
+- } else {
++ } else
++#endif
++ {
+ PPCIns pi = PPCI_SUBF;
+ Reg dest = ra_dest(as, ir, RSET_GPR);
+ Reg left, right;
+@@ -1220,9 +1334,12 @@ static void asm_sub(ASMState *as, IRIns
+
+ static void asm_mul(ASMState *as, IRIns *ir)
+ {
++#if !LJ_SOFTFP
+ if (irt_isnum(ir->t)) {
+ asm_fparith(as, ir, PPCI_FMUL);
+- } else {
++ } else
++#endif
++ {
+ PPCIns pi = PPCI_MULLW;
+ Reg dest = ra_dest(as, ir, RSET_GPR);
+ Reg right, left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
+@@ -1250,9 +1367,12 @@ static void asm_mul(ASMState *as, IRIns
+
+ static void asm_neg(ASMState *as, IRIns *ir)
+ {
++#if !LJ_SOFTFP
+ if (irt_isnum(ir->t)) {
+ asm_fpunary(as, ir, PPCI_FNEG);
+- } else {
++ } else
++#endif
++ {
+ Reg dest, left;
+ PPCIns pi = PPCI_NEG;
+ if (as->flagmcp == as->mcp) {
+@@ -1563,9 +1683,40 @@ static void asm_bitshift(ASMState *as, I
+ PPCI_RLWINM|PPCF_MB(0)|PPCF_ME(31))
+ #define asm_bror(as, ir) lua_assert(0)
+
++#if LJ_SOFTFP
++static void asm_sfpmin_max(ASMState *as, IRIns *ir)
++{
++ CCallInfo ci = lj_ir_callinfo[IRCALL_softfp_cmp];
++ IRRef args[4];
++ MCLabel l_right, l_end;
++ Reg desthi = ra_dest(as, ir, RSET_GPR), destlo = ra_dest(as, ir+1, RSET_GPR);
++ Reg righthi, lefthi = ra_alloc2(as, ir, RSET_GPR);
++ Reg rightlo, leftlo = ra_alloc2(as, ir+1, RSET_GPR);
++ PPCCC cond = (IROp)ir->o == IR_MIN ? CC_EQ : CC_NE;
++ righthi = (lefthi >> 8); lefthi &= 255;
++ rightlo = (leftlo >> 8); leftlo &= 255;
++ args[0^LJ_BE] = ir->op1; args[1^LJ_BE] = (ir+1)->op1;
++ args[2^LJ_BE] = ir->op2; args[3^LJ_BE] = (ir+1)->op2;
++ l_end = emit_label(as);
++ if (desthi != righthi) emit_mr(as, desthi, righthi);
++ if (destlo != rightlo) emit_mr(as, destlo, rightlo);
++ l_right = emit_label(as);
++ if (l_end != l_right) emit_jmp(as, l_end);
++ if (desthi != lefthi) emit_mr(as, desthi, lefthi);
++ if (destlo != leftlo) emit_mr(as, destlo, leftlo);
++ if (l_right == as->mcp+1) {
++ cond ^= 4; l_right = l_end; ++as->mcp;
++ }
++ emit_condbranch(as, PPCI_BC, cond, l_right);
++ ra_evictset(as, RSET_SCRATCH);
++ emit_cmpi(as, RID_RET, 1);
++ asm_gencall(as, &ci, args);
++}
++#endif
++
+ static void asm_min_max(ASMState *as, IRIns *ir, int ismax)
+ {
+- if (irt_isnum(ir->t)) {
++ if (!LJ_SOFTFP && irt_isnum(ir->t)) {
+ Reg dest = ra_dest(as, ir, RSET_FPR);
+ Reg tmp = dest;
+ Reg right, left = ra_alloc2(as, ir, RSET_FPR);
+@@ -1653,7 +1804,7 @@ static void asm_intcomp_(ASMState *as, I
+ static void asm_comp(ASMState *as, IRIns *ir)
+ {
+ PPCCC cc = asm_compmap[ir->o];
+- if (irt_isnum(ir->t)) {
++ if (!LJ_SOFTFP && irt_isnum(ir->t)) {
+ Reg right, left = ra_alloc2(as, ir, RSET_FPR);
+ right = (left >> 8); left &= 255;
+ asm_guardcc(as, (cc >> 4));
+@@ -1674,6 +1825,44 @@ static void asm_comp(ASMState *as, IRIns
+
+ #define asm_equal(as, ir) asm_comp(as, ir)
+
++#if LJ_SOFTFP
++/* SFP comparisons. */
++static void asm_sfpcomp(ASMState *as, IRIns *ir)
++{
++ const CCallInfo *ci = &lj_ir_callinfo[IRCALL_softfp_cmp];
++ RegSet drop = RSET_SCRATCH;
++ Reg r;
++ IRRef args[4];
++ args[0^LJ_BE] = ir->op1; args[1^LJ_BE] = (ir+1)->op1;
++ args[2^LJ_BE] = ir->op2; args[3^LJ_BE] = (ir+1)->op2;
++
++ for (r = REGARG_FIRSTGPR; r <= REGARG_FIRSTGPR+3; r++) {
++ if (!rset_test(as->freeset, r) &&
++ regcost_ref(as->cost[r]) == args[r-REGARG_FIRSTGPR])
++ rset_clear(drop, r);
++ }
++ ra_evictset(as, drop);
++ asm_setupresult(as, ir, ci);
++ switch ((IROp)ir->o) {
++ case IR_ULT:
++ asm_guardcc(as, CC_EQ);
++ emit_ai(as, PPCI_CMPWI, RID_RET, 0);
++ case IR_ULE:
++ asm_guardcc(as, CC_EQ);
++ emit_ai(as, PPCI_CMPWI, RID_RET, 1);
++ break;
++ case IR_GE: case IR_GT:
++ asm_guardcc(as, CC_EQ);
++ emit_ai(as, PPCI_CMPWI, RID_RET, 2);
++ default:
++ asm_guardcc(as, (asm_compmap[ir->o] & 0xf));
++ emit_ai(as, PPCI_CMPWI, RID_RET, 0);
++ break;
++ }
++ asm_gencall(as, ci, args);
++}
++#endif
++
+ #if LJ_HASFFI
+ /* 64 bit integer comparisons. */
+ static void asm_comp64(ASMState *as, IRIns *ir)
+@@ -1703,19 +1892,36 @@ static void asm_comp64(ASMState *as, IRI
+ /* Hiword op of a split 64 bit op. Previous op must be the loword op. */
+ static void asm_hiop(ASMState *as, IRIns *ir)
+ {
+-#if LJ_HASFFI
++#if LJ_HASFFI || LJ_SOFTFP
+ /* HIOP is marked as a store because it needs its own DCE logic. */
+ int uselo = ra_used(ir-1), usehi = ra_used(ir); /* Loword/hiword used? */
+ if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
+ if ((ir-1)->o == IR_CONV) { /* Conversions to/from 64 bit. */
+ as->curins--; /* Always skip the CONV. */
++#if LJ_HASFFI && !LJ_SOFTFP
+ if (usehi || uselo)
+ asm_conv64(as, ir);
+ return;
++#endif
+ } else if ((ir-1)->o <= IR_NE) { /* 64 bit integer comparisons. ORDER IR. */
+ as->curins--; /* Always skip the loword comparison. */
++#if LJ_SOFTFP
++ if (!irt_isint(ir->t)) {
++ asm_sfpcomp(as, ir-1);
++ return;
++ }
++#endif
++#if LJ_HASFFI
+ asm_comp64(as, ir);
++#endif
++ return;
++#if LJ_SOFTFP
++ } else if ((ir-1)->o == IR_MIN || (ir-1)->o == IR_MAX) {
++ as->curins--; /* Always skip the loword min/max. */
++ if (uselo || usehi)
++ asm_sfpmin_max(as, ir-1);
+ return;
++#endif
+ } else if ((ir-1)->o == IR_XSTORE) {
+ as->curins--; /* Handle both stores here. */
+ if ((ir-1)->r != RID_SINK) {
+@@ -1726,14 +1932,27 @@ static void asm_hiop(ASMState *as, IRIns
+ }
+ if (!usehi) return; /* Skip unused hiword op for all remaining ops. */
+ switch ((ir-1)->o) {
++#if LJ_HASFFI
+ case IR_ADD: as->curins--; asm_add64(as, ir); break;
+ case IR_SUB: as->curins--; asm_sub64(as, ir); break;
+ case IR_NEG: as->curins--; asm_neg64(as, ir); break;
++#endif
++#if LJ_SOFTFP
++ case IR_SLOAD: case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: case IR_VLOAD:
++ case IR_STRTO:
++ if (!uselo)
++ ra_allocref(as, ir->op1, RSET_GPR); /* Mark lo op as used. */
++ break;
++#endif
+ case IR_CALLN:
++ case IR_CALLS:
+ case IR_CALLXS:
+ if (!uselo)
+ ra_allocref(as, ir->op1, RID2RSET(RID_RETLO)); /* Mark lo op as used. */
+ break;
++#if LJ_SOFTFP
++ case IR_ASTORE: case IR_HSTORE: case IR_USTORE: case IR_TOSTR:
++#endif
+ case IR_CNEWI:
+ /* Nothing to do here. Handled by lo op itself. */
+ break;
+@@ -1797,8 +2016,19 @@ static void asm_stack_restore(ASMState *
+ if ((sn & SNAP_NORESTORE))
+ continue;
+ if (irt_isnum(ir->t)) {
++#if LJ_SOFTFP
++ Reg tmp;
++ RegSet allow = rset_exclude(RSET_GPR, RID_BASE);
++ lua_assert(irref_isk(ref)); /* LJ_SOFTFP: must be a number constant. */
++ tmp = ra_allock(as, (int32_t)ir_knum(ir)->u32.lo, allow);
++ emit_tai(as, PPCI_STW, tmp, RID_BASE, ofs+(LJ_BE?4:0));
++ if (rset_test(as->freeset, tmp+1)) allow = RID2RSET(tmp+1);
++ tmp = ra_allock(as, (int32_t)ir_knum(ir)->u32.hi, allow);
++ emit_tai(as, PPCI_STW, tmp, RID_BASE, ofs+(LJ_BE?0:4));
++#else
+ Reg src = ra_alloc1(as, ref, RSET_FPR);
+ emit_fai(as, PPCI_STFD, src, RID_BASE, ofs);
++#endif
+ } else {
+ Reg type;
+ RegSet allow = rset_exclude(RSET_GPR, RID_BASE);
+@@ -1811,6 +2041,10 @@ static void asm_stack_restore(ASMState *
+ if ((sn & (SNAP_CONT|SNAP_FRAME))) {
+ if (s == 0) continue; /* Do not overwrite link to previous frame. */
+ type = ra_allock(as, (int32_t)(*flinks--), allow);
++#if LJ_SOFTFP
++ } else if ((sn & SNAP_SOFTFPNUM)) {
++ type = ra_alloc1(as, ref+1, rset_exclude(RSET_GPR, RID_BASE));
++#endif
+ } else {
+ type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
+ }
+@@ -1947,14 +2181,15 @@ static Reg asm_setup_call_slots(ASMState
+ int nslots = 2, ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;
+ asm_collectargs(as, ir, ci, args);
+ for (i = 0; i < nargs; i++)
+- if (args[i] && irt_isfp(IR(args[i])->t)) {
++ if (!LJ_SOFTFP && args[i] && irt_isfp(IR(args[i])->t)) {
+ if (nfpr > 0) nfpr--; else nslots = (nslots+3) & ~1;
+ } else {
+ if (ngpr > 0) ngpr--; else nslots++;
+ }
+ if (nslots > as->evenspill) /* Leave room for args in stack slots. */
+ as->evenspill = nslots;
+- return irt_isfp(ir->t) ? REGSP_HINT(RID_FPRET) : REGSP_HINT(RID_RET);
++ return (!LJ_SOFTFP && irt_isfp(ir->t)) ? REGSP_HINT(RID_FPRET) :
++ REGSP_HINT(RID_RET);
+ }
+
+ static void asm_setup_target(ASMState *as)