From: Rosen Penev Date: Tue, 21 Jun 2022 18:52:36 +0000 (-0700) Subject: luajit: backport softfloat ppc support X-Git-Url: http://git.cdn.openwrt.org/?a=commitdiff_plain;h=7be6cc19e0b5faec6c555ea95d3d6b34394fdd78;p=feed%2Fpackages.git luajit: backport softfloat ppc support Signed-off-by: Rosen Penev (cherry picked from commit 24c0007ea2561611776e50c8876a7b040ffd6fdc) --- diff --git a/lang/luajit/Makefile b/lang/luajit/Makefile index 72e73beff7..20f0b9fa3b 100644 --- a/lang/luajit/Makefile +++ b/lang/luajit/Makefile @@ -2,7 +2,7 @@ include $(TOPDIR)/rules.mk PKG_NAME:=luajit PKG_VERSION:=2.1.0-beta3 -PKG_RELEASE:=6 +PKG_RELEASE:=7 PKG_SOURCE:=LuaJIT-$(PKG_VERSION).tar.gz PKG_SOURCE_URL:=https://luajit.org/download @@ -24,7 +24,7 @@ define Package/luajit CATEGORY:=Languages TITLE:=LuaJIT URL:=https://www.luajit.org - DEPENDS:=@(i386||x86_64||arm||armeb||aarch64||(powerpc&&HAS_FPU)||mips||mipsel||mips64) + DEPENDS:=@(i386||x86_64||arm||armeb||aarch64||powerpc||mips||mipsel||mips64) endef define Package/luajit/description diff --git a/lang/luajit/patches/040-softfloat-ppc.patch b/lang/luajit/patches/040-softfloat-ppc.patch new file mode 100644 index 0000000000..c424b9e687 --- /dev/null +++ b/lang/luajit/patches/040-softfloat-ppc.patch @@ -0,0 +1,2742 @@ +From fd37da0d586c331b0008fbfd653a9659344fe76f Mon Sep 17 00:00:00 2001 +From: Mike Pall +Date: Wed, 26 Jul 2017 09:52:19 +0200 +Subject: [PATCH] PPC: Add soft-float support to interpreter. + +Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. +Sponsored by Cisco Systems, Inc. 
+--- + src/host/buildvm_asm.c | 2 +- + src/lj_arch.h | 29 +- + src/lj_ccall.c | 38 +- + src/lj_ccall.h | 4 +- + src/lj_ccallback.c | 30 +- + src/lj_frame.h | 2 +- + src/lj_ircall.h | 2 +- + src/vm_ppc.dasc | 1249 +++++++++++++++++++++++++++++++++------- + 8 files changed, 1101 insertions(+), 255 deletions(-) + +--- a/src/host/buildvm_asm.c ++++ b/src/host/buildvm_asm.c +@@ -338,7 +338,7 @@ void emit_asm(BuildCtx *ctx) + #if !(LJ_TARGET_PS3 || LJ_TARGET_PSVITA) + fprintf(ctx->fp, "\t.section .note.GNU-stack,\"\"," ELFASM_PX "progbits\n"); + #endif +-#if LJ_TARGET_PPC && !LJ_TARGET_PS3 ++#if LJ_TARGET_PPC && !LJ_TARGET_PS3 && !LJ_ABI_SOFTFP + /* Hard-float ABI. */ + fprintf(ctx->fp, "\t.gnu_attribute 4, 1\n"); + #endif +--- a/src/lj_arch.h ++++ b/src/lj_arch.h +@@ -254,6 +254,29 @@ + #else + #define LJ_ARCH_BITS 32 + #define LJ_ARCH_NAME "ppc" ++ ++#if !defined(LJ_ARCH_HASFPU) ++#if defined(_SOFT_FLOAT) || defined(_SOFT_DOUBLE) ++#define LJ_ARCH_HASFPU 0 ++#else ++#define LJ_ARCH_HASFPU 1 ++#endif ++#endif ++ ++#if !defined(LJ_ABI_SOFTFP) ++#if defined(_SOFT_FLOAT) || defined(_SOFT_DOUBLE) ++#define LJ_ABI_SOFTFP 1 ++#else ++#define LJ_ABI_SOFTFP 0 ++#endif ++#endif ++#endif ++ ++#if LJ_ABI_SOFTFP ++#define LJ_ARCH_NOJIT 1 /* NYI */ ++#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL ++#else ++#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL_SINGLE + #endif + + #define LJ_TARGET_PPC 1 +@@ -262,7 +285,6 @@ + #define LJ_TARGET_MASKSHIFT 0 + #define LJ_TARGET_MASKROT 1 + #define LJ_TARGET_UNIFYROT 1 /* Want only IR_BROL. 
*/ +-#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL_SINGLE + + #if LJ_TARGET_CONSOLE + #define LJ_ARCH_PPC32ON64 1 +@@ -415,16 +437,13 @@ + #error "No support for ILP32 model on ARM64" + #endif + #elif LJ_TARGET_PPC +-#if defined(_SOFT_FLOAT) || defined(_SOFT_DOUBLE) +-#error "No support for PowerPC CPUs without double-precision FPU" +-#endif + #if !LJ_ARCH_PPC64 && LJ_ARCH_ENDIAN == LUAJIT_LE + #error "No support for little-endian PPC32" + #endif + #if LJ_ARCH_PPC64 + #error "No support for PowerPC 64 bit mode (yet)" + #endif +-#ifdef __NO_FPRS__ ++#if defined(__NO_FPRS__) && !defined(_SOFT_FLOAT) + #error "No support for PPC/e500 anymore (use LuaJIT 2.0)" + #endif + #elif LJ_TARGET_MIPS32 +--- a/src/lj_ccall.c ++++ b/src/lj_ccall.c +@@ -387,6 +387,24 @@ + #define CCALL_HANDLE_COMPLEXARG \ + /* Pass complex by value in 2 or 4 GPRs. */ + ++#define CCALL_HANDLE_GPR \ ++ /* Try to pass argument in GPRs. */ \ ++ if (n > 1) { \ ++ lua_assert(n == 2 || n == 4); /* int64_t or complex (float). */ \ ++ if (ctype_isinteger(d->info) || ctype_isfp(d->info)) \ ++ ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \ ++ else if (ngpr + n > maxgpr) \ ++ ngpr = maxgpr; /* Prevent reordering. */ \ ++ } \ ++ if (ngpr + n <= maxgpr) { \ ++ dp = &cc->gpr[ngpr]; \ ++ ngpr += n; \ ++ goto done; \ ++ } \ ++ ++#if LJ_ABI_SOFTFP ++#define CCALL_HANDLE_REGARG CCALL_HANDLE_GPR ++#else + #define CCALL_HANDLE_REGARG \ + if (isfp) { /* Try to pass argument in FPRs. */ \ + if (nfpr + 1 <= CCALL_NARG_FPR) { \ +@@ -395,24 +413,16 @@ + d = ctype_get(cts, CTID_DOUBLE); /* FPRs always hold doubles. */ \ + goto done; \ + } \ +- } else { /* Try to pass argument in GPRs. */ \ +- if (n > 1) { \ +- lua_assert(n == 2 || n == 4); /* int64_t or complex (float). */ \ +- if (ctype_isinteger(d->info)) \ +- ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \ +- else if (ngpr + n > maxgpr) \ +- ngpr = maxgpr; /* Prevent reordering. 
*/ \ +- } \ +- if (ngpr + n <= maxgpr) { \ +- dp = &cc->gpr[ngpr]; \ +- ngpr += n; \ +- goto done; \ +- } \ ++ } else { \ ++ CCALL_HANDLE_GPR \ + } ++#endif + ++#if !LJ_ABI_SOFTFP + #define CCALL_HANDLE_RET \ + if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ + ctr = ctype_get(cts, CTID_DOUBLE); /* FPRs always hold doubles. */ ++#endif + + #elif LJ_TARGET_MIPS32 + /* -- MIPS o32 calling conventions ---------------------------------------- */ +@@ -1080,7 +1090,7 @@ static int ccall_set_args(lua_State *L, + } + if (fid) lj_err_caller(L, LJ_ERR_FFI_NUMARG); /* Too few arguments. */ + +-#if LJ_TARGET_X64 || LJ_TARGET_PPC ++#if LJ_TARGET_X64 || (LJ_TARGET_PPC && !LJ_ABI_SOFTFP) + cc->nfpr = nfpr; /* Required for vararg functions. */ + #endif + cc->nsp = nsp; +--- a/src/lj_ccall.h ++++ b/src/lj_ccall.h +@@ -86,9 +86,9 @@ typedef union FPRArg { + #elif LJ_TARGET_PPC + + #define CCALL_NARG_GPR 8 +-#define CCALL_NARG_FPR 8 ++#define CCALL_NARG_FPR (LJ_ABI_SOFTFP ? 0 : 8) + #define CCALL_NRET_GPR 4 /* For complex double. */ +-#define CCALL_NRET_FPR 1 ++#define CCALL_NRET_FPR (LJ_ABI_SOFTFP ? 0 : 1) + #define CCALL_SPS_EXTRA 4 + #define CCALL_SPS_FREE 0 + +--- a/src/lj_ccallback.c ++++ b/src/lj_ccallback.c +@@ -419,6 +419,23 @@ void lj_ccallback_mcode_free(CTState *ct + + #elif LJ_TARGET_PPC + ++#define CALLBACK_HANDLE_GPR \ ++ if (n > 1) { \ ++ lua_assert(((LJ_ABI_SOFTFP && ctype_isnum(cta->info)) || /* double. */ \ ++ ctype_isinteger(cta->info)) && n == 2); /* int64_t. */ \ ++ ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \ ++ } \ ++ if (ngpr + n <= maxgpr) { \ ++ sp = &cts->cb.gpr[ngpr]; \ ++ ngpr += n; \ ++ goto done; \ ++ } ++ ++#if LJ_ABI_SOFTFP ++#define CALLBACK_HANDLE_REGARG \ ++ CALLBACK_HANDLE_GPR \ ++ UNUSED(isfp); ++#else + #define CALLBACK_HANDLE_REGARG \ + if (isfp) { \ + if (nfpr + 1 <= CCALL_NARG_FPR) { \ +@@ -427,20 +444,15 @@ void lj_ccallback_mcode_free(CTState *ct + goto done; \ + } \ + } else { /* Try to pass argument in GPRs. 
*/ \ +- if (n > 1) { \ +- lua_assert(ctype_isinteger(cta->info) && n == 2); /* int64_t. */ \ +- ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \ +- } \ +- if (ngpr + n <= maxgpr) { \ +- sp = &cts->cb.gpr[ngpr]; \ +- ngpr += n; \ +- goto done; \ +- } \ ++ CALLBACK_HANDLE_GPR \ + } ++#endif + ++#if !LJ_ABI_SOFTFP + #define CALLBACK_HANDLE_RET \ + if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ + *(double *)dp = *(float *)dp; /* FPRs always hold doubles. */ ++#endif + + #elif LJ_TARGET_MIPS32 + +--- a/src/lj_frame.h ++++ b/src/lj_frame.h +@@ -226,7 +226,7 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CAL + #define CFRAME_OFS_L 36 + #define CFRAME_OFS_PC 32 + #define CFRAME_OFS_MULTRES 28 +-#define CFRAME_SIZE 272 ++#define CFRAME_SIZE (LJ_ARCH_HASFPU ? 272 : 128) + #define CFRAME_SHIFT_MULTRES 3 + #endif + #elif LJ_TARGET_MIPS32 +--- a/src/lj_ircall.h ++++ b/src/lj_ircall.h +@@ -272,7 +272,7 @@ LJ_DATA const CCallInfo lj_ir_callinfo[I + #define fp64_f2l __aeabi_f2lz + #define fp64_f2ul __aeabi_f2ulz + #endif +-#elif LJ_TARGET_MIPS ++#elif LJ_TARGET_MIPS || LJ_TARGET_PPC + #define softfp_add __adddf3 + #define softfp_sub __subdf3 + #define softfp_mul __muldf3 +--- a/src/vm_ppc.dasc ++++ b/src/vm_ppc.dasc +@@ -103,6 +103,18 @@ + |// Fixed register assignments for the interpreter. + |// Don't use: r1 = sp, r2 and r13 = reserved (TOC, TLS or SDATA) + | ++|.macro .FPU, a, b ++|.if FPU ++| a, b ++|.endif ++|.endmacro ++| ++|.macro .FPU, a, b, c ++|.if FPU ++| a, b, c ++|.endif ++|.endmacro ++| + |// The following must be C callee-save (but BASE is often refetched). + |.define BASE, r14 // Base of current Lua stack frame. + |.define KBASE, r15 // Constants of current Lua function. +@@ -116,8 +128,10 @@ + |.define TISNUM, r22 + |.define TISNIL, r23 + |.define ZERO, r24 ++|.if FPU + |.define TOBIT, f30 // 2^52 + 2^51. + |.define TONUM, f31 // 2^52 + 2^51 + 2^31. ++|.endif + | + |// The following temporaries are not saved across C calls, except for RA. 
+ |.define RA, r20 // Callee-save. +@@ -133,6 +147,7 @@ + | + |// Saved temporaries. + |.define SAVE0, r21 ++|.define SAVE1, r25 + | + |// Calling conventions. + |.define CARG1, r3 +@@ -141,8 +156,10 @@ + |.define CARG4, r6 // Overlaps TMP3. + |.define CARG5, r7 // Overlaps INS. + | ++|.if FPU + |.define FARG1, f1 + |.define FARG2, f2 ++|.endif + | + |.define CRET1, r3 + |.define CRET2, r4 +@@ -213,10 +230,16 @@ + |.endif + |.else + | ++|.if FPU + |.define SAVE_LR, 276(sp) + |.define CFRAME_SPACE, 272 // Delta for sp. + |// Back chain for sp: 272(sp) <-- sp entering interpreter + |.define SAVE_FPR_, 128 // .. 128+18*8: 64 bit FPR saves. ++|.else ++|.define SAVE_LR, 132(sp) ++|.define CFRAME_SPACE, 128 // Delta for sp. ++|// Back chain for sp: 128(sp) <-- sp entering interpreter ++|.endif + |.define SAVE_GPR_, 56 // .. 56+18*4: 32 bit GPR saves. + |.define SAVE_CR, 52(sp) // 32 bit CR save. + |.define SAVE_ERRF, 48(sp) // 32 bit C frame info. +@@ -226,16 +249,25 @@ + |.define SAVE_PC, 32(sp) + |.define SAVE_MULTRES, 28(sp) + |.define UNUSED1, 24(sp) ++|.if FPU + |.define TMPD_LO, 20(sp) + |.define TMPD_HI, 16(sp) + |.define TONUM_LO, 12(sp) + |.define TONUM_HI, 8(sp) ++|.else ++|.define SFSAVE_4, 20(sp) ++|.define SFSAVE_3, 16(sp) ++|.define SFSAVE_2, 12(sp) ++|.define SFSAVE_1, 8(sp) ++|.endif + |// Next frame lr: 4(sp) + |// Back chain for sp: 0(sp) <-- sp while in interpreter + | ++|.if FPU + |.define TMPD_BLO, 23(sp) + |.define TMPD, TMPD_HI + |.define TONUM_D, TONUM_HI ++|.endif + | + |.endif + | +@@ -245,7 +277,7 @@ + |.else + | stw r..reg, SAVE_GPR_+(reg-14)*4(sp) + |.endif +-| stfd f..reg, SAVE_FPR_+(reg-14)*8(sp) ++| .FPU stfd f..reg, SAVE_FPR_+(reg-14)*8(sp) + |.endmacro + |.macro rest_, reg + |.if GPR64 +@@ -253,7 +285,7 @@ + |.else + | lwz r..reg, SAVE_GPR_+(reg-14)*4(sp) + |.endif +-| lfd f..reg, SAVE_FPR_+(reg-14)*8(sp) ++| .FPU lfd f..reg, SAVE_FPR_+(reg-14)*8(sp) + |.endmacro + | + |.macro saveregs +@@ -323,6 +355,7 @@ + |// Trap for 
not-yet-implemented parts. + |.macro NYI; tw 4, sp, sp; .endmacro + | ++|.if FPU + |// int/FP conversions. + |.macro tonum_i, freg, reg + | xoris reg, reg, 0x8000 +@@ -346,6 +379,7 @@ + |.macro toint, reg, freg + | toint reg, freg, freg + |.endmacro ++|.endif + | + |//----------------------------------------------------------------------- + | +@@ -533,9 +567,19 @@ static void build_subroutines(BuildCtx * + | beq >2 + |1: + | addic. TMP1, TMP1, -8 ++ |.if FPU + | lfd f0, 0(RA) ++ |.else ++ | lwz CARG1, 0(RA) ++ | lwz CARG2, 4(RA) ++ |.endif + | addi RA, RA, 8 ++ |.if FPU + | stfd f0, 0(BASE) ++ |.else ++ | stw CARG1, 0(BASE) ++ | stw CARG2, 4(BASE) ++ |.endif + | addi BASE, BASE, 8 + | bney <1 + | +@@ -613,23 +657,23 @@ static void build_subroutines(BuildCtx * + | .toc ld TOCREG, SAVE_TOC + | li TISNUM, LJ_TISNUM // Setup type comparison constants. + | lp BASE, L->base +- | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). ++ | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | lwz DISPATCH, L->glref // Setup pointer to dispatch table. + | li ZERO, 0 +- | stw TMP3, TMPD ++ | .FPU stw TMP3, TMPD + | li TMP1, LJ_TFALSE +- | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). ++ | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). + | li TISNIL, LJ_TNIL + | li_vmstate INTERP +- | lfs TOBIT, TMPD ++ | .FPU lfs TOBIT, TMPD + | lwz PC, FRAME_PC(BASE) // Fetch PC of previous frame. + | la RA, -8(BASE) // Results start at BASE-8. +- | stw TMP3, TMPD ++ | .FPU stw TMP3, TMPD + | addi DISPATCH, DISPATCH, GG_G2DISP + | stw TMP1, 0(RA) // Prepend false to error message. + | li RD, 16 // 2 results: false + error message. + | st_vmstate +- | lfs TONUM, TMPD ++ | .FPU lfs TONUM, TMPD + | b ->vm_returnc + | + |//----------------------------------------------------------------------- +@@ -690,22 +734,22 @@ static void build_subroutines(BuildCtx * + | li TISNUM, LJ_TISNUM // Setup type comparison constants. 
+ | lp TMP1, L->top + | lwz PC, FRAME_PC(BASE) +- | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). ++ | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | stb CARG3, L->status +- | stw TMP3, TMPD +- | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). +- | lfs TOBIT, TMPD ++ | .FPU stw TMP3, TMPD ++ | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). ++ | .FPU lfs TOBIT, TMPD + | sub RD, TMP1, BASE +- | stw TMP3, TMPD +- | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) ++ | .FPU stw TMP3, TMPD ++ | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) + | addi RD, RD, 8 +- | stw TMP0, TONUM_HI ++ | .FPU stw TMP0, TONUM_HI + | li_vmstate INTERP + | li ZERO, 0 + | st_vmstate + | andix. TMP0, PC, FRAME_TYPE + | mr MULTRES, RD +- | lfs TONUM, TMPD ++ | .FPU lfs TONUM, TMPD + | li TISNIL, LJ_TNIL + | beq ->BC_RET_Z + | b ->vm_return +@@ -739,19 +783,19 @@ static void build_subroutines(BuildCtx * + | lp TMP2, L->base // TMP2 = old base (used in vmeta_call). + | li TISNUM, LJ_TISNUM // Setup type comparison constants. + | lp TMP1, L->top +- | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). ++ | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | add PC, PC, BASE +- | stw TMP3, TMPD ++ | .FPU stw TMP3, TMPD + | li ZERO, 0 +- | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). +- | lfs TOBIT, TMPD ++ | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). 
++ | .FPU lfs TOBIT, TMPD + | sub PC, PC, TMP2 // PC = frame delta + frame type +- | stw TMP3, TMPD +- | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) ++ | .FPU stw TMP3, TMPD ++ | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) + | sub NARGS8:RC, TMP1, BASE +- | stw TMP0, TONUM_HI ++ | .FPU stw TMP0, TONUM_HI + | li_vmstate INTERP +- | lfs TONUM, TMPD ++ | .FPU lfs TONUM, TMPD + | li TISNIL, LJ_TNIL + | st_vmstate + | +@@ -839,15 +883,30 @@ static void build_subroutines(BuildCtx * + | lwz INS, -4(PC) + | subi CARG2, RB, 16 + | decode_RB8 SAVE0, INS ++ |.if FPU + | lfd f0, 0(RA) ++ |.else ++ | lwz TMP2, 0(RA) ++ | lwz TMP3, 4(RA) ++ |.endif + | add TMP1, BASE, SAVE0 + | stp BASE, L->base + | cmplw TMP1, CARG2 + | sub CARG3, CARG2, TMP1 + | decode_RA8 RA, INS ++ |.if FPU + | stfd f0, 0(CARG2) ++ |.else ++ | stw TMP2, 0(CARG2) ++ | stw TMP3, 4(CARG2) ++ |.endif + | bney ->BC_CAT_Z ++ |.if FPU + | stfdx f0, BASE, RA ++ |.else ++ | stwux TMP2, RA, BASE ++ | stw TMP3, 4(RA) ++ |.endif + | b ->cont_nop + | + |//-- Table indexing metamethods ----------------------------------------- +@@ -900,9 +959,19 @@ static void build_subroutines(BuildCtx * + | // Returns TValue * (finished) or NULL (metamethod). + | cmplwi CRET1, 0 + | beq >3 ++ |.if FPU + | lfd f0, 0(CRET1) ++ |.else ++ | lwz TMP0, 0(CRET1) ++ | lwz TMP1, 4(CRET1) ++ |.endif + | ins_next1 ++ |.if FPU + | stfdx f0, BASE, RA ++ |.else ++ | stwux TMP0, RA, BASE ++ | stw TMP1, 4(RA) ++ |.endif + | ins_next2 + | + |3: // Call __index metamethod. +@@ -920,7 +989,12 @@ static void build_subroutines(BuildCtx * + | // Returns cTValue * or NULL. 
+ | cmplwi CRET1, 0 + | beq >1 ++ |.if FPU + | lfd f14, 0(CRET1) ++ |.else ++ | lwz SAVE0, 0(CRET1) ++ | lwz SAVE1, 4(CRET1) ++ |.endif + | b ->BC_TGETR_Z + |1: + | stwx TISNIL, BASE, RA +@@ -975,11 +1049,21 @@ static void build_subroutines(BuildCtx * + | bl extern lj_meta_tset // (lua_State *L, TValue *o, TValue *k) + | // Returns TValue * (finished) or NULL (metamethod). + | cmplwi CRET1, 0 ++ |.if FPU + | lfdx f0, BASE, RA ++ |.else ++ | lwzux TMP2, RA, BASE ++ | lwz TMP3, 4(RA) ++ |.endif + | beq >3 + | // NOBARRIER: lj_meta_tset ensures the table is not black. + | ins_next1 ++ |.if FPU + | stfd f0, 0(CRET1) ++ |.else ++ | stw TMP2, 0(CRET1) ++ | stw TMP3, 4(CRET1) ++ |.endif + | ins_next2 + | + |3: // Call __newindex metamethod. +@@ -990,7 +1074,12 @@ static void build_subroutines(BuildCtx * + | add PC, TMP1, BASE + | lwz LFUNC:RB, FRAME_FUNC(BASE) // Guaranteed to be a function here. + | li NARGS8:RC, 24 // 3 args for func(t, k, v) ++ |.if FPU + | stfd f0, 16(BASE) // Copy value to third argument. ++ |.else ++ | stw TMP2, 16(BASE) ++ | stw TMP3, 20(BASE) ++ |.endif + | b ->vm_call_dispatch_f + | + |->vmeta_tsetr: +@@ -998,7 +1087,12 @@ static void build_subroutines(BuildCtx * + | stw PC, SAVE_PC + | bl extern lj_tab_setinth // (lua_State *L, GCtab *t, int32_t key) + | // Returns TValue *. 
++ |.if FPU + | stfd f14, 0(CRET1) ++ |.else ++ | stw SAVE0, 0(CRET1) ++ | stw SAVE1, 4(CRET1) ++ |.endif + | b ->cont_nop + | + |//-- Comparison metamethods --------------------------------------------- +@@ -1037,9 +1131,19 @@ static void build_subroutines(BuildCtx * + | + |->cont_ra: // RA = resultptr + | lwz INS, -4(PC) ++ |.if FPU + | lfd f0, 0(RA) ++ |.else ++ | lwz CARG1, 0(RA) ++ | lwz CARG2, 4(RA) ++ |.endif + | decode_RA8 TMP1, INS ++ |.if FPU + | stfdx f0, BASE, TMP1 ++ |.else ++ | stwux CARG1, TMP1, BASE ++ | stw CARG2, 4(TMP1) ++ |.endif + | b ->cont_nop + | + |->cont_condt: // RA = resultptr +@@ -1245,22 +1349,32 @@ static void build_subroutines(BuildCtx * + |.macro .ffunc_n, name + |->ff_ .. name: + | cmplwi NARGS8:RC, 8 +- | lwz CARG3, 0(BASE) ++ | lwz CARG1, 0(BASE) ++ |.if FPU + | lfd FARG1, 0(BASE) ++ |.else ++ | lwz CARG2, 4(BASE) ++ |.endif + | blt ->fff_fallback +- | checknum CARG3; bge ->fff_fallback ++ | checknum CARG1; bge ->fff_fallback + |.endmacro + | + |.macro .ffunc_nn, name + |->ff_ .. name: + | cmplwi NARGS8:RC, 16 +- | lwz CARG3, 0(BASE) ++ | lwz CARG1, 0(BASE) ++ |.if FPU + | lfd FARG1, 0(BASE) +- | lwz CARG4, 8(BASE) ++ | lwz CARG3, 8(BASE) + | lfd FARG2, 8(BASE) ++ |.else ++ | lwz CARG2, 4(BASE) ++ | lwz CARG3, 8(BASE) ++ | lwz CARG4, 12(BASE) ++ |.endif + | blt ->fff_fallback ++ | checknum CARG1; bge ->fff_fallback + | checknum CARG3; bge ->fff_fallback +- | checknum CARG4; bge ->fff_fallback + |.endmacro + | + |// Inlined GC threshold check. Caveat: uses TMP0 and TMP1. +@@ -1281,14 +1395,21 @@ static void build_subroutines(BuildCtx * + | bge cr1, ->fff_fallback + | stw CARG3, 0(RA) + | addi RD, NARGS8:RC, 8 // Compute (nresults+1)*8. ++ | addi TMP1, BASE, 8 ++ | add TMP2, RA, NARGS8:RC + | stw CARG1, 4(RA) + | beq ->fff_res // Done if exactly 1 argument. 
+- | li TMP1, 8 +- | subi RC, RC, 8 + |1: +- | cmplw TMP1, RC +- | lfdx f0, BASE, TMP1 +- | stfdx f0, RA, TMP1 ++ | cmplw TMP1, TMP2 ++ |.if FPU ++ | lfd f0, 0(TMP1) ++ | stfd f0, -8(TMP1) // Store one slot lower, same as the soft-float path ++ |.else ++ | lwz CARG1, 0(TMP1) ++ | lwz CARG2, 4(TMP1) ++ | stw CARG1, -8(TMP1) ++ | stw CARG2, -4(TMP1) ++ |.endif + | addi TMP1, TMP1, 8 + | bney <1 + | b ->fff_res +@@ -1303,8 +1424,14 @@ static void build_subroutines(BuildCtx * + | orc TMP1, TMP2, TMP0 + | addi TMP1, TMP1, ~LJ_TISNUM+1 + | slwi TMP1, TMP1, 3 ++ |.if FPU + | la TMP2, CFUNC:RB->upvalue + | lfdx FARG1, TMP2, TMP1 ++ |.else ++ | add TMP1, CFUNC:RB, TMP1 ++ | lwz CARG1, CFUNC:TMP1->upvalue[0].u32.hi ++ | lwz CARG2, CFUNC:TMP1->upvalue[0].u32.lo ++ |.endif + | b ->fff_resn + | + |//-- Base library: getters and setters --------------------------------- +@@ -1382,7 +1509,12 @@ static void build_subroutines(BuildCtx * + | mr CARG1, L + | bl extern lj_tab_get // (lua_State *L, GCtab *t, cTValue *key) + | // Returns cTValue *. ++ |.if FPU + | lfd FARG1, 0(CRET1) ++ |.else ++ | lwz CARG2, 4(CRET1) ++ | lwz CARG1, 0(CRET1) // Caveat: CARG1 == CRET1. ++ |.endif + | b ->fff_resn + | + |//-- Base library: conversions ------------------------------------------ +@@ -1391,7 +1523,11 @@ static void build_subroutines(BuildCtx * + | // Only handles the number case inline (without a base argument). + | cmplwi NARGS8:RC, 8 + | lwz CARG1, 0(BASE) ++ |.if FPU + | lfd FARG1, 0(BASE) ++ |.else ++ | lwz CARG2, 4(BASE) ++ |.endif + | bne ->fff_fallback // Exactly one argument. + | checknum CARG1; bgt ->fff_fallback + | b ->fff_resn +@@ -1442,12 +1578,23 @@ static void build_subroutines(BuildCtx * + | cmplwi CRET1, 0 + | li CARG3, LJ_TNIL + | beq ->fff_restv // End of traversal: return nil. +- | lfd f0, 8(BASE) // Copy key and value to results. + | la RA, -8(BASE) ++ |.if FPU ++ | lfd f0, 8(BASE) // Copy key and value to results. 
+ | lfd f1, 16(BASE) + | stfd f0, 0(RA) +- | li RD, (2+1)*8 + | stfd f1, 8(RA) ++ |.else ++ | lwz CARG1, 8(BASE) ++ | lwz CARG2, 12(BASE) ++ | lwz CARG3, 16(BASE) ++ | lwz CARG4, 20(BASE) ++ | stw CARG1, 0(RA) ++ | stw CARG2, 4(RA) ++ | stw CARG3, 8(RA) ++ | stw CARG4, 12(RA) ++ |.endif ++ | li RD, (2+1)*8 + | b ->fff_res + | + |.ffunc_1 pairs +@@ -1456,17 +1603,32 @@ static void build_subroutines(BuildCtx * + | bne ->fff_fallback + #if LJ_52 + | lwz TAB:TMP2, TAB:CARG1->metatable ++ |.if FPU + | lfd f0, CFUNC:RB->upvalue[0] ++ |.else ++ | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi ++ | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo ++ |.endif + | cmplwi TAB:TMP2, 0 + | la RA, -8(BASE) + | bne ->fff_fallback + #else ++ |.if FPU + | lfd f0, CFUNC:RB->upvalue[0] ++ |.else ++ | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi ++ | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo ++ |.endif + | la RA, -8(BASE) + #endif + | stw TISNIL, 8(BASE) + | li RD, (3+1)*8 ++ |.if FPU + | stfd f0, 0(RA) ++ |.else ++ | stw TMP0, 0(RA) ++ | stw TMP1, 4(RA) ++ |.endif + | b ->fff_res + | + |.ffunc ipairs_aux +@@ -1512,14 +1674,24 @@ static void build_subroutines(BuildCtx * + | stfd FARG2, 0(RA) + |.endif + | ble >2 // Not in array part? ++ |.if FPU + | lwzx TMP2, TMP1, TMP3 + | lfdx f0, TMP1, TMP3 ++ |.else ++ | lwzux TMP2, TMP1, TMP3 ++ | lwz TMP3, 4(TMP1) ++ |.endif + |1: + | checknil TMP2 + | li RD, (0+1)*8 + | beq ->fff_res // End of iteration, return 0 results. + | li RD, (2+1)*8 ++ |.if FPU + | stfd f0, 8(RA) ++ |.else ++ | stw TMP2, 8(RA) ++ | stw TMP3, 12(RA) ++ |.endif + | b ->fff_res + |2: // Check for empty hash part first. Otherwise call C function. 
+ | lwz TMP0, TAB:CARG1->hmask +@@ -1533,7 +1705,11 @@ static void build_subroutines(BuildCtx * + | li RD, (0+1)*8 + | beq ->fff_res + | lwz TMP2, 0(CRET1) ++ |.if FPU + | lfd f0, 0(CRET1) ++ |.else ++ | lwz TMP3, 4(CRET1) ++ |.endif + | b <1 + | + |.ffunc_1 ipairs +@@ -1542,12 +1718,22 @@ static void build_subroutines(BuildCtx * + | bne ->fff_fallback + #if LJ_52 + | lwz TAB:TMP2, TAB:CARG1->metatable ++ |.if FPU + | lfd f0, CFUNC:RB->upvalue[0] ++ |.else ++ | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi ++ | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo ++ |.endif + | cmplwi TAB:TMP2, 0 + | la RA, -8(BASE) + | bne ->fff_fallback + #else ++ |.if FPU + | lfd f0, CFUNC:RB->upvalue[0] ++ |.else ++ | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi ++ | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo ++ |.endif + | la RA, -8(BASE) + #endif + |.if DUALNUM +@@ -1557,7 +1743,12 @@ static void build_subroutines(BuildCtx * + |.endif + | stw ZERO, 12(BASE) + | li RD, (3+1)*8 ++ |.if FPU + | stfd f0, 0(RA) ++ |.else ++ | stw TMP0, 0(RA) ++ | stw TMP1, 4(RA) ++ |.endif + | b ->fff_res + | + |//-- Base library: catch errors ---------------------------------------- +@@ -1576,19 +1767,32 @@ static void build_subroutines(BuildCtx * + | + |.ffunc xpcall + | cmplwi NARGS8:RC, 16 +- | lwz CARG4, 8(BASE) ++ | lwz CARG3, 8(BASE) ++ |.if FPU + | lfd FARG2, 8(BASE) + | lfd FARG1, 0(BASE) ++ |.else ++ | lwz CARG1, 0(BASE) ++ | lwz CARG2, 4(BASE) ++ | lwz CARG4, 12(BASE) ++ |.endif + | blt ->fff_fallback + | lbz TMP1, DISPATCH_GL(hookmask)(DISPATCH) + | mr TMP2, BASE +- | checkfunc CARG4; bne ->fff_fallback // Traceback must be a function. ++ | checkfunc CARG3; bne ->fff_fallback // Traceback must be a function. + | la BASE, 16(BASE) + | // Remember active hook before pcall. + | rlwinm TMP1, TMP1, 32-HOOK_ACTIVE_SHIFT, 31, 31 ++ |.if FPU + | stfd FARG2, 0(TMP2) // Swap function and traceback. 
+- | subi NARGS8:RC, NARGS8:RC, 16 + | stfd FARG1, 8(TMP2) ++ |.else ++ | stw CARG3, 0(TMP2) ++ | stw CARG4, 4(TMP2) ++ | stw CARG1, 8(TMP2) ++ | stw CARG2, 12(TMP2) ++ |.endif ++ | subi NARGS8:RC, NARGS8:RC, 16 + | addi PC, TMP1, 16+FRAME_PCALL + | b ->vm_call_dispatch + | +@@ -1631,9 +1835,21 @@ static void build_subroutines(BuildCtx * + | stp BASE, L->top + |2: // Move args to coroutine. + | cmpw TMP1, NARGS8:RC ++ |.if FPU + | lfdx f0, BASE, TMP1 ++ |.else ++ | add CARG3, BASE, TMP1 ++ | lwz TMP2, 0(CARG3) ++ | lwz TMP3, 4(CARG3) ++ |.endif + | beq >3 ++ |.if FPU + | stfdx f0, CARG2, TMP1 ++ |.else ++ | add CARG3, CARG2, TMP1 ++ | stw TMP2, 0(CARG3) ++ | stw TMP3, 4(CARG3) ++ |.endif + | addi TMP1, TMP1, 8 + | b <2 + |3: +@@ -1664,8 +1880,17 @@ static void build_subroutines(BuildCtx * + | stp TMP2, L:SAVE0->top // Clear coroutine stack. + |5: // Move results from coroutine. + | cmplw TMP1, TMP3 ++ |.if FPU + | lfdx f0, TMP2, TMP1 + | stfdx f0, BASE, TMP1 ++ |.else ++ | add CARG3, TMP2, TMP1 ++ | lwz CARG1, 0(CARG3) ++ | lwz CARG2, 4(CARG3) ++ | add CARG3, BASE, TMP1 ++ | stw CARG1, 0(CARG3) ++ | stw CARG2, 4(CARG3) ++ |.endif + | addi TMP1, TMP1, 8 + | bne <5 + |6: +@@ -1690,12 +1915,22 @@ static void build_subroutines(BuildCtx * + | andix. TMP0, PC, FRAME_TYPE + | la TMP3, -8(TMP3) + | li TMP1, LJ_TFALSE ++ |.if FPU + | lfd f0, 0(TMP3) ++ |.else ++ | lwz CARG1, 0(TMP3) ++ | lwz CARG2, 4(TMP3) ++ |.endif + | stp TMP3, L:SAVE0->top // Remove error from coroutine stack. + | li RD, (2+1)*8 + | stw TMP1, -8(BASE) // Prepend false to results. + | la RA, -8(BASE) ++ |.if FPU + | stfd f0, 0(BASE) // Copy error message. ++ |.else ++ | stw CARG1, 0(BASE) // Copy error message. ++ | stw CARG2, 4(BASE) ++ |.endif + | b <7 + |.else + | mr CARG1, L +@@ -1874,7 +2109,12 @@ static void build_subroutines(BuildCtx * + | lus CARG1, 0x8000 // -(2^31). 
+ | beqy ->fff_resi + |5: ++ |.if FPU + | lfd FARG1, 0(BASE) ++ |.else ++ | lwz CARG1, 0(BASE) ++ | lwz CARG2, 4(BASE) ++ |.endif + | blex func + | b ->fff_resn + |.endmacro +@@ -1898,10 +2138,14 @@ static void build_subroutines(BuildCtx * + | + |.ffunc math_log + | cmplwi NARGS8:RC, 8 +- | lwz CARG3, 0(BASE) +- | lfd FARG1, 0(BASE) ++ | lwz CARG1, 0(BASE) + | bne ->fff_fallback // Need exactly 1 argument. +- | checknum CARG3; bge ->fff_fallback ++ | checknum CARG1; bge ->fff_fallback ++ |.if FPU ++ | lfd FARG1, 0(BASE) ++ |.else ++ | lwz CARG2, 4(BASE) ++ |.endif + | blex log + | b ->fff_resn + | +@@ -1923,17 +2167,24 @@ static void build_subroutines(BuildCtx * + |.if DUALNUM + |.ffunc math_ldexp + | cmplwi NARGS8:RC, 16 +- | lwz CARG3, 0(BASE) ++ | lwz TMP0, 0(BASE) ++ |.if FPU + | lfd FARG1, 0(BASE) +- | lwz CARG4, 8(BASE) ++ |.else ++ | lwz CARG1, 0(BASE) ++ | lwz CARG2, 4(BASE) ++ |.endif ++ | lwz TMP1, 8(BASE) + |.if GPR64 + | lwz CARG2, 12(BASE) +- |.else ++ |.elif FPU + | lwz CARG1, 12(BASE) ++ |.else ++ | lwz CARG3, 12(BASE) + |.endif + | blt ->fff_fallback +- | checknum CARG3; bge ->fff_fallback +- | checknum CARG4; bne ->fff_fallback ++ | checknum TMP0; bge ->fff_fallback ++ | checknum TMP1; bne ->fff_fallback + |.else + |.ffunc_nn math_ldexp + |.if GPR64 +@@ -1948,8 +2199,10 @@ static void build_subroutines(BuildCtx * + |.ffunc_n math_frexp + |.if GPR64 + | la CARG2, DISPATCH_GL(tmptv)(DISPATCH) +- |.else ++ |.elif FPU + | la CARG1, DISPATCH_GL(tmptv)(DISPATCH) ++ |.else ++ | la CARG3, DISPATCH_GL(tmptv)(DISPATCH) + |.endif + | lwz PC, FRAME_PC(BASE) + | blex frexp +@@ -1958,7 +2211,12 @@ static void build_subroutines(BuildCtx * + |.if not DUALNUM + | tonum_i FARG2, TMP1 + |.endif ++ |.if FPU + | stfd FARG1, 0(RA) ++ |.else ++ | stw CRET1, 0(RA) ++ | stw CRET2, 4(RA) ++ |.endif + | li RD, (2+1)*8 + |.if DUALNUM + | stw TISNUM, 8(RA) +@@ -1971,13 +2229,20 @@ static void build_subroutines(BuildCtx * + |.ffunc_n math_modf + |.if GPR64 + | la CARG2, 
-8(BASE) +- |.else ++ |.elif FPU + | la CARG1, -8(BASE) ++ |.else ++ | la CARG3, -8(BASE) + |.endif + | lwz PC, FRAME_PC(BASE) + | blex modf + | la RA, -8(BASE) ++ |.if FPU + | stfd FARG1, 0(BASE) ++ |.else ++ | stw CRET1, 0(BASE) ++ | stw CRET2, 4(BASE) ++ |.endif + | li RD, (2+1)*8 + | b ->fff_res + | +@@ -1985,13 +2250,13 @@ static void build_subroutines(BuildCtx * + |.if DUALNUM + | .ffunc_1 name + | checknum CARG3 +- | addi TMP1, BASE, 8 +- | add TMP2, BASE, NARGS8:RC ++ | addi SAVE0, BASE, 8 ++ | add SAVE1, BASE, NARGS8:RC + | bne >4 + |1: // Handle integers. +- | lwz CARG4, 0(TMP1) +- | cmplw cr1, TMP1, TMP2 +- | lwz CARG2, 4(TMP1) ++ | lwz CARG4, 0(SAVE0) ++ | cmplw cr1, SAVE0, SAVE1 ++ | lwz CARG2, 4(SAVE0) + | bge cr1, ->fff_resi + | checknum CARG4 + | xoris TMP0, CARG1, 0x8000 +@@ -2008,36 +2273,76 @@ static void build_subroutines(BuildCtx * + |.if GPR64 + | rldicl CARG1, CARG1, 0, 32 + |.endif +- | addi TMP1, TMP1, 8 ++ | addi SAVE0, SAVE0, 8 + | b <1 + |3: + | bge ->fff_fallback + | // Convert intermediate result to number and continue below. ++ |.if FPU + | tonum_i FARG1, CARG1 +- | lfd FARG2, 0(TMP1) ++ | lfd FARG2, 0(SAVE0) ++ |.else ++ | mr CARG2, CARG1 ++ | bl ->vm_sfi2d_1 ++ | lwz CARG3, 0(SAVE0) ++ | lwz CARG4, 4(SAVE0) ++ |.endif + | b >6 + |4: ++ |.if FPU + | lfd FARG1, 0(BASE) ++ |.else ++ | lwz CARG1, 0(BASE) ++ | lwz CARG2, 4(BASE) ++ |.endif + | bge ->fff_fallback + |5: // Handle numbers. 
+- | lwz CARG4, 0(TMP1) +- | cmplw cr1, TMP1, TMP2 +- | lfd FARG2, 0(TMP1) ++ | lwz CARG3, 0(SAVE0) ++ | cmplw cr1, SAVE0, SAVE1 ++ |.if FPU ++ | lfd FARG2, 0(SAVE0) ++ |.else ++ | lwz CARG4, 4(SAVE0) ++ |.endif + | bge cr1, ->fff_resn +- | checknum CARG4; bge >7 ++ | checknum CARG3; bge >7 + |6: ++ | addi SAVE0, SAVE0, 8 ++ |.if FPU + | fsub f0, FARG1, FARG2 +- | addi TMP1, TMP1, 8 + |.if ismax + | fsel FARG1, f0, FARG1, FARG2 + |.else + | fsel FARG1, f0, FARG2, FARG1 + |.endif ++ |.else ++ | stw CARG1, SFSAVE_1 ++ | stw CARG2, SFSAVE_2 ++ | stw CARG3, SFSAVE_3 ++ | stw CARG4, SFSAVE_4 ++ | blex __ledf2 ++ | cmpwi CRET1, 0 ++ |.if ismax ++ | blt >8 ++ |.else ++ | bge >8 ++ |.endif ++ | lwz CARG1, SFSAVE_1 ++ | lwz CARG2, SFSAVE_2 ++ | b <5 ++ |8: ++ | lwz CARG1, SFSAVE_3 ++ | lwz CARG2, SFSAVE_4 ++ |.endif + | b <5 + |7: // Convert integer to number and continue above. +- | lwz CARG2, 4(TMP1) ++ | lwz CARG3, 4(SAVE0) + | bne ->fff_fallback +- | tonum_i FARG2, CARG2 ++ |.if FPU ++ | tonum_i FARG2, CARG3 ++ |.else ++ | bl ->vm_sfi2d_2 ++ |.endif + | b <6 + |.else + | .ffunc_n name +@@ -2237,28 +2542,37 @@ static void build_subroutines(BuildCtx * + | + |.macro .ffunc_bit_op, name, ins + | .ffunc_bit name +- | addi TMP1, BASE, 8 +- | add TMP2, BASE, NARGS8:RC ++ | addi SAVE0, BASE, 8 ++ | add SAVE1, BASE, NARGS8:RC + |1: +- | lwz CARG4, 0(TMP1) +- | cmplw cr1, TMP1, TMP2 ++ | lwz CARG4, 0(SAVE0) ++ | cmplw cr1, SAVE0, SAVE1 + |.if DUALNUM +- | lwz CARG2, 4(TMP1) ++ | lwz CARG2, 4(SAVE0) + |.else +- | lfd FARG1, 0(TMP1) ++ | lfd FARG1, 0(SAVE0) + |.endif + | bgey cr1, ->fff_resi + | checknum CARG4 + |.if DUALNUM ++ |.if FPU + | bnel ->fff_bitop_fb + |.else ++ | beq >3 ++ | stw CARG1, SFSAVE_1 ++ | bl ->fff_bitop_fb ++ | mr CARG2, CARG1 ++ | lwz CARG1, SFSAVE_1 ++ |3: ++ |.endif ++ |.else + | fadd FARG1, FARG1, TOBIT + | bge ->fff_fallback + | stfd FARG1, TMPD + | lwz CARG2, TMPD_LO + |.endif + | ins CARG1, CARG1, CARG2 +- | addi TMP1, TMP1, 8 ++ | addi SAVE0, SAVE0, 8 
+ | b <1 + |.endmacro + | +@@ -2280,7 +2594,14 @@ static void build_subroutines(BuildCtx * + |.macro .ffunc_bit_sh, name, ins, shmod + |.if DUALNUM + | .ffunc_2 bit_..name ++ |.if FPU + | checknum CARG3; bnel ->fff_tobit_fb ++ |.else ++ | checknum CARG3; beq >1 ++ | bl ->fff_tobit_fb ++ | lwz CARG2, 12(BASE) // Conversion polluted CARG2. ++ |1: ++ |.endif + | // Note: no inline conversion from number for 2nd argument! + | checknum CARG4; bne ->fff_fallback + |.else +@@ -2317,27 +2638,77 @@ static void build_subroutines(BuildCtx * + |->fff_resn: + | lwz PC, FRAME_PC(BASE) + | la RA, -8(BASE) ++ |.if FPU + | stfd FARG1, -8(BASE) ++ |.else ++ | stw CARG1, -8(BASE) ++ | stw CARG2, -4(BASE) ++ |.endif + | b ->fff_res1 + | + |// Fallback FP number to bit conversion. + |->fff_tobit_fb: + |.if DUALNUM ++ |.if FPU + | lfd FARG1, 0(BASE) + | bgt ->fff_fallback + | fadd FARG1, FARG1, TOBIT + | stfd FARG1, TMPD + | lwz CARG1, TMPD_LO + | blr ++ |.else ++ | bgt ->fff_fallback ++ | mr CARG2, CARG1 ++ | mr CARG1, CARG3 ++ |// Modifies: CARG1, CARG2, TMP0, TMP1, TMP2. ++ |->vm_tobit: ++ | slwi TMP2, CARG1, 1 ++ | addis TMP2, TMP2, 0x0020 ++ | cmpwi TMP2, 0 ++ | bge >2 ++ | li TMP1, 0x3e0 ++ | srawi TMP2, TMP2, 21 ++ | not TMP1, TMP1 ++ | sub. TMP2, TMP1, TMP2 ++ | cmpwi cr7, CARG1, 0 ++ | blt >1 ++ | slwi TMP1, CARG1, 11 ++ | srwi TMP0, CARG2, 21 ++ | oris TMP1, TMP1, 0x8000 ++ | or TMP1, TMP1, TMP0 ++ | srw CARG1, TMP1, TMP2 ++ | bclr 4, 28 // Return if cr7[lt] == 0, no hint. ++ | neg CARG1, CARG1 ++ | blr ++ |1: ++ | addi TMP2, TMP2, 21 ++ | srw TMP1, CARG2, TMP2 ++ | slwi CARG2, CARG1, 12 ++ | subfic TMP2, TMP2, 20 ++ | slw TMP0, CARG2, TMP2 ++ | or CARG1, TMP1, TMP0 ++ | bclr 4, 28 // Return if cr7[lt] == 0, no hint. 
++ | neg CARG1, CARG1 ++ | blr ++ |2: ++ | li CARG1, 0 ++ | blr ++ |.endif + |.endif + |->fff_bitop_fb: + |.if DUALNUM +- | lfd FARG1, 0(TMP1) ++ |.if FPU ++ | lfd FARG1, 0(SAVE0) + | bgt ->fff_fallback + | fadd FARG1, FARG1, TOBIT + | stfd FARG1, TMPD + | lwz CARG2, TMPD_LO + | blr ++ |.else ++ | bgt ->fff_fallback ++ | mr CARG1, CARG4 ++ | b ->vm_tobit ++ |.endif + |.endif + | + |//----------------------------------------------------------------------- +@@ -2530,10 +2901,21 @@ static void build_subroutines(BuildCtx * + | decode_RA8 RC, INS // Call base. + | beq >2 + |1: // Move results down. ++ |.if FPU + | lfd f0, 0(RA) ++ |.else ++ | lwz CARG1, 0(RA) ++ | lwz CARG2, 4(RA) ++ |.endif + | addic. TMP1, TMP1, -8 + | addi RA, RA, 8 ++ |.if FPU + | stfdx f0, BASE, RC ++ |.else ++ | add CARG3, BASE, RC ++ | stw CARG1, 0(CARG3) ++ | stw CARG2, 4(CARG3) ++ |.endif + | addi RC, RC, 8 + | bne <1 + |2: +@@ -2586,10 +2968,12 @@ static void build_subroutines(BuildCtx * + |//----------------------------------------------------------------------- + | + |.macro savex_, a, b, c, d ++ |.if FPU + | stfd f..a, 16+a*8(sp) + | stfd f..b, 16+b*8(sp) + | stfd f..c, 16+c*8(sp) + | stfd f..d, 16+d*8(sp) ++ |.endif + |.endmacro + | + |->vm_exit_handler: +@@ -2661,16 +3045,16 @@ static void build_subroutines(BuildCtx * + | lwz KBASE, PC2PROTO(k)(TMP1) + | // Setup type comparison constants. + | li TISNUM, LJ_TISNUM +- | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). +- | stw TMP3, TMPD ++ | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). ++ | .FPU stw TMP3, TMPD + | li ZERO, 0 +- | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). +- | lfs TOBIT, TMPD +- | stw TMP3, TMPD +- | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) ++ | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). 
++ | .FPU lfs TOBIT, TMPD ++ | .FPU stw TMP3, TMPD ++ | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) + | li TISNIL, LJ_TNIL +- | stw TMP0, TONUM_HI +- | lfs TONUM, TMPD ++ | .FPU stw TMP0, TONUM_HI ++ | .FPU lfs TONUM, TMPD + | // Modified copy of ins_next which handles function header dispatch, too. + | lwz INS, 0(PC) + | addi PC, PC, 4 +@@ -2715,7 +3099,35 @@ static void build_subroutines(BuildCtx * + |//-- Math helper functions ---------------------------------------------- + |//----------------------------------------------------------------------- + | +- |// NYI: Use internal implementations of floor, ceil, trunc. ++ |// NYI: Use internal implementations of floor, ceil, trunc, sfcmp. ++ | ++ |.macro sfi2d, AHI, ALO ++ |.if not FPU ++ | mr. AHI, ALO ++ | bclr 12, 2 // Handle zero first. ++ | srawi TMP0, ALO, 31 ++ | xor TMP1, ALO, TMP0 ++ | sub TMP1, TMP1, TMP0 // Absolute value in TMP1. ++ | cntlzw AHI, TMP1 ++ | andix. TMP0, TMP0, 0x800 // Mask sign bit. ++ | slw TMP1, TMP1, AHI // Align mantissa left with leading 1. ++ | subfic AHI, AHI, 0x3ff+31-1 // Exponent -1 in AHI. ++ | slwi ALO, TMP1, 21 ++ | or AHI, AHI, TMP0 // Sign | Exponent. ++ | srwi TMP1, TMP1, 11 ++ | slwi AHI, AHI, 20 // Align left. ++ | add AHI, AHI, TMP1 // Add mantissa, increment exponent. ++ | blr ++ |.endif ++ |.endmacro ++ | ++ |// Input: CARG2. Output: CARG1, CARG2. Temporaries: TMP0, TMP1. ++ |->vm_sfi2d_1: ++ | sfi2d CARG1, CARG2 ++ | ++ |// Input: CARG4. Output: CARG3, CARG4. Temporaries: TMP0, TMP1. ++ |->vm_sfi2d_2: ++ | sfi2d CARG3, CARG4 + | + |->vm_modi: + | divwo. 
TMP0, CARG1, CARG2 +@@ -2783,21 +3195,21 @@ static void build_subroutines(BuildCtx * + | addi DISPATCH, r12, GG_G2DISP + | stw r11, CTSTATE->cb.slot + | stw r3, CTSTATE->cb.gpr[0] +- | stfd f1, CTSTATE->cb.fpr[0] ++ | .FPU stfd f1, CTSTATE->cb.fpr[0] + | stw r4, CTSTATE->cb.gpr[1] +- | stfd f2, CTSTATE->cb.fpr[1] ++ | .FPU stfd f2, CTSTATE->cb.fpr[1] + | stw r5, CTSTATE->cb.gpr[2] +- | stfd f3, CTSTATE->cb.fpr[2] ++ | .FPU stfd f3, CTSTATE->cb.fpr[2] + | stw r6, CTSTATE->cb.gpr[3] +- | stfd f4, CTSTATE->cb.fpr[3] ++ | .FPU stfd f4, CTSTATE->cb.fpr[3] + | stw r7, CTSTATE->cb.gpr[4] +- | stfd f5, CTSTATE->cb.fpr[4] ++ | .FPU stfd f5, CTSTATE->cb.fpr[4] + | stw r8, CTSTATE->cb.gpr[5] +- | stfd f6, CTSTATE->cb.fpr[5] ++ | .FPU stfd f6, CTSTATE->cb.fpr[5] + | stw r9, CTSTATE->cb.gpr[6] +- | stfd f7, CTSTATE->cb.fpr[6] ++ | .FPU stfd f7, CTSTATE->cb.fpr[6] + | stw r10, CTSTATE->cb.gpr[7] +- | stfd f8, CTSTATE->cb.fpr[7] ++ | .FPU stfd f8, CTSTATE->cb.fpr[7] + | addi TMP0, sp, CFRAME_SPACE+8 + | stw TMP0, CTSTATE->cb.stack + | mr CARG1, CTSTATE +@@ -2808,21 +3220,21 @@ static void build_subroutines(BuildCtx * + | lp BASE, L:CRET1->base + | li TISNUM, LJ_TISNUM // Setup type comparison constants. + | lp RC, L:CRET1->top +- | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). ++ | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | li ZERO, 0 + | mr L, CRET1 +- | stw TMP3, TMPD +- | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) ++ | .FPU stw TMP3, TMPD ++ | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) + | lwz LFUNC:RB, FRAME_FUNC(BASE) +- | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). +- | stw TMP0, TONUM_HI ++ | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). 
++ | .FPU stw TMP0, TONUM_HI + | li TISNIL, LJ_TNIL + | li_vmstate INTERP +- | lfs TOBIT, TMPD +- | stw TMP3, TMPD ++ | .FPU lfs TOBIT, TMPD ++ | .FPU stw TMP3, TMPD + | sub RC, RC, BASE + | st_vmstate +- | lfs TONUM, TMPD ++ | .FPU lfs TONUM, TMPD + | ins_callt + |.endif + | +@@ -2836,7 +3248,7 @@ static void build_subroutines(BuildCtx * + | mr CARG2, RA + | bl extern lj_ccallback_leave // (CTState *cts, TValue *o) + | lwz CRET1, CTSTATE->cb.gpr[0] +- | lfd FARG1, CTSTATE->cb.fpr[0] ++ | .FPU lfd FARG1, CTSTATE->cb.fpr[0] + | lwz CRET2, CTSTATE->cb.gpr[1] + | b ->vm_leave_unw + |.endif +@@ -2870,14 +3282,14 @@ static void build_subroutines(BuildCtx * + | bge <1 + |2: + | bney cr1, >3 +- | lfd f1, CCSTATE->fpr[0] +- | lfd f2, CCSTATE->fpr[1] +- | lfd f3, CCSTATE->fpr[2] +- | lfd f4, CCSTATE->fpr[3] +- | lfd f5, CCSTATE->fpr[4] +- | lfd f6, CCSTATE->fpr[5] +- | lfd f7, CCSTATE->fpr[6] +- | lfd f8, CCSTATE->fpr[7] ++ | .FPU lfd f1, CCSTATE->fpr[0] ++ | .FPU lfd f2, CCSTATE->fpr[1] ++ | .FPU lfd f3, CCSTATE->fpr[2] ++ | .FPU lfd f4, CCSTATE->fpr[3] ++ | .FPU lfd f5, CCSTATE->fpr[4] ++ | .FPU lfd f6, CCSTATE->fpr[5] ++ | .FPU lfd f7, CCSTATE->fpr[6] ++ | .FPU lfd f8, CCSTATE->fpr[7] + |3: + | lp TMP0, CCSTATE->func + | lwz CARG2, CCSTATE->gpr[1] +@@ -2894,7 +3306,7 @@ static void build_subroutines(BuildCtx * + | lwz TMP2, -4(r14) + | lwz TMP0, 4(r14) + | stw CARG1, CCSTATE:TMP1->gpr[0] +- | stfd FARG1, CCSTATE:TMP1->fpr[0] ++ | .FPU stfd FARG1, CCSTATE:TMP1->fpr[0] + | stw CARG2, CCSTATE:TMP1->gpr[1] + | mtlr TMP0 + | stw CARG3, CCSTATE:TMP1->gpr[2] +@@ -2923,19 +3335,19 @@ static void build_ins(BuildCtx *ctx, BCO + case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT: + | // RA = src1*8, RD = src2*8, JMP with RD = target + |.if DUALNUM +- | lwzux TMP0, RA, BASE ++ | lwzux CARG1, RA, BASE + | addi PC, PC, 4 + | lwz CARG2, 4(RA) +- | lwzux TMP1, RD, BASE ++ | lwzux CARG3, RD, BASE + | lwz TMP2, -4(PC) +- | checknum cr0, TMP0 +- | lwz CARG3, 4(RD) ++ | checknum cr0, 
CARG1 ++ | lwz CARG4, 4(RD) + | decode_RD4 TMP2, TMP2 +- | checknum cr1, TMP1 +- | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16) ++ | checknum cr1, CARG3 ++ | addis SAVE0, TMP2, -(BCBIAS_J*4 >> 16) + | bne cr0, >7 + | bne cr1, >8 +- | cmpw CARG2, CARG3 ++ | cmpw CARG2, CARG4 + if (op == BC_ISLT) { + | bge >2 + } else if (op == BC_ISGE) { +@@ -2946,28 +3358,41 @@ static void build_ins(BuildCtx *ctx, BCO + | ble >2 + } + |1: +- | add PC, PC, TMP2 ++ | add PC, PC, SAVE0 + |2: + | ins_next + | + |7: // RA is not an integer. + | bgt cr0, ->vmeta_comp + | // RA is a number. +- | lfd f0, 0(RA) ++ | .FPU lfd f0, 0(RA) + | bgt cr1, ->vmeta_comp + | blt cr1, >4 + | // RA is a number, RD is an integer. +- | tonum_i f1, CARG3 ++ |.if FPU ++ | tonum_i f1, CARG4 ++ |.else ++ | bl ->vm_sfi2d_2 ++ |.endif + | b >5 + | + |8: // RA is an integer, RD is not an integer. + | bgt cr1, ->vmeta_comp + | // RA is an integer, RD is a number. ++ |.if FPU + | tonum_i f0, CARG2 ++ |.else ++ | bl ->vm_sfi2d_1 ++ |.endif + |4: +- | lfd f1, 0(RD) ++ | .FPU lfd f1, 0(RD) + |5: ++ |.if FPU + | fcmpu cr0, f0, f1 ++ |.else ++ | blex __ledf2 ++ | cmpwi CRET1, 0 ++ |.endif + if (op == BC_ISLT) { + | bge <2 + } else if (op == BC_ISGE) { +@@ -3015,42 +3440,42 @@ static void build_ins(BuildCtx *ctx, BCO + vk = op == BC_ISEQV; + | // RA = src1*8, RD = src2*8, JMP with RD = target + |.if DUALNUM +- | lwzux TMP0, RA, BASE ++ | lwzux CARG1, RA, BASE + | addi PC, PC, 4 + | lwz CARG2, 4(RA) +- | lwzux TMP1, RD, BASE +- | checknum cr0, TMP0 +- | lwz TMP2, -4(PC) +- | checknum cr1, TMP1 +- | decode_RD4 TMP2, TMP2 +- | lwz CARG3, 4(RD) ++ | lwzux CARG3, RD, BASE ++ | checknum cr0, CARG1 ++ | lwz SAVE0, -4(PC) ++ | checknum cr1, CARG3 ++ | decode_RD4 SAVE0, SAVE0 ++ | lwz CARG4, 4(RD) + | cror 4*cr7+gt, 4*cr0+gt, 4*cr1+gt +- | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16) ++ | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16) + if (vk) { + | ble cr7, ->BC_ISEQN_Z + } else { + | ble cr7, ->BC_ISNEN_Z + } + |.else +- | lwzux TMP0, RA, BASE +- 
| lwz TMP2, 0(PC) ++ | lwzux CARG1, RA, BASE ++ | lwz SAVE0, 0(PC) + | lfd f0, 0(RA) + | addi PC, PC, 4 +- | lwzux TMP1, RD, BASE +- | checknum cr0, TMP0 +- | decode_RD4 TMP2, TMP2 ++ | lwzux CARG3, RD, BASE ++ | checknum cr0, CARG1 ++ | decode_RD4 SAVE0, SAVE0 + | lfd f1, 0(RD) +- | checknum cr1, TMP1 +- | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16) ++ | checknum cr1, CARG3 ++ | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16) + | bge cr0, >5 + | bge cr1, >5 + | fcmpu cr0, f0, f1 + if (vk) { + | bne >1 +- | add PC, PC, TMP2 ++ | add PC, PC, SAVE0 + } else { + | beq >1 +- | add PC, PC, TMP2 ++ | add PC, PC, SAVE0 + } + |1: + | ins_next +@@ -3058,36 +3483,36 @@ static void build_ins(BuildCtx *ctx, BCO + |5: // Either or both types are not numbers. + |.if not DUALNUM + | lwz CARG2, 4(RA) +- | lwz CARG3, 4(RD) ++ | lwz CARG4, 4(RD) + |.endif + |.if FFI +- | cmpwi cr7, TMP0, LJ_TCDATA +- | cmpwi cr5, TMP1, LJ_TCDATA ++ | cmpwi cr7, CARG1, LJ_TCDATA ++ | cmpwi cr5, CARG3, LJ_TCDATA + |.endif +- | not TMP3, TMP0 +- | cmplw TMP0, TMP1 +- | cmplwi cr1, TMP3, ~LJ_TISPRI // Primitive? ++ | not TMP2, CARG1 ++ | cmplw CARG1, CARG3 ++ | cmplwi cr1, TMP2, ~LJ_TISPRI // Primitive? + |.if FFI + | cror 4*cr7+eq, 4*cr7+eq, 4*cr5+eq + |.endif +- | cmplwi cr6, TMP3, ~LJ_TISTABUD // Table or userdata? ++ | cmplwi cr6, TMP2, ~LJ_TISTABUD // Table or userdata? + |.if FFI + | beq cr7, ->vmeta_equal_cd + |.endif +- | cmplw cr5, CARG2, CARG3 ++ | cmplw cr5, CARG2, CARG4 + | crandc 4*cr0+gt, 4*cr0+eq, 4*cr1+gt // 2: Same type and primitive. + | crorc 4*cr0+lt, 4*cr5+eq, 4*cr0+eq // 1: Same tv or different type. + | crand 4*cr0+eq, 4*cr0+eq, 4*cr5+eq // 0: Same type and same tv. +- | mr SAVE0, PC ++ | mr SAVE1, PC + | cror 4*cr0+eq, 4*cr0+eq, 4*cr0+gt // 0 or 2. + | cror 4*cr0+lt, 4*cr0+lt, 4*cr0+gt // 1 or 2. 
+ if (vk) { + | bne cr0, >6 +- | add PC, PC, TMP2 ++ | add PC, PC, SAVE0 + |6: + } else { + | beq cr0, >6 +- | add PC, PC, TMP2 ++ | add PC, PC, SAVE0 + |6: + } + |.if DUALNUM +@@ -3102,6 +3527,7 @@ static void build_ins(BuildCtx *ctx, BCO + | + | // Different tables or userdatas. Need to check __eq metamethod. + | // Field metatable must be at same offset for GCtab and GCudata! ++ | mr CARG3, CARG4 + | lwz TAB:TMP2, TAB:CARG2->metatable + | li CARG4, 1-vk // ne = 0 or 1. + | cmplwi TAB:TMP2, 0 +@@ -3109,7 +3535,7 @@ static void build_ins(BuildCtx *ctx, BCO + | lbz TMP2, TAB:TMP2->nomm + | andix. TMP2, TMP2, 1<<MM_eq + | bne <1 // Or 'no __eq' flag set? +- | mr PC, SAVE0 // Restore old PC. ++ | mr PC, SAVE1 // Restore old PC. + | b ->vmeta_equal // Handle __eq metamethod. + break; + +@@ -3150,16 +3576,16 @@ static void build_ins(BuildCtx *ctx, BCO + vk = op == BC_ISEQN; + | // RA = src*8, RD = num_const*8, JMP with RD = target + |.if DUALNUM +- | lwzux TMP0, RA, BASE ++ | lwzux CARG1, RA, BASE + | addi PC, PC, 4 + | lwz CARG2, 4(RA) +- | lwzux TMP1, RD, KBASE +- | checknum cr0, TMP0 +- | lwz TMP2, -4(PC) +- | checknum cr1, TMP1 +- | decode_RD4 TMP2, TMP2 +- | lwz CARG3, 4(RD) +- | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16) ++ | lwzux CARG3, RD, KBASE ++ | checknum cr0, CARG1 ++ | lwz SAVE0, -4(PC) ++ | checknum cr1, CARG3 ++ | decode_RD4 SAVE0, SAVE0 ++ | lwz CARG4, 4(RD) ++ | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16) + if (vk) { + |->BC_ISEQN_Z: + } else { +@@ -3167,7 +3593,7 @@ static void build_ins(BuildCtx *ctx, BCO + } + | bne cr0, >7 + | bne cr1, >8 +- | cmpw CARG2, CARG3 ++ | cmpw CARG2, CARG4 + |4: + |.else + if (vk) { +@@ -3175,20 +3601,20 @@ static void build_ins(BuildCtx *ctx, BCO + } else { + |->BC_ISNEN_Z: // Dummy label. 
+ } +- | lwzx TMP0, BASE, RA ++ | lwzx CARG1, BASE, RA + | addi PC, PC, 4 + | lfdx f0, BASE, RA +- | lwz TMP2, -4(PC) ++ | lwz SAVE0, -4(PC) + | lfdx f1, KBASE, RD +- | decode_RD4 TMP2, TMP2 +- | checknum TMP0 +- | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16) ++ | decode_RD4 SAVE0, SAVE0 ++ | checknum CARG1 ++ | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16) + | bge >3 + | fcmpu cr0, f0, f1 + |.endif + if (vk) { + | bne >1 +- | add PC, PC, TMP2 ++ | add PC, PC, SAVE0 + |1: + |.if not FFI + |3: +@@ -3199,13 +3625,13 @@ static void build_ins(BuildCtx *ctx, BCO + |.if not FFI + |3: + |.endif +- | add PC, PC, TMP2 ++ | add PC, PC, SAVE0 + |2: + } + | ins_next + |.if FFI + |3: +- | cmpwi TMP0, LJ_TCDATA ++ | cmpwi CARG1, LJ_TCDATA + | beq ->vmeta_equal_cd + | b <1 + |.endif +@@ -3213,18 +3639,31 @@ static void build_ins(BuildCtx *ctx, BCO + |7: // RA is not an integer. + | bge cr0, <3 + | // RA is a number. +- | lfd f0, 0(RA) ++ | .FPU lfd f0, 0(RA) + | blt cr1, >1 + | // RA is a number, RD is an integer. +- | tonum_i f1, CARG3 ++ |.if FPU ++ | tonum_i f1, CARG4 ++ |.else ++ | bl ->vm_sfi2d_2 ++ |.endif + | b >2 + | + |8: // RA is an integer, RD is a number. 
++ |.if FPU + | tonum_i f0, CARG2 ++ |.else ++ | bl ->vm_sfi2d_1 ++ |.endif + |1: +- | lfd f1, 0(RD) ++ | .FPU lfd f1, 0(RD) + |2: ++ |.if FPU + | fcmpu cr0, f0, f1 ++ |.else ++ | blex __ledf2 ++ | cmpwi CRET1, 0 ++ |.endif + | b <4 + |.endif + break; +@@ -3279,7 +3718,12 @@ static void build_ins(BuildCtx *ctx, BCO + | add PC, PC, TMP2 + } else { + | li TMP1, LJ_TFALSE ++ |.if FPU + | lfdx f0, BASE, RD ++ |.else ++ | lwzux CARG1, RD, BASE ++ | lwz CARG2, 4(RD) ++ |.endif + | cmplw TMP0, TMP1 + if (op == BC_ISTC) { + | bge >1 +@@ -3288,7 +3732,12 @@ static void build_ins(BuildCtx *ctx, BCO + } + | addis PC, PC, -(BCBIAS_J*4 >> 16) + | decode_RD4 TMP2, INS ++ |.if FPU + | stfdx f0, BASE, RA ++ |.else ++ | stwux CARG1, RA, BASE ++ | stw CARG2, 4(RA) ++ |.endif + | add PC, PC, TMP2 + |1: + } +@@ -3323,8 +3772,15 @@ static void build_ins(BuildCtx *ctx, BCO + case BC_MOV: + | // RA = dst*8, RD = src*8 + | ins_next1 ++ |.if FPU + | lfdx f0, BASE, RD + | stfdx f0, BASE, RA ++ |.else ++ | lwzux TMP0, RD, BASE ++ | lwz TMP1, 4(RD) ++ | stwux TMP0, RA, BASE ++ | stw TMP1, 4(RA) ++ |.endif + | ins_next2 + break; + case BC_NOT: +@@ -3426,44 +3882,65 @@ static void build_ins(BuildCtx *ctx, BCO + ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN); + ||switch (vk) { + ||case 0: +- | lwzx TMP1, BASE, RB ++ | lwzx CARG1, BASE, RB + | .if DUALNUM +- | lwzx TMP2, KBASE, RC ++ | lwzx CARG3, KBASE, RC + | .endif ++ | .if FPU + | lfdx f14, BASE, RB + | lfdx f15, KBASE, RC ++ | .else ++ | add TMP1, BASE, RB ++ | add TMP2, KBASE, RC ++ | lwz CARG2, 4(TMP1) ++ | lwz CARG4, 4(TMP2) ++ | .endif + | .if DUALNUM +- | checknum cr0, TMP1 +- | checknum cr1, TMP2 ++ | checknum cr0, CARG1 ++ | checknum cr1, CARG3 + | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt + | bge ->vmeta_arith_vn + | .else +- | checknum TMP1; bge ->vmeta_arith_vn ++ | checknum CARG1; bge ->vmeta_arith_vn + | .endif + || break; + ||case 1: +- | lwzx TMP1, BASE, RB ++ | lwzx CARG1, BASE, RB + | .if DUALNUM +- | lwzx TMP2, KBASE, RC ++ | 
lwzx CARG3, KBASE, RC + | .endif ++ | .if FPU + | lfdx f15, BASE, RB + | lfdx f14, KBASE, RC ++ | .else ++ | add TMP1, BASE, RB ++ | add TMP2, KBASE, RC ++ | lwz CARG2, 4(TMP1) ++ | lwz CARG4, 4(TMP2) ++ | .endif + | .if DUALNUM +- | checknum cr0, TMP1 +- | checknum cr1, TMP2 ++ | checknum cr0, CARG1 ++ | checknum cr1, CARG3 + | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt + | bge ->vmeta_arith_nv + | .else +- | checknum TMP1; bge ->vmeta_arith_nv ++ | checknum CARG1; bge ->vmeta_arith_nv + | .endif + || break; + ||default: +- | lwzx TMP1, BASE, RB +- | lwzx TMP2, BASE, RC ++ | lwzx CARG1, BASE, RB ++ | lwzx CARG3, BASE, RC ++ | .if FPU + | lfdx f14, BASE, RB + | lfdx f15, BASE, RC +- | checknum cr0, TMP1 +- | checknum cr1, TMP2 ++ | .else ++ | add TMP1, BASE, RB ++ | add TMP2, BASE, RC ++ | lwz CARG2, 4(TMP1) ++ | lwz CARG4, 4(TMP2) ++ | .endif ++ | checknum cr0, CARG1 ++ | checknum cr1, CARG3 + | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt + | bge ->vmeta_arith_vv + || break; +@@ -3497,48 +3974,78 @@ static void build_ins(BuildCtx *ctx, BCO + | fsub a, b, a // b - floor(b/c)*c + |.endmacro + | ++ |.macro sfpmod ++ |->BC_MODVN_Z: ++ | stw CARG1, SFSAVE_1 ++ | stw CARG2, SFSAVE_2 ++ | mr SAVE0, CARG3 ++ | mr SAVE1, CARG4 ++ | blex __divdf3 ++ | blex floor ++ | mr CARG3, SAVE0 ++ | mr CARG4, SAVE1 ++ | blex __muldf3 ++ | mr CARG3, CRET1 ++ | mr CARG4, CRET2 ++ | lwz CARG1, SFSAVE_1 ++ | lwz CARG2, SFSAVE_2 ++ | blex __subdf3 ++ |.endmacro ++ | + |.macro ins_arithfp, fpins + | ins_arithpre + |.if "fpins" == "fpmod_" + | b ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway. +- |.else ++ |.elif FPU + | fpins f0, f14, f15 + | ins_next1 + | stfdx f0, BASE, RA + | ins_next2 ++ |.else ++ | blex __divdf3 // Only soft-float div uses this macro. 
++ | ins_next1 ++ | stwux CRET1, RA, BASE ++ | stw CRET2, 4(RA) ++ | ins_next2 + |.endif + |.endmacro + | +- |.macro ins_arithdn, intins, fpins ++ |.macro ins_arithdn, intins, fpins, fpcall + | // RA = dst*8, RB = src1*8, RC = src2*8 | num_const*8 + ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN); + ||switch (vk) { + ||case 0: +- | lwzux TMP1, RB, BASE +- | lwzux TMP2, RC, KBASE +- | lwz CARG1, 4(RB) +- | checknum cr0, TMP1 +- | lwz CARG2, 4(RC) ++ | lwzux CARG1, RB, BASE ++ | lwzux CARG3, RC, KBASE ++ | lwz CARG2, 4(RB) ++ | checknum cr0, CARG1 ++ | lwz CARG4, 4(RC) ++ | checknum cr1, CARG3 + || break; + ||case 1: +- | lwzux TMP1, RB, BASE +- | lwzux TMP2, RC, KBASE +- | lwz CARG2, 4(RB) +- | checknum cr0, TMP1 +- | lwz CARG1, 4(RC) ++ | lwzux CARG3, RB, BASE ++ | lwzux CARG1, RC, KBASE ++ | lwz CARG4, 4(RB) ++ | checknum cr0, CARG3 ++ | lwz CARG2, 4(RC) ++ | checknum cr1, CARG1 + || break; + ||default: +- | lwzux TMP1, RB, BASE +- | lwzux TMP2, RC, BASE +- | lwz CARG1, 4(RB) +- | checknum cr0, TMP1 +- | lwz CARG2, 4(RC) ++ | lwzux CARG1, RB, BASE ++ | lwzux CARG3, RC, BASE ++ | lwz CARG2, 4(RB) ++ | checknum cr0, CARG1 ++ | lwz CARG4, 4(RC) ++ | checknum cr1, CARG3 + || break; + ||} +- | checknum cr1, TMP2 + | bne >5 + | bne cr1, >5 +- | intins CARG1, CARG1, CARG2 ++ |.if "intins" == "intmod" ++ | mr CARG1, CARG2 ++ | mr CARG2, CARG4 ++ |.endif ++ | intins CARG1, CARG2, CARG4 + | bso >4 + |1: + | ins_next1 +@@ -3550,29 +4057,40 @@ static void build_ins(BuildCtx *ctx, BCO + | checkov TMP0, <1 // Ignore unrelated overflow. + | ins_arithfallback b + |5: // FP variant. ++ |.if FPU + ||if (vk == 1) { + | lfd f15, 0(RB) +- | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt + | lfd f14, 0(RC) + ||} else { + | lfd f14, 0(RB) +- | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt + | lfd f15, 0(RC) + ||} ++ |.endif ++ | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt + | ins_arithfallback bge + |.if "fpins" == "fpmod_" + | b ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway. 
+ |.else ++ |.if FPU + | fpins f0, f14, f15 +- | ins_next1 + | stfdx f0, BASE, RA ++ |.else ++ |.if "fpcall" == "sfpmod" ++ | sfpmod ++ |.else ++ | blex fpcall ++ |.endif ++ | stwux CRET1, RA, BASE ++ | stw CRET2, 4(RA) ++ |.endif ++ | ins_next1 + | b <2 + |.endif + |.endmacro + | +- |.macro ins_arith, intins, fpins ++ |.macro ins_arith, intins, fpins, fpcall + |.if DUALNUM +- | ins_arithdn intins, fpins ++ | ins_arithdn intins, fpins, fpcall + |.else + | ins_arithfp fpins + |.endif +@@ -3587,9 +4105,9 @@ static void build_ins(BuildCtx *ctx, BCO + | addo. TMP0, TMP0, TMP3 + | add y, a, b + |.endmacro +- | ins_arith addo32., fadd ++ | ins_arith addo32., fadd, __adddf3 + |.else +- | ins_arith addo., fadd ++ | ins_arith addo., fadd, __adddf3 + |.endif + break; + case BC_SUBVN: case BC_SUBNV: case BC_SUBVV: +@@ -3601,36 +4119,48 @@ static void build_ins(BuildCtx *ctx, BCO + | subo. TMP0, TMP0, TMP3 + | sub y, a, b + |.endmacro +- | ins_arith subo32., fsub ++ | ins_arith subo32., fsub, __subdf3 + |.else +- | ins_arith subo., fsub ++ | ins_arith subo., fsub, __subdf3 + |.endif + break; + case BC_MULVN: case BC_MULNV: case BC_MULVV: +- | ins_arith mullwo., fmul ++ | ins_arith mullwo., fmul, __muldf3 + break; + case BC_DIVVN: case BC_DIVNV: case BC_DIVVV: + | ins_arithfp fdiv + break; + case BC_MODVN: +- | ins_arith intmod, fpmod ++ | ins_arith intmod, fpmod, sfpmod + break; + case BC_MODNV: case BC_MODVV: +- | ins_arith intmod, fpmod_ ++ | ins_arith intmod, fpmod_, sfpmod + break; + case BC_POW: + | // NYI: (partial) integer arithmetic. 
+- | lwzx TMP1, BASE, RB ++ | lwzx CARG1, BASE, RB ++ | lwzx CARG3, BASE, RC ++ |.if FPU + | lfdx FARG1, BASE, RB +- | lwzx TMP2, BASE, RC + | lfdx FARG2, BASE, RC +- | checknum cr0, TMP1 +- | checknum cr1, TMP2 ++ |.else ++ | add TMP1, BASE, RB ++ | add TMP2, BASE, RC ++ | lwz CARG2, 4(TMP1) ++ | lwz CARG4, 4(TMP2) ++ |.endif ++ | checknum cr0, CARG1 ++ | checknum cr1, CARG3 + | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt + | bge ->vmeta_arith_vv + | blex pow + | ins_next1 ++ |.if FPU + | stfdx FARG1, BASE, RA ++ |.else ++ | stwux CARG1, RA, BASE ++ | stw CARG2, 4(RA) ++ |.endif + | ins_next2 + break; + +@@ -3650,8 +4180,15 @@ static void build_ins(BuildCtx *ctx, BCO + | lp BASE, L->base + | bne ->vmeta_binop + | ins_next1 ++ |.if FPU + | lfdx f0, BASE, SAVE0 // Copy result from RB to RA. + | stfdx f0, BASE, RA ++ |.else ++ | lwzux TMP0, SAVE0, BASE ++ | lwz TMP1, 4(SAVE0) ++ | stwux TMP0, RA, BASE ++ | stw TMP1, 4(RA) ++ |.endif + | ins_next2 + break; + +@@ -3714,8 +4251,15 @@ static void build_ins(BuildCtx *ctx, BCO + case BC_KNUM: + | // RA = dst*8, RD = num_const*8 + | ins_next1 ++ |.if FPU + | lfdx f0, KBASE, RD + | stfdx f0, BASE, RA ++ |.else ++ | lwzux TMP0, RD, KBASE ++ | lwz TMP1, 4(RD) ++ | stwux TMP0, RA, BASE ++ | stw TMP1, 4(RA) ++ |.endif + | ins_next2 + break; + case BC_KPRI: +@@ -3748,8 +4292,15 @@ static void build_ins(BuildCtx *ctx, BCO + | lwzx UPVAL:RB, LFUNC:RB, RD + | ins_next1 + | lwz TMP1, UPVAL:RB->v ++ |.if FPU + | lfd f0, 0(TMP1) + | stfdx f0, BASE, RA ++ |.else ++ | lwz TMP2, 0(TMP1) ++ | lwz TMP3, 4(TMP1) ++ | stwux TMP2, RA, BASE ++ | stw TMP3, 4(RA) ++ |.endif + | ins_next2 + break; + case BC_USETV: +@@ -3757,14 +4308,24 @@ static void build_ins(BuildCtx *ctx, BCO + | lwz LFUNC:RB, FRAME_FUNC(BASE) + | srwi RA, RA, 1 + | addi RA, RA, offsetof(GCfuncL, uvptr) ++ |.if FPU + | lfdux f0, RD, BASE ++ |.else ++ | lwzux CARG1, RD, BASE ++ | lwz CARG3, 4(RD) ++ |.endif + | lwzx UPVAL:RB, LFUNC:RB, RA + | lbz TMP3, UPVAL:RB->marked + | lwz CARG2, 
UPVAL:RB->v + | andix. TMP3, TMP3, LJ_GC_BLACK // isblack(uv) + | lbz TMP0, UPVAL:RB->closed + | lwz TMP2, 0(RD) ++ |.if FPU + | stfd f0, 0(CARG2) ++ |.else ++ | stw CARG1, 0(CARG2) ++ | stw CARG3, 4(CARG2) ++ |.endif + | cmplwi cr1, TMP0, 0 + | lwz TMP1, 4(RD) + | cror 4*cr0+eq, 4*cr0+eq, 4*cr1+eq +@@ -3820,11 +4381,21 @@ static void build_ins(BuildCtx *ctx, BCO + | lwz LFUNC:RB, FRAME_FUNC(BASE) + | srwi RA, RA, 1 + | addi RA, RA, offsetof(GCfuncL, uvptr) ++ |.if FPU + | lfdx f0, KBASE, RD ++ |.else ++ | lwzux TMP2, RD, KBASE ++ | lwz TMP3, 4(RD) ++ |.endif + | lwzx UPVAL:RB, LFUNC:RB, RA + | ins_next1 + | lwz TMP1, UPVAL:RB->v ++ |.if FPU + | stfd f0, 0(TMP1) ++ |.else ++ | stw TMP2, 0(TMP1) ++ | stw TMP3, 4(TMP1) ++ |.endif + | ins_next2 + break; + case BC_USETP: +@@ -3972,11 +4543,21 @@ static void build_ins(BuildCtx *ctx, BCO + |.endif + | ble ->vmeta_tgetv // Integer key and in array part? + | lwzx TMP0, TMP1, TMP2 ++ |.if FPU + | lfdx f14, TMP1, TMP2 ++ |.else ++ | lwzux SAVE0, TMP1, TMP2 ++ | lwz SAVE1, 4(TMP1) ++ |.endif + | checknil TMP0; beq >2 + |1: + | ins_next1 ++ |.if FPU + | stfdx f14, BASE, RA ++ |.else ++ | stwux SAVE0, RA, BASE ++ | stw SAVE1, 4(RA) ++ |.endif + | ins_next2 + | + |2: // Check for __index if table value is nil. +@@ -4052,12 +4633,22 @@ static void build_ins(BuildCtx *ctx, BCO + | lwz TMP1, TAB:RB->asize + | lwz TMP2, TAB:RB->array + | cmplw TMP0, TMP1; bge ->vmeta_tgetb ++ |.if FPU + | lwzx TMP1, TMP2, RC + | lfdx f0, TMP2, RC ++ |.else ++ | lwzux TMP1, TMP2, RC ++ | lwz TMP3, 4(TMP2) ++ |.endif + | checknil TMP1; beq >5 + |1: + | ins_next1 ++ |.if FPU + | stfdx f0, BASE, RA ++ |.else ++ | stwux TMP1, RA, BASE ++ | stw TMP3, 4(RA) ++ |.endif + | ins_next2 + | + |5: // Check for __index if table value is nil. +@@ -4087,10 +4678,20 @@ static void build_ins(BuildCtx *ctx, BCO + | cmplw TMP0, CARG2 + | slwi TMP2, CARG2, 3 + | ble ->vmeta_tgetr // In array part? 
++ |.if FPU + | lfdx f14, TMP1, TMP2 ++ |.else ++ | lwzux SAVE0, TMP2, TMP1 ++ | lwz SAVE1, 4(TMP2) ++ |.endif + |->BC_TGETR_Z: + | ins_next1 ++ |.if FPU + | stfdx f14, BASE, RA ++ |.else ++ | stwux SAVE0, RA, BASE ++ | stw SAVE1, 4(RA) ++ |.endif + | ins_next2 + break; + +@@ -4131,11 +4732,22 @@ static void build_ins(BuildCtx *ctx, BCO + | ble ->vmeta_tsetv // Integer key and in array part? + | lwzx TMP2, TMP1, TMP0 + | lbz TMP3, TAB:RB->marked ++ |.if FPU + | lfdx f14, BASE, RA ++ |.else ++ | add SAVE1, BASE, RA ++ | lwz SAVE0, 0(SAVE1) ++ | lwz SAVE1, 4(SAVE1) ++ |.endif + | checknil TMP2; beq >3 + |1: + | andix. TMP2, TMP3, LJ_GC_BLACK // isblack(table) ++ |.if FPU + | stfdx f14, TMP1, TMP0 ++ |.else ++ | stwux SAVE0, TMP1, TMP0 ++ | stw SAVE1, 4(TMP1) ++ |.endif + | bne >7 + |2: + | ins_next +@@ -4176,7 +4788,13 @@ static void build_ins(BuildCtx *ctx, BCO + | lwz NODE:TMP2, TAB:RB->node + | stb ZERO, TAB:RB->nomm // Clear metamethod cache. + | and TMP1, TMP1, TMP0 // idx = str->hash & tab->hmask ++ |.if FPU + | lfdx f14, BASE, RA ++ |.else ++ | add CARG2, BASE, RA ++ | lwz SAVE0, 0(CARG2) ++ | lwz SAVE1, 4(CARG2) ++ |.endif + | slwi TMP0, TMP1, 5 + | slwi TMP1, TMP1, 3 + | sub TMP1, TMP0, TMP1 +@@ -4192,7 +4810,12 @@ static void build_ins(BuildCtx *ctx, BCO + | checknil CARG2; beq >4 // Key found, but nil value? + |2: + | andix. TMP0, TMP3, LJ_GC_BLACK // isblack(table) ++ |.if FPU + | stfd f14, NODE:TMP2->val ++ |.else ++ | stw SAVE0, NODE:TMP2->val.u32.hi ++ | stw SAVE1, NODE:TMP2->val.u32.lo ++ |.endif + | bne >7 + |3: + | ins_next +@@ -4231,7 +4854,12 @@ static void build_ins(BuildCtx *ctx, BCO + | bl extern lj_tab_newkey // (lua_State *L, GCtab *t, TValue *k) + | // Returns TValue *. + | lp BASE, L->base ++ |.if FPU + | stfd f14, 0(CRET1) ++ |.else ++ | stw SAVE0, 0(CRET1) ++ | stw SAVE1, 4(CRET1) ++ |.endif + | b <3 // No 2nd write barrier needed. + | + |7: // Possible table write barrier for the value. Skip valiswhite check. 
+@@ -4248,13 +4876,24 @@ static void build_ins(BuildCtx *ctx, BCO + | lwz TMP2, TAB:RB->array + | lbz TMP3, TAB:RB->marked + | cmplw TMP0, TMP1 ++ |.if FPU + | lfdx f14, BASE, RA ++ |.else ++ | add CARG2, BASE, RA ++ | lwz SAVE0, 0(CARG2) ++ | lwz SAVE1, 4(CARG2) ++ |.endif + | bge ->vmeta_tsetb + | lwzx TMP1, TMP2, RC + | checknil TMP1; beq >5 + |1: + | andix. TMP0, TMP3, LJ_GC_BLACK // isblack(table) ++ |.if FPU + | stfdx f14, TMP2, RC ++ |.else ++ | stwux SAVE0, RC, TMP2 ++ | stw SAVE1, 4(RC) ++ |.endif + | bne >7 + |2: + | ins_next +@@ -4294,10 +4933,20 @@ static void build_ins(BuildCtx *ctx, BCO + |2: + | cmplw TMP0, CARG3 + | slwi TMP2, CARG3, 3 ++ |.if FPU + | lfdx f14, BASE, RA ++ |.else ++ | lwzux SAVE0, RA, BASE ++ | lwz SAVE1, 4(RA) ++ |.endif + | ble ->vmeta_tsetr // In array part? + | ins_next1 ++ |.if FPU + | stfdx f14, TMP1, TMP2 ++ |.else ++ | stwux SAVE0, TMP1, TMP2 ++ | stw SAVE1, 4(TMP1) ++ |.endif + | ins_next2 + | + |7: // Possible table write barrier for the value. Skip valiswhite check. +@@ -4327,10 +4976,20 @@ static void build_ins(BuildCtx *ctx, BCO + | add TMP1, TMP1, TMP0 + | andix. TMP0, TMP3, LJ_GC_BLACK // isblack(table) + |3: // Copy result slots to table. 
++ |.if FPU + | lfd f0, 0(RA) ++ |.else ++ | lwz SAVE0, 0(RA) ++ | lwz SAVE1, 4(RA) ++ |.endif + | addi RA, RA, 8 + | cmpw cr1, RA, TMP2 ++ |.if FPU + | stfd f0, 0(TMP1) ++ |.else ++ | stw SAVE0, 0(TMP1) ++ | stw SAVE1, 4(TMP1) ++ |.endif + | addi TMP1, TMP1, 8 + | blt cr1, <3 + | bne >7 +@@ -4397,9 +5056,20 @@ static void build_ins(BuildCtx *ctx, BCO + | beq cr1, >3 + |2: + | addi TMP3, TMP2, 8 ++ |.if FPU + | lfdx f0, RA, TMP2 ++ |.else ++ | add CARG3, RA, TMP2 ++ | lwz CARG1, 0(CARG3) ++ | lwz CARG2, 4(CARG3) ++ |.endif + | cmplw cr1, TMP3, NARGS8:RC ++ |.if FPU + | stfdx f0, BASE, TMP2 ++ |.else ++ | stwux CARG1, TMP2, BASE ++ | stw CARG2, 4(TMP2) ++ |.endif + | mr TMP2, TMP3 + | bne cr1, <2 + |3: +@@ -4432,14 +5102,28 @@ static void build_ins(BuildCtx *ctx, BCO + | add BASE, BASE, RA + | lwz TMP1, -24(BASE) + | lwz LFUNC:RB, -20(BASE) ++ |.if FPU + | lfd f1, -8(BASE) + | lfd f0, -16(BASE) ++ |.else ++ | lwz CARG1, -8(BASE) ++ | lwz CARG2, -4(BASE) ++ | lwz CARG3, -16(BASE) ++ | lwz CARG4, -12(BASE) ++ |.endif + | stw TMP1, 0(BASE) // Copy callable. + | stw LFUNC:RB, 4(BASE) + | checkfunc TMP1 +- | stfd f1, 16(BASE) // Copy control var. + | li NARGS8:RC, 16 // Iterators get 2 arguments. ++ |.if FPU ++ | stfd f1, 16(BASE) // Copy control var. + | stfdu f0, 8(BASE) // Copy state. ++ |.else ++ | stw CARG1, 16(BASE) // Copy control var. ++ | stw CARG2, 20(BASE) ++ | stwu CARG3, 8(BASE) // Copy state. ++ | stw CARG4, 4(BASE) ++ |.endif + | bne ->vmeta_call + | ins_call + break; +@@ -4460,7 +5144,12 @@ static void build_ins(BuildCtx *ctx, BCO + | slwi TMP3, RC, 3 + | bge >5 // Index points after array part? 
+ | lwzx TMP2, TMP1, TMP3 ++ |.if FPU + | lfdx f0, TMP1, TMP3 ++ |.else ++ | lwzux CARG1, TMP3, TMP1 ++ | lwz CARG2, 4(TMP3) ++ |.endif + | checknil TMP2 + | lwz INS, -4(PC) + | beq >4 +@@ -4472,7 +5161,12 @@ static void build_ins(BuildCtx *ctx, BCO + |.endif + | addi RC, RC, 1 + | addis TMP3, PC, -(BCBIAS_J*4 >> 16) ++ |.if FPU + | stfd f0, 8(RA) ++ |.else ++ | stw CARG1, 8(RA) ++ | stw CARG2, 12(RA) ++ |.endif + | decode_RD4 TMP1, INS + | stw RC, -4(RA) // Update control var. + | add PC, TMP1, TMP3 +@@ -4497,17 +5191,38 @@ static void build_ins(BuildCtx *ctx, BCO + | slwi RB, RC, 3 + | sub TMP3, TMP3, RB + | lwzx RB, TMP2, TMP3 ++ |.if FPU + | lfdx f0, TMP2, TMP3 ++ |.else ++ | add CARG3, TMP2, TMP3 ++ | lwz CARG1, 0(CARG3) ++ | lwz CARG2, 4(CARG3) ++ |.endif + | add NODE:TMP3, TMP2, TMP3 + | checknil RB + | lwz INS, -4(PC) + | beq >7 ++ |.if FPU + | lfd f1, NODE:TMP3->key ++ |.else ++ | lwz CARG3, NODE:TMP3->key.u32.hi ++ | lwz CARG4, NODE:TMP3->key.u32.lo ++ |.endif + | addis TMP2, PC, -(BCBIAS_J*4 >> 16) ++ |.if FPU + | stfd f0, 8(RA) ++ |.else ++ | stw CARG1, 8(RA) ++ | stw CARG2, 12(RA) ++ |.endif + | add RC, RC, TMP0 + | decode_RD4 TMP1, INS ++ |.if FPU + | stfd f1, 0(RA) ++ |.else ++ | stw CARG3, 0(RA) ++ | stw CARG4, 4(RA) ++ |.endif + | addi RC, RC, 1 + | add PC, TMP1, TMP2 + | stw RC, -4(RA) // Update control var. +@@ -4573,9 +5288,19 @@ static void build_ins(BuildCtx *ctx, BCO + | subi TMP2, TMP2, 16 + | ble >2 // No vararg slots? + |1: // Copy vararg slots to destination slots. ++ |.if FPU + | lfd f0, 0(RC) ++ |.else ++ | lwz CARG1, 0(RC) ++ | lwz CARG2, 4(RC) ++ |.endif + | addi RC, RC, 8 ++ |.if FPU + | stfd f0, 0(RA) ++ |.else ++ | stw CARG1, 0(RA) ++ | stw CARG2, 4(RA) ++ |.endif + | cmplw RA, TMP2 + | cmplw cr1, RC, TMP3 + | bge >3 // All destination slots filled? 
+@@ -4598,9 +5323,19 @@ static void build_ins(BuildCtx *ctx, BCO + | addi MULTRES, TMP1, 8 + | bgt >7 + |6: ++ |.if FPU + | lfd f0, 0(RC) ++ |.else ++ | lwz CARG1, 0(RC) ++ | lwz CARG2, 4(RC) ++ |.endif + | addi RC, RC, 8 ++ |.if FPU + | stfd f0, 0(RA) ++ |.else ++ | stw CARG1, 0(RA) ++ | stw CARG2, 4(RA) ++ |.endif + | cmplw RC, TMP3 + | addi RA, RA, 8 + | blt <6 // More vararg slots? +@@ -4651,14 +5386,38 @@ static void build_ins(BuildCtx *ctx, BCO + | li TMP1, 0 + |2: + | addi TMP3, TMP1, 8 ++ |.if FPU + | lfdx f0, RA, TMP1 ++ |.else ++ | add CARG3, RA, TMP1 ++ | lwz CARG1, 0(CARG3) ++ | lwz CARG2, 4(CARG3) ++ |.endif + | cmpw TMP3, RC ++ |.if FPU + | stfdx f0, TMP2, TMP1 ++ |.else ++ | add CARG3, TMP2, TMP1 ++ | stw CARG1, 0(CARG3) ++ | stw CARG2, 4(CARG3) ++ |.endif + | beq >3 + | addi TMP1, TMP3, 8 ++ |.if FPU + | lfdx f1, RA, TMP3 ++ |.else ++ | add CARG3, RA, TMP3 ++ | lwz CARG1, 0(CARG3) ++ | lwz CARG2, 4(CARG3) ++ |.endif + | cmpw TMP1, RC ++ |.if FPU + | stfdx f1, TMP2, TMP3 ++ |.else ++ | add CARG3, TMP2, TMP3 ++ | stw CARG1, 0(CARG3) ++ | stw CARG2, 4(CARG3) ++ |.endif + | bne <2 + |3: + |5: +@@ -4700,8 +5459,15 @@ static void build_ins(BuildCtx *ctx, BCO + | subi TMP2, BASE, 8 + | decode_RB8 RB, INS + if (op == BC_RET1) { ++ |.if FPU + | lfd f0, 0(RA) + | stfd f0, 0(TMP2) ++ |.else ++ | lwz CARG1, 0(RA) ++ | lwz CARG2, 4(RA) ++ | stw CARG1, 0(TMP2) ++ | stw CARG2, 4(TMP2) ++ |.endif + } + |5: + | cmplw RB, RD +@@ -4762,11 +5528,11 @@ static void build_ins(BuildCtx *ctx, BCO + |4: + | stw CARG1, FORL_IDX*8+4(RA) + } else { +- | lwz TMP3, FORL_STEP*8(RA) ++ | lwz SAVE0, FORL_STEP*8(RA) + | lwz CARG3, FORL_STEP*8+4(RA) + | lwz TMP2, FORL_STOP*8(RA) + | lwz CARG2, FORL_STOP*8+4(RA) +- | cmplw cr7, TMP3, TISNUM ++ | cmplw cr7, SAVE0, TISNUM + | cmplw cr1, TMP2, TISNUM + | crand 4*cr0+eq, 4*cr0+eq, 4*cr7+eq + | crand 4*cr0+eq, 4*cr0+eq, 4*cr1+eq +@@ -4809,41 +5575,80 @@ static void build_ins(BuildCtx *ctx, BCO + if (vk) { + |.if DUALNUM + |9: // FP loop. 
++ |.if FPU + | lfd f1, FORL_IDX*8(RA) + |.else ++ | lwz CARG1, FORL_IDX*8(RA) ++ | lwz CARG2, FORL_IDX*8+4(RA) ++ |.endif ++ |.else + | lfdux f1, RA, BASE + |.endif ++ |.if FPU + | lfd f3, FORL_STEP*8(RA) + | lfd f2, FORL_STOP*8(RA) +- | lwz TMP3, FORL_STEP*8(RA) + | fadd f1, f1, f3 + | stfd f1, FORL_IDX*8(RA) ++ |.else ++ | lwz CARG3, FORL_STEP*8(RA) ++ | lwz CARG4, FORL_STEP*8+4(RA) ++ | mr SAVE1, RD ++ | blex __adddf3 ++ | mr RD, SAVE1 ++ | stw CRET1, FORL_IDX*8(RA) ++ | stw CRET2, FORL_IDX*8+4(RA) ++ | lwz CARG3, FORL_STOP*8(RA) ++ | lwz CARG4, FORL_STOP*8+4(RA) ++ |.endif ++ | lwz SAVE0, FORL_STEP*8(RA) + } else { + |.if DUALNUM + |9: // FP loop. + |.else + | lwzux TMP1, RA, BASE +- | lwz TMP3, FORL_STEP*8(RA) ++ | lwz SAVE0, FORL_STEP*8(RA) + | lwz TMP2, FORL_STOP*8(RA) + | cmplw cr0, TMP1, TISNUM +- | cmplw cr7, TMP3, TISNUM ++ | cmplw cr7, SAVE0, TISNUM + | cmplw cr1, TMP2, TISNUM + |.endif ++ |.if FPU + | lfd f1, FORL_IDX*8(RA) ++ |.else ++ | lwz CARG1, FORL_IDX*8(RA) ++ | lwz CARG2, FORL_IDX*8+4(RA) ++ |.endif + | crand 4*cr0+lt, 4*cr0+lt, 4*cr7+lt + | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt ++ |.if FPU + | lfd f2, FORL_STOP*8(RA) ++ |.else ++ | lwz CARG3, FORL_STOP*8(RA) ++ | lwz CARG4, FORL_STOP*8+4(RA) ++ |.endif + | bge ->vmeta_for + } +- | cmpwi cr6, TMP3, 0 ++ | cmpwi cr6, SAVE0, 0 + if (op != BC_JFORL) { + | srwi RD, RD, 1 + } ++ |.if FPU + | stfd f1, FORL_EXT*8(RA) ++ |.else ++ | stw CARG1, FORL_EXT*8(RA) ++ | stw CARG2, FORL_EXT*8+4(RA) ++ |.endif + if (op != BC_JFORL) { + | add RD, PC, RD + } ++ |.if FPU + | fcmpu cr0, f1, f2 ++ |.else ++ | mr SAVE1, RD ++ | blex __ledf2 ++ | cmpwi CRET1, 0 ++ | mr RD, SAVE1 ++ |.endif + if (op == BC_JFORI) { + | addis PC, RD, -(BCBIAS_J*4 >> 16) + } diff --git a/lang/luajit/patches/050-ppc-softfloat.patch b/lang/luajit/patches/050-ppc-softfloat.patch new file mode 100644 index 0000000000..68215bbf12 --- /dev/null +++ b/lang/luajit/patches/050-ppc-softfloat.patch @@ -0,0 +1,744 @@ +From 
71b7bc88341945f13f3951e2bb5fd247b639ff7a Mon Sep 17 00:00:00 2001 +From: Mike Pall +Date: Sun, 3 Sep 2017 23:20:53 +0200 +Subject: [PATCH] PPC: Add soft-float support to JIT compiler backend. + +Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. +Sponsored by Cisco Systems, Inc. +--- + src/lj_arch.h | 1 - + src/lj_asm_ppc.h | 321 ++++++++++++++++++++++++++++++++++++++++------- + 2 files changed, 278 insertions(+), 44 deletions(-) + +--- a/src/lj_arch.h ++++ b/src/lj_arch.h +@@ -273,7 +273,6 @@ + #endif + + #if LJ_ABI_SOFTFP +-#define LJ_ARCH_NOJIT 1 /* NYI */ + #define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL + #else + #define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL_SINGLE +--- a/src/lj_asm_ppc.h ++++ b/src/lj_asm_ppc.h +@@ -226,6 +226,7 @@ static void asm_fusexrefx(ASMState *as, + emit_tab(as, pi, rt, left, right); + } + ++#if !LJ_SOFTFP + /* Fuse to multiply-add/sub instruction. */ + static int asm_fusemadd(ASMState *as, IRIns *ir, PPCIns pi, PPCIns pir) + { +@@ -245,6 +246,7 @@ static int asm_fusemadd(ASMState *as, IR + } + return 0; + } ++#endif + + /* -- Calls --------------------------------------------------------------- */ + +@@ -253,13 +255,17 @@ static void asm_gencall(ASMState *as, co + { + uint32_t n, nargs = CCI_XNARGS(ci); + int32_t ofs = 8; +- Reg gpr = REGARG_FIRSTGPR, fpr = REGARG_FIRSTFPR; ++ Reg gpr = REGARG_FIRSTGPR; ++#if !LJ_SOFTFP ++ Reg fpr = REGARG_FIRSTFPR; ++#endif + if ((void *)ci->func) + emit_call(as, (void *)ci->func); + for (n = 0; n < nargs; n++) { /* Setup args. */ + IRRef ref = args[n]; + if (ref) { + IRIns *ir = IR(ref); ++#if !LJ_SOFTFP + if (irt_isfp(ir->t)) { + if (fpr <= REGARG_LASTFPR) { + lua_assert(rset_test(as->freeset, fpr)); /* Already evicted. */ +@@ -271,7 +277,9 @@ static void asm_gencall(ASMState *as, co + emit_spstore(as, ir, r, ofs); + ofs += irt_isnum(ir->t) ? 8 : 4; + } +- } else { ++ } else ++#endif ++ { + if (gpr <= REGARG_LASTGPR) { + lua_assert(rset_test(as->freeset, gpr)); /* Already evicted. 
*/ + ra_leftov(as, gpr, ref); +@@ -290,8 +298,10 @@ static void asm_gencall(ASMState *as, co + } + checkmclim(as); + } ++#if !LJ_SOFTFP + if ((ci->flags & CCI_VARARG)) /* Vararg calls need to know about FPR use. */ + emit_tab(as, fpr == REGARG_FIRSTFPR ? PPCI_CRXOR : PPCI_CREQV, 6, 6, 6); ++#endif + } + + /* Setup result reg/sp for call. Evict scratch regs. */ +@@ -299,8 +309,10 @@ static void asm_setupresult(ASMState *as + { + RegSet drop = RSET_SCRATCH; + int hiop = ((ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t)); ++#if !LJ_SOFTFP + if ((ci->flags & CCI_NOFPRCLOBBER)) + drop &= ~RSET_FPR; ++#endif + if (ra_hasreg(ir->r)) + rset_clear(drop, ir->r); /* Dest reg handled below. */ + if (hiop && ra_hasreg((ir+1)->r)) +@@ -308,7 +320,7 @@ static void asm_setupresult(ASMState *as + ra_evictset(as, drop); /* Evictions must be performed first. */ + if (ra_used(ir)) { + lua_assert(!irt_ispri(ir->t)); +- if (irt_isfp(ir->t)) { ++ if (!LJ_SOFTFP && irt_isfp(ir->t)) { + if ((ci->flags & CCI_CASTU64)) { + /* Use spill slot or temp slots. */ + int32_t ofs = ir->s ? sps_scale(ir->s) : SPOFS_TMP; +@@ -377,6 +389,7 @@ static void asm_retf(ASMState *as, IRIns + + /* -- Type conversions ---------------------------------------------------- */ + ++#if !LJ_SOFTFP + static void asm_tointg(ASMState *as, IRIns *ir, Reg left) + { + RegSet allow = RSET_FPR; +@@ -409,15 +422,23 @@ static void asm_tobit(ASMState *as, IRIn + emit_fai(as, PPCI_STFD, tmp, RID_SP, SPOFS_TMP); + emit_fab(as, PPCI_FADD, tmp, left, right); + } ++#endif + + static void asm_conv(ASMState *as, IRIns *ir) + { + IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK); ++#if !LJ_SOFTFP + int stfp = (st == IRT_NUM || st == IRT_FLOAT); ++#endif + IRRef lref = ir->op1; +- lua_assert(irt_type(ir->t) != st); + lua_assert(!(irt_isint64(ir->t) || + (st == IRT_I64 || st == IRT_U64))); /* Handled by SPLIT. */ ++#if LJ_SOFTFP ++ /* FP conversions are handled by SPLIT. 
*/ ++ lua_assert(!irt_isfp(ir->t) && !(st == IRT_NUM || st == IRT_FLOAT)); ++ /* Can't check for same types: SPLIT uses CONV int.int + BXOR for sfp NEG. */ ++#else ++ lua_assert(irt_type(ir->t) != st); + if (irt_isfp(ir->t)) { + Reg dest = ra_dest(as, ir, RSET_FPR); + if (stfp) { /* FP to FP conversion. */ +@@ -476,7 +497,9 @@ static void asm_conv(ASMState *as, IRIns + emit_fb(as, PPCI_FCTIWZ, tmp, left); + } + } +- } else { ++ } else ++#endif ++ { + Reg dest = ra_dest(as, ir, RSET_GPR); + if (st >= IRT_I8 && st <= IRT_U16) { /* Extend to 32 bit integer. */ + Reg left = ra_alloc1(as, ir->op1, RSET_GPR); +@@ -496,17 +519,41 @@ static void asm_strto(ASMState *as, IRIn + { + const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num]; + IRRef args[2]; +- int32_t ofs; ++ int32_t ofs = SPOFS_TMP; ++#if LJ_SOFTFP ++ ra_evictset(as, RSET_SCRATCH); ++ if (ra_used(ir)) { ++ if (ra_hasspill(ir->s) && ra_hasspill((ir+1)->s) && ++ (ir->s & 1) == LJ_BE && (ir->s ^ 1) == (ir+1)->s) { ++ int i; ++ for (i = 0; i < 2; i++) { ++ Reg r = (ir+i)->r; ++ if (ra_hasreg(r)) { ++ ra_free(as, r); ++ ra_modified(as, r); ++ emit_spload(as, ir+i, r, sps_scale((ir+i)->s)); ++ } ++ } ++ ofs = sps_scale(ir->s & ~1); ++ } else { ++ Reg rhi = ra_dest(as, ir+1, RSET_GPR); ++ Reg rlo = ra_dest(as, ir, rset_exclude(RSET_GPR, rhi)); ++ emit_tai(as, PPCI_LWZ, rhi, RID_SP, ofs); ++ emit_tai(as, PPCI_LWZ, rlo, RID_SP, ofs+4); ++ } ++ } ++#else + RegSet drop = RSET_SCRATCH; + if (ra_hasreg(ir->r)) rset_set(drop, ir->r); /* Spill dest reg (if any). */ + ra_evictset(as, drop); ++ if (ir->s) ofs = sps_scale(ir->s); ++#endif + asm_guardcc(as, CC_EQ); + emit_ai(as, PPCI_CMPWI, RID_RET, 0); /* Test return status. */ + args[0] = ir->op1; /* GCstr *str */ + args[1] = ASMREF_TMP1; /* TValue *n */ + asm_gencall(as, ci, args); + /* Store the result to the spill slot or temp slots. */ +- ofs = ir->s ? 
sps_scale(ir->s) : SPOFS_TMP; + emit_tai(as, PPCI_ADDI, ra_releasetmp(as, ASMREF_TMP1), RID_SP, ofs); + } + +@@ -530,7 +577,10 @@ static void asm_tvptr(ASMState *as, Reg + Reg src = ra_alloc1(as, ref, allow); + emit_setgl(as, src, tmptv.gcr); + } +- type = ra_allock(as, irt_toitype(ir->t), allow); ++ if (LJ_SOFTFP && (ir+1)->o == IR_HIOP) ++ type = ra_alloc1(as, ref+1, allow); ++ else ++ type = ra_allock(as, irt_toitype(ir->t), allow); + emit_setgl(as, type, tmptv.it); + } + } +@@ -574,11 +624,27 @@ static void asm_href(ASMState *as, IRIns + Reg tisnum = RID_NONE, tmpnum = RID_NONE; + IRRef refkey = ir->op2; + IRIns *irkey = IR(refkey); ++ int isk = irref_isk(refkey); + IRType1 kt = irkey->t; + uint32_t khash; + MCLabel l_end, l_loop, l_next; + + rset_clear(allow, tab); ++#if LJ_SOFTFP ++ if (!isk) { ++ key = ra_alloc1(as, refkey, allow); ++ rset_clear(allow, key); ++ if (irkey[1].o == IR_HIOP) { ++ if (ra_hasreg((irkey+1)->r)) { ++ tmpnum = (irkey+1)->r; ++ ra_noweak(as, tmpnum); ++ } else { ++ tmpnum = ra_allocref(as, refkey+1, allow); ++ } ++ rset_clear(allow, tmpnum); ++ } ++ } ++#else + if (irt_isnum(kt)) { + key = ra_alloc1(as, refkey, RSET_FPR); + tmpnum = ra_scratch(as, rset_exclude(RSET_FPR, key)); +@@ -588,6 +654,7 @@ static void asm_href(ASMState *as, IRIns + key = ra_alloc1(as, refkey, allow); + rset_clear(allow, key); + } ++#endif + tmp2 = ra_scratch(as, allow); + rset_clear(allow, tmp2); + +@@ -610,7 +677,7 @@ static void asm_href(ASMState *as, IRIns + asm_guardcc(as, CC_EQ); + else + emit_condbranch(as, PPCI_BC|PPCF_Y, CC_EQ, l_end); +- if (irt_isnum(kt)) { ++ if (!LJ_SOFTFP && irt_isnum(kt)) { + emit_fab(as, PPCI_FCMPU, 0, tmpnum, key); + emit_condbranch(as, PPCI_BC, CC_GE, l_next); + emit_ab(as, PPCI_CMPLW, tmp1, tisnum); +@@ -620,7 +687,10 @@ static void asm_href(ASMState *as, IRIns + emit_ab(as, PPCI_CMPW, tmp2, key); + emit_condbranch(as, PPCI_BC, CC_NE, l_next); + } +- emit_ai(as, PPCI_CMPWI, tmp1, irt_toitype(irkey->t)); ++ if (LJ_SOFTFP && 
ra_hasreg(tmpnum)) ++ emit_ab(as, PPCI_CMPW, tmp1, tmpnum); ++ else ++ emit_ai(as, PPCI_CMPWI, tmp1, irt_toitype(irkey->t)); + if (!irt_ispri(kt)) + emit_tai(as, PPCI_LWZ, tmp2, dest, (int32_t)offsetof(Node, key.gcr)); + } +@@ -629,19 +699,19 @@ static void asm_href(ASMState *as, IRIns + (((char *)as->mcp-(char *)l_loop) & 0xffffu); + + /* Load main position relative to tab->node into dest. */ +- khash = irref_isk(refkey) ? ir_khash(irkey) : 1; ++ khash = isk ? ir_khash(irkey) : 1; + if (khash == 0) { + emit_tai(as, PPCI_LWZ, dest, tab, (int32_t)offsetof(GCtab, node)); + } else { + Reg tmphash = tmp1; +- if (irref_isk(refkey)) ++ if (isk) + tmphash = ra_allock(as, khash, allow); + emit_tab(as, PPCI_ADD, dest, dest, tmp1); + emit_tai(as, PPCI_MULLI, tmp1, tmp1, sizeof(Node)); + emit_asb(as, PPCI_AND, tmp1, tmp2, tmphash); + emit_tai(as, PPCI_LWZ, dest, tab, (int32_t)offsetof(GCtab, node)); + emit_tai(as, PPCI_LWZ, tmp2, tab, (int32_t)offsetof(GCtab, hmask)); +- if (irref_isk(refkey)) { ++ if (isk) { + /* Nothing to do. */ + } else if (irt_isstr(kt)) { + emit_tai(as, PPCI_LWZ, tmp1, key, (int32_t)offsetof(GCstr, hash)); +@@ -651,13 +721,19 @@ static void asm_href(ASMState *as, IRIns + emit_asb(as, PPCI_XOR, tmp1, tmp1, tmp2); + emit_rotlwi(as, tmp1, tmp1, (HASH_ROT2+HASH_ROT1)&31); + emit_tab(as, PPCI_SUBF, tmp2, dest, tmp2); +- if (irt_isnum(kt)) { ++ if (LJ_SOFTFP ? 
(irkey[1].o == IR_HIOP) : irt_isnum(kt)) { ++#if LJ_SOFTFP ++ emit_asb(as, PPCI_XOR, tmp2, key, tmp1); ++ emit_rotlwi(as, dest, tmp1, HASH_ROT1); ++ emit_tab(as, PPCI_ADD, tmp1, tmpnum, tmpnum); ++#else + int32_t ofs = ra_spill(as, irkey); + emit_asb(as, PPCI_XOR, tmp2, tmp2, tmp1); + emit_rotlwi(as, dest, tmp1, HASH_ROT1); + emit_tab(as, PPCI_ADD, tmp1, tmp1, tmp1); + emit_tai(as, PPCI_LWZ, tmp2, RID_SP, ofs+4); + emit_tai(as, PPCI_LWZ, tmp1, RID_SP, ofs); ++#endif + } else { + emit_asb(as, PPCI_XOR, tmp2, key, tmp1); + emit_rotlwi(as, dest, tmp1, HASH_ROT1); +@@ -784,8 +860,8 @@ static PPCIns asm_fxloadins(IRIns *ir) + case IRT_U8: return PPCI_LBZ; + case IRT_I16: return PPCI_LHA; + case IRT_U16: return PPCI_LHZ; +- case IRT_NUM: return PPCI_LFD; +- case IRT_FLOAT: return PPCI_LFS; ++ case IRT_NUM: lua_assert(!LJ_SOFTFP); return PPCI_LFD; ++ case IRT_FLOAT: if (!LJ_SOFTFP) return PPCI_LFS; + default: return PPCI_LWZ; + } + } +@@ -795,8 +871,8 @@ static PPCIns asm_fxstoreins(IRIns *ir) + switch (irt_type(ir->t)) { + case IRT_I8: case IRT_U8: return PPCI_STB; + case IRT_I16: case IRT_U16: return PPCI_STH; +- case IRT_NUM: return PPCI_STFD; +- case IRT_FLOAT: return PPCI_STFS; ++ case IRT_NUM: lua_assert(!LJ_SOFTFP); return PPCI_STFD; ++ case IRT_FLOAT: if (!LJ_SOFTFP) return PPCI_STFS; + default: return PPCI_STW; + } + } +@@ -839,7 +915,8 @@ static void asm_fstore(ASMState *as, IRI + + static void asm_xload(ASMState *as, IRIns *ir) + { +- Reg dest = ra_dest(as, ir, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR); ++ Reg dest = ra_dest(as, ir, ++ (!LJ_SOFTFP && irt_isfp(ir->t)) ? RSET_FPR : RSET_GPR); + lua_assert(!(ir->op2 & IRXLOAD_UNALIGNED)); + if (irt_isi8(ir->t)) + emit_as(as, PPCI_EXTSB, dest, dest); +@@ -857,7 +934,8 @@ static void asm_xstore_(ASMState *as, IR + Reg src = ra_alloc1(as, irb->op1, RSET_GPR); + asm_fusexrefx(as, PPCI_STWBRX, src, ir->op1, rset_exclude(RSET_GPR, src)); + } else { +- Reg src = ra_alloc1(as, ir->op2, irt_isfp(ir->t) ? 
RSET_FPR : RSET_GPR); ++ Reg src = ra_alloc1(as, ir->op2, ++ (!LJ_SOFTFP && irt_isfp(ir->t)) ? RSET_FPR : RSET_GPR); + asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1, + rset_exclude(RSET_GPR, src), ofs); + } +@@ -871,10 +949,19 @@ static void asm_ahuvload(ASMState *as, I + Reg dest = RID_NONE, type = RID_TMP, tmp = RID_TMP, idx; + RegSet allow = RSET_GPR; + int32_t ofs = AHUREF_LSX; ++ if (LJ_SOFTFP && (ir+1)->o == IR_HIOP) { ++ t.irt = IRT_NUM; ++ if (ra_used(ir+1)) { ++ type = ra_dest(as, ir+1, allow); ++ rset_clear(allow, type); ++ } ++ ofs = 0; ++ } + if (ra_used(ir)) { +- lua_assert(irt_isnum(t) || irt_isint(t) || irt_isaddr(t)); +- if (!irt_isnum(t)) ofs = 0; +- dest = ra_dest(as, ir, irt_isnum(t) ? RSET_FPR : RSET_GPR); ++ lua_assert((LJ_SOFTFP ? 0 : irt_isnum(ir->t)) || ++ irt_isint(ir->t) || irt_isaddr(ir->t)); ++ if (LJ_SOFTFP || !irt_isnum(t)) ofs = 0; ++ dest = ra_dest(as, ir, (!LJ_SOFTFP && irt_isnum(t)) ? RSET_FPR : allow); + rset_clear(allow, dest); + } + idx = asm_fuseahuref(as, ir->op1, &ofs, allow); +@@ -883,12 +970,13 @@ static void asm_ahuvload(ASMState *as, I + asm_guardcc(as, CC_GE); + emit_ab(as, PPCI_CMPLW, type, tisnum); + if (ra_hasreg(dest)) { +- if (ofs == AHUREF_LSX) { ++ if (!LJ_SOFTFP && ofs == AHUREF_LSX) { + tmp = ra_scratch(as, rset_exclude(rset_exclude(RSET_GPR, + (idx&255)), (idx>>8))); + emit_fab(as, PPCI_LFDX, dest, (idx&255), tmp); + } else { +- emit_fai(as, PPCI_LFD, dest, idx, ofs); ++ emit_fai(as, LJ_SOFTFP ? 
PPCI_LWZ : PPCI_LFD, dest, idx, ++ ofs+4*LJ_SOFTFP); + } + } + } else { +@@ -911,7 +999,7 @@ static void asm_ahustore(ASMState *as, I + int32_t ofs = AHUREF_LSX; + if (ir->r == RID_SINK) + return; +- if (irt_isnum(ir->t)) { ++ if (!LJ_SOFTFP && irt_isnum(ir->t)) { + src = ra_alloc1(as, ir->op2, RSET_FPR); + } else { + if (!irt_ispri(ir->t)) { +@@ -919,11 +1007,14 @@ static void asm_ahustore(ASMState *as, I + rset_clear(allow, src); + ofs = 0; + } +- type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow); ++ if (LJ_SOFTFP && (ir+1)->o == IR_HIOP) ++ type = ra_alloc1(as, (ir+1)->op2, allow); ++ else ++ type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow); + rset_clear(allow, type); + } + idx = asm_fuseahuref(as, ir->op1, &ofs, allow); +- if (irt_isnum(ir->t)) { ++ if (!LJ_SOFTFP && irt_isnum(ir->t)) { + if (ofs == AHUREF_LSX) { + emit_fab(as, PPCI_STFDX, src, (idx&255), RID_TMP); + emit_slwi(as, RID_TMP, (idx>>8), 3); +@@ -948,21 +1039,33 @@ static void asm_sload(ASMState *as, IRIn + IRType1 t = ir->t; + Reg dest = RID_NONE, type = RID_NONE, base; + RegSet allow = RSET_GPR; ++ int hiop = (LJ_SOFTFP && (ir+1)->o == IR_HIOP); ++ if (hiop) ++ t.irt = IRT_NUM; + lua_assert(!(ir->op2 & IRSLOAD_PARENT)); /* Handled by asm_head_side(). */ +- lua_assert(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK)); ++ lua_assert(irt_isguard(ir->t) || !(ir->op2 & IRSLOAD_TYPECHECK)); + lua_assert(LJ_DUALNUM || + !irt_isint(t) || (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME))); ++#if LJ_SOFTFP ++ lua_assert(!(ir->op2 & IRSLOAD_CONVERT)); /* Handled by LJ_SOFTFP SPLIT. */ ++ if (hiop && ra_used(ir+1)) { ++ type = ra_dest(as, ir+1, allow); ++ rset_clear(allow, type); ++ } ++#else + if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) { + dest = ra_scratch(as, RSET_FPR); + asm_tointg(as, ir, dest); + t.irt = IRT_NUM; /* Continue with a regular number type check. 
*/ +- } else if (ra_used(ir)) { ++ } else ++#endif ++ if (ra_used(ir)) { + lua_assert(irt_isnum(t) || irt_isint(t) || irt_isaddr(t)); +- dest = ra_dest(as, ir, irt_isnum(t) ? RSET_FPR : RSET_GPR); ++ dest = ra_dest(as, ir, (!LJ_SOFTFP && irt_isnum(t)) ? RSET_FPR : allow); + rset_clear(allow, dest); + base = ra_alloc1(as, REF_BASE, allow); + rset_clear(allow, base); +- if ((ir->op2 & IRSLOAD_CONVERT)) { ++ if (!LJ_SOFTFP && (ir->op2 & IRSLOAD_CONVERT)) { + if (irt_isint(t)) { + emit_tai(as, PPCI_LWZ, dest, RID_SP, SPOFS_TMPLO); + dest = ra_scratch(as, RSET_FPR); +@@ -994,10 +1097,13 @@ dotypecheck: + if ((ir->op2 & IRSLOAD_TYPECHECK)) { + Reg tisnum = ra_allock(as, (int32_t)LJ_TISNUM, allow); + asm_guardcc(as, CC_GE); +- emit_ab(as, PPCI_CMPLW, RID_TMP, tisnum); ++#if !LJ_SOFTFP + type = RID_TMP; ++#endif ++ emit_ab(as, PPCI_CMPLW, type, tisnum); + } +- if (ra_hasreg(dest)) emit_fai(as, PPCI_LFD, dest, base, ofs-4); ++ if (ra_hasreg(dest)) emit_fai(as, LJ_SOFTFP ? PPCI_LWZ : PPCI_LFD, dest, ++ base, ofs-(LJ_SOFTFP?0:4)); + } else { + if ((ir->op2 & IRSLOAD_TYPECHECK)) { + asm_guardcc(as, CC_NE); +@@ -1119,6 +1225,7 @@ static void asm_obar(ASMState *as, IRIns + + /* -- Arithmetic and logic operations ------------------------------------- */ + ++#if !LJ_SOFTFP + static void asm_fparith(ASMState *as, IRIns *ir, PPCIns pi) + { + Reg dest = ra_dest(as, ir, RSET_FPR); +@@ -1146,13 +1253,17 @@ static void asm_fpmath(ASMState *as, IRI + else + asm_callid(as, ir, IRCALL_lj_vm_floor + ir->op2); + } ++#endif + + static void asm_add(ASMState *as, IRIns *ir) + { ++#if !LJ_SOFTFP + if (irt_isnum(ir->t)) { + if (!asm_fusemadd(as, ir, PPCI_FMADD, PPCI_FMADD)) + asm_fparith(as, ir, PPCI_FADD); +- } else { ++ } else ++#endif ++ { + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg right, left = ra_hintalloc(as, ir->op1, dest, RSET_GPR); + PPCIns pi; +@@ -1191,10 +1302,13 @@ static void asm_add(ASMState *as, IRIns + + static void asm_sub(ASMState *as, IRIns *ir) + { ++#if !LJ_SOFTFP + if 
(irt_isnum(ir->t)) { + if (!asm_fusemadd(as, ir, PPCI_FMSUB, PPCI_FNMSUB)) + asm_fparith(as, ir, PPCI_FSUB); +- } else { ++ } else ++#endif ++ { + PPCIns pi = PPCI_SUBF; + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg left, right; +@@ -1220,9 +1334,12 @@ static void asm_sub(ASMState *as, IRIns + + static void asm_mul(ASMState *as, IRIns *ir) + { ++#if !LJ_SOFTFP + if (irt_isnum(ir->t)) { + asm_fparith(as, ir, PPCI_FMUL); +- } else { ++ } else ++#endif ++ { + PPCIns pi = PPCI_MULLW; + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg right, left = ra_hintalloc(as, ir->op1, dest, RSET_GPR); +@@ -1250,9 +1367,12 @@ static void asm_mul(ASMState *as, IRIns + + static void asm_neg(ASMState *as, IRIns *ir) + { ++#if !LJ_SOFTFP + if (irt_isnum(ir->t)) { + asm_fpunary(as, ir, PPCI_FNEG); +- } else { ++ } else ++#endif ++ { + Reg dest, left; + PPCIns pi = PPCI_NEG; + if (as->flagmcp == as->mcp) { +@@ -1563,9 +1683,40 @@ static void asm_bitshift(ASMState *as, I + PPCI_RLWINM|PPCF_MB(0)|PPCF_ME(31)) + #define asm_bror(as, ir) lua_assert(0) + ++#if LJ_SOFTFP ++static void asm_sfpmin_max(ASMState *as, IRIns *ir) ++{ ++ CCallInfo ci = lj_ir_callinfo[IRCALL_softfp_cmp]; ++ IRRef args[4]; ++ MCLabel l_right, l_end; ++ Reg desthi = ra_dest(as, ir, RSET_GPR), destlo = ra_dest(as, ir+1, RSET_GPR); ++ Reg righthi, lefthi = ra_alloc2(as, ir, RSET_GPR); ++ Reg rightlo, leftlo = ra_alloc2(as, ir+1, RSET_GPR); ++ PPCCC cond = (IROp)ir->o == IR_MIN ? 
CC_EQ : CC_NE; ++ righthi = (lefthi >> 8); lefthi &= 255; ++ rightlo = (leftlo >> 8); leftlo &= 255; ++ args[0^LJ_BE] = ir->op1; args[1^LJ_BE] = (ir+1)->op1; ++ args[2^LJ_BE] = ir->op2; args[3^LJ_BE] = (ir+1)->op2; ++ l_end = emit_label(as); ++ if (desthi != righthi) emit_mr(as, desthi, righthi); ++ if (destlo != rightlo) emit_mr(as, destlo, rightlo); ++ l_right = emit_label(as); ++ if (l_end != l_right) emit_jmp(as, l_end); ++ if (desthi != lefthi) emit_mr(as, desthi, lefthi); ++ if (destlo != leftlo) emit_mr(as, destlo, leftlo); ++ if (l_right == as->mcp+1) { ++ cond ^= 4; l_right = l_end; ++as->mcp; ++ } ++ emit_condbranch(as, PPCI_BC, cond, l_right); ++ ra_evictset(as, RSET_SCRATCH); ++ emit_cmpi(as, RID_RET, 1); ++ asm_gencall(as, &ci, args); ++} ++#endif ++ + static void asm_min_max(ASMState *as, IRIns *ir, int ismax) + { +- if (irt_isnum(ir->t)) { ++ if (!LJ_SOFTFP && irt_isnum(ir->t)) { + Reg dest = ra_dest(as, ir, RSET_FPR); + Reg tmp = dest; + Reg right, left = ra_alloc2(as, ir, RSET_FPR); +@@ -1653,7 +1804,7 @@ static void asm_intcomp_(ASMState *as, I + static void asm_comp(ASMState *as, IRIns *ir) + { + PPCCC cc = asm_compmap[ir->o]; +- if (irt_isnum(ir->t)) { ++ if (!LJ_SOFTFP && irt_isnum(ir->t)) { + Reg right, left = ra_alloc2(as, ir, RSET_FPR); + right = (left >> 8); left &= 255; + asm_guardcc(as, (cc >> 4)); +@@ -1674,6 +1825,44 @@ static void asm_comp(ASMState *as, IRIns + + #define asm_equal(as, ir) asm_comp(as, ir) + ++#if LJ_SOFTFP ++/* SFP comparisons. 
*/ ++static void asm_sfpcomp(ASMState *as, IRIns *ir) ++{ ++ const CCallInfo *ci = &lj_ir_callinfo[IRCALL_softfp_cmp]; ++ RegSet drop = RSET_SCRATCH; ++ Reg r; ++ IRRef args[4]; ++ args[0^LJ_BE] = ir->op1; args[1^LJ_BE] = (ir+1)->op1; ++ args[2^LJ_BE] = ir->op2; args[3^LJ_BE] = (ir+1)->op2; ++ ++ for (r = REGARG_FIRSTGPR; r <= REGARG_FIRSTGPR+3; r++) { ++ if (!rset_test(as->freeset, r) && ++ regcost_ref(as->cost[r]) == args[r-REGARG_FIRSTGPR]) ++ rset_clear(drop, r); ++ } ++ ra_evictset(as, drop); ++ asm_setupresult(as, ir, ci); ++ switch ((IROp)ir->o) { ++ case IR_ULT: ++ asm_guardcc(as, CC_EQ); ++ emit_ai(as, PPCI_CMPWI, RID_RET, 0); ++ case IR_ULE: ++ asm_guardcc(as, CC_EQ); ++ emit_ai(as, PPCI_CMPWI, RID_RET, 1); ++ break; ++ case IR_GE: case IR_GT: ++ asm_guardcc(as, CC_EQ); ++ emit_ai(as, PPCI_CMPWI, RID_RET, 2); ++ default: ++ asm_guardcc(as, (asm_compmap[ir->o] & 0xf)); ++ emit_ai(as, PPCI_CMPWI, RID_RET, 0); ++ break; ++ } ++ asm_gencall(as, ci, args); ++} ++#endif ++ + #if LJ_HASFFI + /* 64 bit integer comparisons. */ + static void asm_comp64(ASMState *as, IRIns *ir) +@@ -1703,19 +1892,36 @@ static void asm_comp64(ASMState *as, IRI + /* Hiword op of a split 64 bit op. Previous op must be the loword op. */ + static void asm_hiop(ASMState *as, IRIns *ir) + { +-#if LJ_HASFFI ++#if LJ_HASFFI || LJ_SOFTFP + /* HIOP is marked as a store because it needs its own DCE logic. */ + int uselo = ra_used(ir-1), usehi = ra_used(ir); /* Loword/hiword used? */ + if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1; + if ((ir-1)->o == IR_CONV) { /* Conversions to/from 64 bit. */ + as->curins--; /* Always skip the CONV. */ ++#if LJ_HASFFI && !LJ_SOFTFP + if (usehi || uselo) + asm_conv64(as, ir); + return; ++#endif + } else if ((ir-1)->o <= IR_NE) { /* 64 bit integer comparisons. ORDER IR. */ + as->curins--; /* Always skip the loword comparison. 
*/ ++#if LJ_SOFTFP ++ if (!irt_isint(ir->t)) { ++ asm_sfpcomp(as, ir-1); ++ return; ++ } ++#endif ++#if LJ_HASFFI + asm_comp64(as, ir); ++#endif ++ return; ++#if LJ_SOFTFP ++ } else if ((ir-1)->o == IR_MIN || (ir-1)->o == IR_MAX) { ++ as->curins--; /* Always skip the loword min/max. */ ++ if (uselo || usehi) ++ asm_sfpmin_max(as, ir-1); + return; ++#endif + } else if ((ir-1)->o == IR_XSTORE) { + as->curins--; /* Handle both stores here. */ + if ((ir-1)->r != RID_SINK) { +@@ -1726,14 +1932,27 @@ static void asm_hiop(ASMState *as, IRIns + } + if (!usehi) return; /* Skip unused hiword op for all remaining ops. */ + switch ((ir-1)->o) { ++#if LJ_HASFFI + case IR_ADD: as->curins--; asm_add64(as, ir); break; + case IR_SUB: as->curins--; asm_sub64(as, ir); break; + case IR_NEG: as->curins--; asm_neg64(as, ir); break; ++#endif ++#if LJ_SOFTFP ++ case IR_SLOAD: case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: case IR_VLOAD: ++ case IR_STRTO: ++ if (!uselo) ++ ra_allocref(as, ir->op1, RSET_GPR); /* Mark lo op as used. */ ++ break; ++#endif + case IR_CALLN: ++ case IR_CALLS: + case IR_CALLXS: + if (!uselo) + ra_allocref(as, ir->op1, RID2RSET(RID_RETLO)); /* Mark lo op as used. */ + break; ++#if LJ_SOFTFP ++ case IR_ASTORE: case IR_HSTORE: case IR_USTORE: case IR_TOSTR: ++#endif + case IR_CNEWI: + /* Nothing to do here. Handled by lo op itself. */ + break; +@@ -1797,8 +2016,19 @@ static void asm_stack_restore(ASMState * + if ((sn & SNAP_NORESTORE)) + continue; + if (irt_isnum(ir->t)) { ++#if LJ_SOFTFP ++ Reg tmp; ++ RegSet allow = rset_exclude(RSET_GPR, RID_BASE); ++ lua_assert(irref_isk(ref)); /* LJ_SOFTFP: must be a number constant. 
*/ ++ tmp = ra_allock(as, (int32_t)ir_knum(ir)->u32.lo, allow); ++ emit_tai(as, PPCI_STW, tmp, RID_BASE, ofs+(LJ_BE?4:0)); ++ if (rset_test(as->freeset, tmp+1)) allow = RID2RSET(tmp+1); ++ tmp = ra_allock(as, (int32_t)ir_knum(ir)->u32.hi, allow); ++ emit_tai(as, PPCI_STW, tmp, RID_BASE, ofs+(LJ_BE?0:4)); ++#else + Reg src = ra_alloc1(as, ref, RSET_FPR); + emit_fai(as, PPCI_STFD, src, RID_BASE, ofs); ++#endif + } else { + Reg type; + RegSet allow = rset_exclude(RSET_GPR, RID_BASE); +@@ -1811,6 +2041,10 @@ static void asm_stack_restore(ASMState * + if ((sn & (SNAP_CONT|SNAP_FRAME))) { + if (s == 0) continue; /* Do not overwrite link to previous frame. */ + type = ra_allock(as, (int32_t)(*flinks--), allow); ++#if LJ_SOFTFP ++ } else if ((sn & SNAP_SOFTFPNUM)) { ++ type = ra_alloc1(as, ref+1, rset_exclude(RSET_GPR, RID_BASE)); ++#endif + } else { + type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow); + } +@@ -1947,14 +2181,15 @@ static Reg asm_setup_call_slots(ASMState + int nslots = 2, ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR; + asm_collectargs(as, ir, ci, args); + for (i = 0; i < nargs; i++) +- if (args[i] && irt_isfp(IR(args[i])->t)) { ++ if (!LJ_SOFTFP && args[i] && irt_isfp(IR(args[i])->t)) { + if (nfpr > 0) nfpr--; else nslots = (nslots+3) & ~1; + } else { + if (ngpr > 0) ngpr--; else nslots++; + } + if (nslots > as->evenspill) /* Leave room for args in stack slots. */ + as->evenspill = nslots; +- return irt_isfp(ir->t) ? REGSP_HINT(RID_FPRET) : REGSP_HINT(RID_RET); ++ return (!LJ_SOFTFP && irt_isfp(ir->t)) ? 
REGSP_HINT(RID_FPRET) : ++ REGSP_HINT(RID_RET); + } + + static void asm_setup_target(ASMState *as) diff --git a/lang/luajit/patches/060-ppc-musl.patch b/lang/luajit/patches/060-ppc-musl.patch new file mode 100644 index 0000000000..119fedd21b --- /dev/null +++ b/lang/luajit/patches/060-ppc-musl.patch @@ -0,0 +1,112 @@ +From 195e0be62c0aa0f6aaf63a93ee322bb0a630576c Mon Sep 17 00:00:00 2001 +From: Clint Bland +Date: Wed, 13 Mar 2019 19:19:16 -0700 +Subject: [PATCH] Have powerpc use fake GOT like MIPS + +--- + src/lj_dispatch.c | 15 +++++++++++++++ + src/lj_dispatch.h | 29 ++++++++++++++++++++++++++++- + src/vm_ppc.dasc | 9 ++++++++- + 3 files changed, 51 insertions(+), 2 deletions(-) + +--- a/src/lj_dispatch.c ++++ b/src/lj_dispatch.c +@@ -56,6 +56,18 @@ static const ASMFunction dispatch_got[] + #undef GOTFUNC + #endif + ++#if LJ_TARGET_PPC ++#include <math.h> ++LJ_FUNCA_NORET void LJ_FASTCALL lj_ffh_coroutine_wrap_err(lua_State *L, ++ lua_State *co); ++ ++#define GOTFUNC(name) (ASMFunction)name, ++static const ASMFunction dispatch_got[] = { ++ GOTDEF(GOTFUNC) ++}; ++#undef GOTFUNC ++#endif ++ + /* Initialize instruction dispatch table and hot counters. */ + void lj_dispatch_init(GG_State *GG) + { +@@ -77,6 +89,9 @@ void lj_dispatch_init(GG_State *GG) + #if LJ_TARGET_MIPS + memcpy(GG->got, dispatch_got, LJ_GOT__MAX*sizeof(ASMFunction *)); + #endif ++#if LJ_TARGET_PPC ++ memcpy(GG->got, dispatch_got, LJ_GOT__MAX*4); ++#endif + } + + #if LJ_HASJIT +--- a/src/lj_dispatch.h ++++ b/src/lj_dispatch.h +@@ -66,6 +66,33 @@ GOTDEF(GOTENUM) + }; + #endif + ++#if LJ_TARGET_PPC ++/* Need our own global offset table for the dreaded MIPS calling conventions. 
*/ ++#if LJ_SOFTFP ++#ifndef _LJ_IRCALL_H ++extern double __ledf2(double a, double b); ++extern double __adddf3(double a, double b); ++extern double __subdf3(double a, double b); ++extern double __muldf3(double a, double b); ++extern double __divdf3(double a, double b); ++#endif ++#define SFGOTDEF(_) _(__ledf2) _(__adddf3) _(__subdf3) _(__muldf3) _(__divdf3) ++#else ++#define SFGOTDEF(_) ++#endif ++#define GOTDEF(_) \ ++ _(floor) _(ceil) _(trunc) _(log) _(log10) _(exp) _(sin) _(cos) _(tan) \ ++ _(asin) _(acos) _(atan) _(sinh) _(cosh) _(tanh) _(frexp) _(modf) _(atan2) \ ++ _(pow) _(fmod) _(ldexp) _(sqrt) SFGOTDEF(_) ++ ++enum { ++#define GOTENUM(name) LJ_GOT_##name, ++GOTDEF(GOTENUM) ++#undef GOTENUM ++ LJ_GOT__MAX ++}; ++#endif ++ + /* Type of hot counter. Must match the code in the assembler VM. */ + /* 16 bits are sufficient. Only 0.0015% overhead with maximum slot penalty. */ + typedef uint16_t HotCount; +@@ -89,7 +116,7 @@ typedef uint16_t HotCount; + typedef struct GG_State { + lua_State L; /* Main thread. */ + global_State g; /* Global state. */ +-#if LJ_TARGET_MIPS ++#if LJ_TARGET_MIPS || LJ_TARGET_PPC + ASMFunction got[LJ_GOT__MAX]; /* Global offset table. */ + #endif + #if LJ_HASJIT +--- a/src/vm_ppc.dasc ++++ b/src/vm_ppc.dasc +@@ -59,7 +59,12 @@ + |.define ENV_OFS, 8 + |.endif + |.else // No TOC. +-|.macro blex, target; bl extern target@plt; .endmacro ++|.macro blex, target ++| lwz TMP0, DISPATCH_GOT(target)(DISPATCH) ++| mtctr TMP0 ++| bctrl ++| //bl extern target@plt ++|.endmacro + |.macro .toc, a, b; .endmacro + |.endif + |.macro .tocenv, a, b; .if TOCENV; a, b; .endif; .endmacro +@@ -482,6 +487,8 @@ + |// Assumes DISPATCH is relative to GL. 
+ #define DISPATCH_GL(field) (GG_DISP2G + (int)offsetof(global_State, field)) + #define DISPATCH_J(field) (GG_DISP2J + (int)offsetof(jit_State, field)) ++#define GG_DISP2GOT (GG_OFS(got) - GG_OFS(dispatch)) ++#define DISPATCH_GOT(name) (GG_DISP2GOT + 4*LJ_GOT_##name) + | + #define PC2PROTO(field) ((int)offsetof(GCproto, field)-(int)sizeof(GCproto)) + |