Make codebase more CPU-agnostic + MacOS disclaimer (#559)

This commit is contained in:
Exzap 2022-12-07 00:48:24 +00:00 committed by GitHub
parent 445b0afa95
commit 2c81d240a5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
26 changed files with 416 additions and 272 deletions

View file

@ -2,10 +2,6 @@
project ("ih264d")
set(LIBAVCDEC_X86_INCLUDES "common/x86" "decoder/x86")
include_directories("common/" "decoder/" ${LIBAVCDEC_X86_INCLUDES})
add_library (ih264d
"common/ih264_buf_mgr.c"
"common/ih264_buf_mgr.h"
@ -53,21 +49,6 @@ add_library (ih264d
"common/ih264_weighted_pred.h"
"common/ithread.c"
"common/ithread.h"
"common/x86/ih264_chroma_intra_pred_filters_ssse3.c"
"common/x86/ih264_deblk_chroma_ssse3.c"
"common/x86/ih264_deblk_luma_ssse3.c"
"common/x86/ih264_ihadamard_scaling_sse42.c"
"common/x86/ih264_ihadamard_scaling_ssse3.c"
"common/x86/ih264_inter_pred_filters_ssse3.c"
"common/x86/ih264_iquant_itrans_recon_dc_ssse3.c"
"common/x86/ih264_iquant_itrans_recon_sse42.c"
"common/x86/ih264_iquant_itrans_recon_ssse3.c"
"common/x86/ih264_luma_intra_pred_filters_ssse3.c"
"common/x86/ih264_mem_fns_ssse3.c"
"common/x86/ih264_padding_ssse3.c"
"common/x86/ih264_platform_macros.h"
"common/x86/ih264_resi_trans_quant_sse42.c"
"common/x86/ih264_weighted_pred_sse42.c"
"decoder/ih264d.h"
"decoder/ih264d_api.c"
"decoder/ih264d_bitstrm.c"
@ -134,10 +115,71 @@ add_library (ih264d
"decoder/ih264d_vui.h"
"decoder/iv.h"
"decoder/ivd.h"
)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64")
set(LIBAVCDEC_X86_INCLUDES "common/x86" "decoder/x86")
include_directories("common/" "decoder/" ${LIBAVCDEC_X86_INCLUDES})
target_sources(ih264d PRIVATE
"common/x86/ih264_chroma_intra_pred_filters_ssse3.c"
"common/x86/ih264_deblk_chroma_ssse3.c"
"common/x86/ih264_deblk_luma_ssse3.c"
"common/x86/ih264_ihadamard_scaling_sse42.c"
"common/x86/ih264_ihadamard_scaling_ssse3.c"
"common/x86/ih264_inter_pred_filters_ssse3.c"
"common/x86/ih264_iquant_itrans_recon_dc_ssse3.c"
"common/x86/ih264_iquant_itrans_recon_sse42.c"
"common/x86/ih264_iquant_itrans_recon_ssse3.c"
"common/x86/ih264_luma_intra_pred_filters_ssse3.c"
"common/x86/ih264_mem_fns_ssse3.c"
"common/x86/ih264_padding_ssse3.c"
"common/x86/ih264_platform_macros.h"
"common/x86/ih264_resi_trans_quant_sse42.c"
"common/x86/ih264_weighted_pred_sse42.c"
"decoder/x86/ih264d_function_selector.c"
"decoder/x86/ih264d_function_selector_sse42.c"
"decoder/x86/ih264d_function_selector_ssse3.c"
)
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
enable_language( C CXX ASM )
set(LIBAVCDEC_ARM_INCLUDES "common/armv8" "decoder/arm")
include_directories("common/" "decoder/" ${LIBAVCDEC_ARM_INCLUDES})
target_sources(ih264d PRIVATE
"common/armv8/ih264_deblk_chroma_av8.s"
"common/armv8/ih264_deblk_luma_av8.s"
"common/armv8/ih264_default_weighted_pred_av8.s"
"common/armv8/ih264_ihadamard_scaling_av8.s"
"common/armv8/ih264_inter_pred_chroma_av8.s"
"common/armv8/ih264_inter_pred_filters_luma_horz_av8.s"
"common/armv8/ih264_inter_pred_filters_luma_vert_av8.s"
"common/armv8/ih264_inter_pred_luma_copy_av8.s"
"common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s"
"common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s"
"common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s"
"common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s"
"common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s"
"common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s"
"common/armv8/ih264_intra_pred_chroma_av8.s"
"common/armv8/ih264_intra_pred_luma_16x16_av8.s"
"common/armv8/ih264_intra_pred_luma_4x4_av8.s"
"common/armv8/ih264_intra_pred_luma_8x8_av8.s"
"common/armv8/ih264_iquant_itrans_recon_av8.s"
"common/armv8/ih264_iquant_itrans_recon_dc_av8.s"
"common/armv8/ih264_mem_fns_neon_av8.s"
"common/armv8/ih264_neon_macros.s"
"common/armv8/ih264_padding_neon_av8.s"
"common/armv8/ih264_platform_macros.h"
"common/armv8/ih264_resi_trans_quant_av8.s"
"common/armv8/ih264_weighted_bi_pred_av8.s"
"common/armv8/ih264_weighted_pred_av8.s"
"decoder/arm/ih264d_function_selector_a9q.c"
"decoder/arm/ih264d_function_selector_av8.c"
"decoder/arm/ih264d_function_selector.c"
)
target_compile_options(ih264d PRIVATE -DARMV8)
else()
message(FATAL_ERROR "ih264d unknown architecture: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
if(MSVC)
set_property(TARGET ih264d PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")

View file

@ -83,7 +83,7 @@ void hleExport_xcx_enterCriticalSection(PPCInterpreter_t* hCPU)
osLib_returnFromFunction(hCPU, 0);
return;
}
_mm_pause();
_mm_pause();
}
PPCCore_switchToScheduler();
}

View file

@ -203,7 +203,6 @@ extern uint64 ppcMainThreadDECCycleStart; // at which cycle the dec register was
void PPCTimer_init();
void PPCTimer_waitForInit();
uint64 PPCTimer_getFromRDTSC();
bool PPCTimer_hasInvariantRDTSCSupport();
uint64 PPCTimer_microsecondsToTsc(uint64 us);
uint64 PPCTimer_tscToMicroseconds(uint64 us);

View file

@ -1,9 +1,14 @@
#include "Cafe/HW/Espresso/Const.h"
#include <immintrin.h>
#include "asm/x64util.h"
#include "config/ActiveSettings.h"
#include "util/helpers/fspinlock.h"
#include "util/highresolutiontimer/HighResolutionTimer.h"
#include "Common/cpu_features.h"
#if defined(ARCH_X86_64)
#include <immintrin.h>
#pragma intrinsic(__rdtsc)
#endif
uint64 _rdtscLastMeasure = 0;
uint64 _rdtscFrequency = 0;
@ -18,8 +23,6 @@ static_assert(sizeof(uint128_t) == 16);
uint128_t _rdtscAcc{};
#pragma intrinsic(__rdtsc)
uint64 muldiv64(uint64 a, uint64 b, uint64 d)
{
uint64 diva = a / d;
@ -29,17 +32,12 @@ uint64 muldiv64(uint64 a, uint64 b, uint64 d)
return diva * b + moda * divb + moda * modb / d;
}
bool PPCTimer_hasInvariantRDTSCSupport()
{
uint32 cpuv[4];
cpuid((int*)cpuv, 0x80000007);
return ((cpuv[3] >> 8) & 1);
}
uint64 PPCTimer_estimateRDTSCFrequency()
{
if (PPCTimer_hasInvariantRDTSCSupport() == false)
forceLog_printf("Invariant TSC not supported");
#if defined(ARCH_X86_64)
if (!g_CPUFeatures.x86.invariant_tsc)
cemuLog_log(LogType::Force, "Invariant TSC not supported");
#endif
_mm_mfence();
uint64 tscStart = __rdtsc();

View file

@ -8,11 +8,10 @@
#include "Cafe/OS/libs/coreinit/coreinit_CodeGen.h"
#include "config/ActiveSettings.h"
#include "config/LaunchSettings.h"
#include "util/helpers/fspinlock.h"
#include "Common/ExceptionHandler/ExceptionHandler.h"
#include "Common/cpu_features.h"
#include "util/helpers/fspinlock.h"
#include "util/helpers/helpers.h"
#include "util/MemMapper/MemMapper.h"
struct PPCInvalidationRange
@ -461,6 +460,20 @@ void PPCRecompiler_invalidateRange(uint32 startAddr, uint32 endAddr)
PPCRecompilerState.recompilerSpinlock.unlock();
}
#if defined(ARCH_X86_64)
void PPCRecompiler_initPlatform()
{
// mxcsr
ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOn = 0x1F80 | 0x8000;
ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOff = 0x1F80;
}
#else
void PPCRecompiler_initPlatform()
{
}
#endif
void PPCRecompiler_init()
{
if (ActiveSettings::GetCPUMode() == CPUMode::SinglecoreInterpreter)
@ -569,21 +582,9 @@ void PPCRecompiler_init()
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[(i + 32) * 2 + 1] = br;
}
// mxcsr
ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOn = 0x1F80 | 0x8000;
ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOff = 0x1F80;
// query processor extensions
int cpuInfo[4];
cpuid(cpuInfo, 0x80000001);
hasLZCNTSupport = ((cpuInfo[2] >> 5) & 1) != 0;
cpuid(cpuInfo, 0x1);
hasMOVBESupport = ((cpuInfo[2] >> 22) & 1) != 0;
hasAVXSupport = ((cpuInfo[2] >> 28) & 1) != 0;
cpuidex(cpuInfo, 0x7, 0);
hasBMI2Support = ((cpuInfo[1] >> 8) & 1) != 0;
forceLog_printf("Recompiler initialized. CPU extensions: %s%s%s", hasLZCNTSupport ? "LZCNT " : "", hasMOVBESupport ? "MOVBE " : "", hasAVXSupport ? "AVX " : "");
PPCRecompiler_initPlatform();
forceLog_printf("Recompiler initialized");
ppcRecompilerEnabled = true;

View file

@ -384,12 +384,6 @@ extern void ATTR_MS_ABI (*PPCRecompiler_leaveRecompilerCode_unvisited)();
#define PPC_REC_INVALID_FUNCTION ((PPCRecFunction_t*)-1)
// CPUID
extern bool hasLZCNTSupport;
extern bool hasMOVBESupport;
extern bool hasBMI2Support;
extern bool hasAVXSupport;
// todo - move some of the stuff above into PPCRecompilerInternal.h
// recompiler interface

View file

@ -3,14 +3,6 @@
#include "PPCRecompilerIml.h"
#include "Cafe/GameProfile/GameProfile.h"
bool hasSSE1Support = true;
bool hasSSE2Support = true;
bool hasSSE3Support = true;
bool hasLZCNTSupport = false;
bool hasMOVBESupport = false;
bool hasBMI2Support = false;
bool hasAVXSupport = false;
void PPCRecompilerImlGen_generateNewInstruction_fpr_r_memory(ppcImlGenContext_t* ppcImlGenContext, uint8 registerDestination, uint8 registerMemory, sint32 immS32, uint32 mode, bool switchEndian, uint8 registerGQR = PPC_REC_INVALID_REGISTER)
{
// load from memory
@ -145,8 +137,6 @@ void PPRecompilerImmGen_optionalRoundPairFPRToSinglePrecision(ppcImlGenContext_t
bool PPCRecompilerImlGen_LFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE1Support == false )
return false;
sint32 rA, frD;
uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -167,8 +157,6 @@ bool PPCRecompilerImlGen_LFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode
bool PPCRecompilerImlGen_LFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE1Support == false )
return false;
sint32 rA, frD;
uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -191,8 +179,6 @@ bool PPCRecompilerImlGen_LFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
bool PPCRecompilerImlGen_LFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE2Support == false )
return false;
sint32 rA, frD, rB;
PPC_OPC_TEMPL_X(opcode, frD, rA, rB);
if( rA == 0 )
@ -218,8 +204,6 @@ bool PPCRecompilerImlGen_LFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
bool PPCRecompilerImlGen_LFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE2Support == false )
return false;
sint32 rA, frD, rB;
PPC_OPC_TEMPL_X(opcode, frD, rA, rB);
if( rA == 0 )
@ -247,8 +231,6 @@ bool PPCRecompilerImlGen_LFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
bool PPCRecompilerImlGen_LFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE1Support == false )
return false;
sint32 rA, frD;
uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -266,8 +248,6 @@ bool PPCRecompilerImlGen_LFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode
bool PPCRecompilerImlGen_LFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE1Support == false )
return false;
sint32 rA, frD;
uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -288,8 +268,6 @@ bool PPCRecompilerImlGen_LFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
bool PPCRecompilerImlGen_LFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE2Support == false )
return false;
sint32 rA, frD, rB;
PPC_OPC_TEMPL_X(opcode, frD, rA, rB);
if( rA == 0 )
@ -308,8 +286,6 @@ bool PPCRecompilerImlGen_LFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
bool PPCRecompilerImlGen_LFDUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE2Support == false )
return false;
sint32 rA, frD, rB;
PPC_OPC_TEMPL_X(opcode, frD, rA, rB);
if( rA == 0 )
@ -330,8 +306,6 @@ bool PPCRecompilerImlGen_LFDUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
bool PPCRecompilerImlGen_STFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE1Support == false )
return false;
sint32 rA, frD;
uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -346,8 +320,6 @@ bool PPCRecompilerImlGen_STFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
bool PPCRecompilerImlGen_STFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE1Support == false )
return false;
sint32 rA, frD;
uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -364,8 +336,6 @@ bool PPCRecompilerImlGen_STFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
bool PPCRecompilerImlGen_STFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE2Support == false )
return false;
sint32 rA, frS, rB;
PPC_OPC_TEMPL_X(opcode, frS, rA, rB);
if( rA == 0 )
@ -392,8 +362,6 @@ bool PPCRecompilerImlGen_STFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
bool PPCRecompilerImlGen_STFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE2Support == false )
return false;
sint32 rA, frS, rB;
PPC_OPC_TEMPL_X(opcode, frS, rA, rB);
if( rA == 0 )
@ -415,8 +383,6 @@ bool PPCRecompilerImlGen_STFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opc
bool PPCRecompilerImlGen_STFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE1Support == false )
return false;
sint32 rA, frD;
uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -435,8 +401,6 @@ bool PPCRecompilerImlGen_STFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
bool PPCRecompilerImlGen_STFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE1Support == false )
return false;
sint32 rA, frD;
uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -458,8 +422,6 @@ bool PPCRecompilerImlGen_STFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
bool PPCRecompilerImlGen_STFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE2Support == false )
return false;
sint32 rA, frS, rB;
PPC_OPC_TEMPL_X(opcode, frS, rA, rB);
if( rA == 0 )
@ -485,8 +447,6 @@ bool PPCRecompilerImlGen_STFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
bool PPCRecompilerImlGen_STFIWX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if( hasSSE2Support == false )
return false;
sint32 rA, frS, rB;
PPC_OPC_TEMPL_X(opcode, frS, rA, rB);
// get memory gpr registers
@ -959,10 +919,6 @@ bool PPCRecompilerImlGen_FCMPO(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
sint32 crfD, frA, frB;
PPC_OPC_TEMPL_X(opcode, crfD, frA, frB);
crfD >>= 2;
if( hasSSE2Support == false )
{
return false;
}
uint32 fprRegisterA = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frA);
uint32 fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB);
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_FCMPO_BOTTOM, fprRegisterA, fprRegisterB, crfD);
@ -974,10 +930,6 @@ bool PPCRecompilerImlGen_FCMPU(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
sint32 crfD, frA, frB;
PPC_OPC_TEMPL_X(opcode, crfD, frA, frB);
crfD >>= 2;
if( hasSSE2Support == false )
{
return false;
}
uint32 fprRegisterA = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frA);
uint32 fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB);
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_FCMPU_BOTTOM, fprRegisterA, fprRegisterB, crfD);
@ -1120,8 +1072,6 @@ bool PPCRecompilerImlGen_FCTIWZ(ppcImlGenContext_t* ppcImlGenContext, uint32 opc
bool PPCRecompilerImlGen_PSQ_L(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if (hasSSE2Support == false)
return false;
int rA, frD;
uint32 immUnused;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, immUnused);
@ -1146,8 +1096,6 @@ bool PPCRecompilerImlGen_PSQ_L(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
bool PPCRecompilerImlGen_PSQ_LU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{
if (hasSSE2Support == false)
return false;
int rA, frD;
uint32 immUnused;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, immUnused);

View file

@ -5,8 +5,8 @@
#include "PPCRecompilerIml.h"
#include "PPCRecompilerX64.h"
#include "Cafe/OS/libs/coreinit/coreinit_Time.h"
#include "util/MemMapper/MemMapper.h"
#include "Common/cpu_features.h"
sint32 x64Gen_registerMap[12] = // virtual GPR to x64 register mapping
{
@ -381,7 +381,7 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p
{
x64Gen_lea_reg64Low32_reg64Low32PlusReg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem, realRegisterMem2);
}
if( hasMOVBESupport && switchEndian )
if( g_CPUFeatures.x86.movbe && switchEndian )
{
if (indexed)
{
@ -419,7 +419,7 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p
{
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
if( hasMOVBESupport && switchEndian )
if( g_CPUFeatures.x86.movbe && switchEndian )
{
x64Gen_movBEZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData )
@ -477,7 +477,7 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p
assert_dbg();
if( indexed )
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); // can be replaced with LEA temp, [memReg1+memReg2] (this way we can avoid the SUB instruction after the move)
if( hasMOVBESupport )
if( g_CPUFeatures.x86.movbe )
{
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData )
@ -537,7 +537,7 @@ bool PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction_t* PPCRecFunction,
if (indexed)
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
uint32 valueRegister;
if ((swapEndian == false || hasMOVBESupport) && realRegisterMem != realRegisterData)
if ((swapEndian == false || g_CPUFeatures.x86.movbe) && realRegisterMem != realRegisterData)
{
valueRegister = realRegisterData;
}
@ -546,11 +546,11 @@ bool PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction_t* PPCRecFunction,
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
valueRegister = REG_RESV_TEMP;
}
if (hasMOVBESupport == false && swapEndian)
if (g_CPUFeatures.x86.movbe == false && swapEndian)
x64Gen_bswap_reg64Lower32bit(x64GenContext, valueRegister);
if (indexed)
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
if (hasMOVBESupport && swapEndian)
if (g_CPUFeatures.x86.movbe && swapEndian)
x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister);
else
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister);
@ -802,8 +802,7 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, pp
// count leading zeros
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
cemu_assert_debug(imlInstruction->crRegister == PPC_REC_INVALID_REGISTER);
// LZCNT instruction (part of SSE4, CPUID.80000001H:ECX.ABM[Bit 5])
if( hasLZCNTSupport )
if( g_CPUFeatures.x86.lzcnt )
{
x64Gen_lzcnt_reg64Low32_reg64Low32(x64GenContext, tempToRealRegister(imlInstruction->op_r_r.registerResult), tempToRealRegister(imlInstruction->op_r_r.registerA));
}
@ -1521,12 +1520,12 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r_r(PPCRecFunction_t* PPCRecFunction,
sint32 rRegOperand1 = tempToRealRegister(imlInstruction->op_r_r_r.registerA);
sint32 rRegOperand2 = tempToRealRegister(imlInstruction->op_r_r_r.registerB);
if (hasBMI2Support && imlInstruction->operation == PPCREC_IML_OP_SRW)
if (g_CPUFeatures.x86.bmi2 && imlInstruction->operation == PPCREC_IML_OP_SRW)
{
// use BMI2 SHRX if available
x64Gen_shrx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2);
}
else if (hasBMI2Support && imlInstruction->operation == PPCREC_IML_OP_SLW)
else if (g_CPUFeatures.x86.bmi2 && imlInstruction->operation == PPCREC_IML_OP_SLW)
{
// use BMI2 SHLX if available
x64Gen_shlx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2);

View file

@ -2,6 +2,7 @@
#include "PPCRecompilerIml.h"
#include "PPCRecompilerX64.h"
#include "asm/x64util.h"
#include "Common/cpu_features.h"
void PPCRecompilerX64Gen_imlInstruction_fpr_r_name(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, PPCRecImlInstruction_t* imlInstruction)
{
@ -86,7 +87,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext,
{
x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memRegEx);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memReg);
if (hasMOVBESupport)
if (g_CPUFeatures.x86.movbe)
{
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, memImmS32);
}
@ -98,7 +99,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext,
}
else
{
if (hasMOVBESupport)
if (g_CPUFeatures.x86.movbe)
{
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32);
}
@ -108,7 +109,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext,
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
}
}
if (hasAVXSupport)
if (g_CPUFeatures.x86.avx)
{
x64Gen_movd_xmmReg_reg64Low32(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_TEMP);
}
@ -280,21 +281,21 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
{
x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem2);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem);
if( hasMOVBESupport )
if( g_CPUFeatures.x86.movbe )
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32);
else
x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32);
}
else
{
if( hasMOVBESupport )
if( g_CPUFeatures.x86.movbe )
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32);
else
x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32);
}
if( hasMOVBESupport == false )
if( g_CPUFeatures.x86.movbe == false )
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if( hasAVXSupport )
if( g_CPUFeatures.x86.avx )
{
x64Gen_movd_xmmReg_reg64Low32(x64GenContext, realRegisterXMM, REG_RESV_TEMP);
}
@ -316,7 +317,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
}
else if( mode == PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0 )
{
if( hasAVXSupport )
if( g_CPUFeatures.x86.avx )
{
if( indexed )
{
@ -419,7 +420,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext_t* ppcImlGenContext
if (mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0)
{
x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM);
if (hasAVXSupport)
if (g_CPUFeatures.x86.avx)
{
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
}
@ -428,14 +429,14 @@ void PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext_t* ppcImlGenContext
x64Gen_movsd_memReg64_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
}
if (hasMOVBESupport == false)
if (g_CPUFeatures.x86.movbe == false)
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if (indexed)
{
cemu_assert_debug(memReg != memRegEx);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, memReg, memRegEx);
}
if (hasMOVBESupport)
if (g_CPUFeatures.x86.movbe)
x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP);
else
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP);
@ -604,7 +605,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
if (imlInstruction->op_storeLoad.flags2.notExpanded)
{
// value is already in single format
if (hasAVXSupport)
if (g_CPUFeatures.x86.avx)
{
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM);
}
@ -617,7 +618,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
else
{
x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, realRegisterXMM);
if (hasAVXSupport)
if (g_CPUFeatures.x86.avx)
{
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
}
@ -627,7 +628,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
}
}
if( hasMOVBESupport == false )
if( g_CPUFeatures.x86.movbe == false )
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if( indexed )
{
@ -635,7 +636,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
assert_dbg();
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
if( hasMOVBESupport )
if( g_CPUFeatures.x86.movbe )
x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP);
else
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP);
@ -668,7 +669,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
}
else if( mode == PPCREC_FPR_ST_MODE_UI32_FROM_PS0 )
{
if( hasAVXSupport )
if( g_CPUFeatures.x86.avx )
{
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM);
}
@ -749,7 +750,7 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r(PPCRecFunction_t* PPCRecFunction
// unpack top to bottom and top
x64Gen_unpckhpd_xmmReg_xmmReg(x64GenContext, imlInstruction->op_fpr_r_r.registerResult, imlInstruction->op_fpr_r_r.registerOperand);
}
//else if ( hasAVXSupport )
//else if ( g_CPUFeatures.x86.avx )
//{
// // unpack top to bottom and top with non-destructive destination
// // update: On Ivy Bridge this causes weird stalls?
@ -1056,7 +1057,7 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r(PPCRecFunction_t* PPCRecFuncti
{
x64Gen_subpd_xmmReg_xmmReg(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandB);
}
else if (hasAVXSupport)
else if (g_CPUFeatures.x86.avx)
{
x64Gen_avx_VSUBPD_xmm_xmm_xmm(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandA, imlInstruction->op_fpr_r_r_r.registerOperandB);
}

View file

@ -1,8 +1,9 @@
#include "Cafe/HW/Latte/Core/LatteConst.h"
#include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "Common/cpu_features.h"
#if defined(ARCH_X86_64)
#if __GNUC__
#include <immintrin.h>
#endif
@ -14,6 +15,7 @@
#define ATTRIBUTE_AVX2
#define ATTRIBUTE_SSE41
#endif
#endif
struct
{
@ -292,6 +294,7 @@ void LatteIndices_generateAutoLineLoopIndices(void* indexDataOutput, uint32 coun
indexMax = std::max(count, 1u) - 1;
}
#if defined(ARCH_X86_64)
ATTRIBUTE_AVX2
void LatteIndices_fastConvertU16_AVX2(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
{
@ -487,6 +490,7 @@ void LatteIndices_fastConvertU32_AVX2(const void* indexDataInput, void* indexDat
indexMax = std::max(indexMax, _maxIndex);
indexMin = std::min(indexMin, _minIndex);
}
#endif
template<typename T>
void _LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, uint32 count, uint32 primitiveRestartIndex, uint32& indexMin, uint32& indexMax)
@ -669,19 +673,27 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
{
if (indexType == LatteIndexType::U16_BE)
{
if (_cpuExtension_AVX2)
#if defined(ARCH_X86_64)
if (g_CPUFeatures.x86.avx2)
LatteIndices_fastConvertU16_AVX2(indexData, indexOutputPtr, count, indexMin, indexMax);
else if (_cpuExtension_SSE4_1 && _cpuExtension_SSSE3)
else if (g_CPUFeatures.x86.sse4_1 && g_CPUFeatures.x86.ssse3)
LatteIndices_fastConvertU16_SSE41(indexData, indexOutputPtr, count, indexMin, indexMax);
else
LatteIndices_convertBE<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax);
#else
LatteIndices_convertBE<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax);
#endif
}
else if (indexType == LatteIndexType::U32_BE)
{
if (_cpuExtension_AVX2)
#if defined(ARCH_X86_64)
if (g_CPUFeatures.x86.avx2)
LatteIndices_fastConvertU32_AVX2(indexData, indexOutputPtr, count, indexMin, indexMax);
else
LatteIndices_convertBE<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax);
#else
LatteIndices_convertBE<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax);
#endif
}
else if (indexType == LatteIndexType::U16_LE)
{
@ -714,4 +726,4 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
LatteIndexCache.outputCount = outputCount;
LatteIndexCache.indexBufferOffset = indexBufferOffset;
LatteIndexCache.indexBufferIndex = indexBufferIndex;
}
}

View file

@ -2,6 +2,7 @@
#include "Cafe/HW/Latte/Core/LatteDraw.h"
#include "Cafe/HW/Latte/Core/LatteTexture.h"
#include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "Common/cpu_features.h"
std::unordered_set<LatteTexture*> g_allTextures;
@ -146,7 +147,7 @@ uint32 LatteTexture_CalculateTextureDataHash(LatteTexture* hostTexture)
if( isCompressedFormat == false )
{
#if BOOST_OS_WINDOWS
if (_cpuExtension_AVX2)
if (g_CPUFeatures.x86.avx2)
{
__m256i h256 = { 0 };
__m256i* readPtr = (__m256i*)texDataU32;

View file

@ -1075,7 +1075,9 @@ namespace coreinit
{
OSHostThread* hostThread = (OSHostThread*)_thread;
#if defined(ARCH_X86_64)
_mm_setcsr(_mm_getcsr() | 0x8000); // flush denormals to zero
#endif
uint32 lehmer_lcg = 12345;
@ -1116,7 +1118,9 @@ namespace coreinit
{
SetThreadName(fmt::format("OSSchedulerThread[core={}]", (uintptr_t)_assignedCoreIndex).c_str());
t_assignedCoreIndex = (sint32)(uintptr_t)_assignedCoreIndex;
#if defined(ARCH_X86_64)
_mm_setcsr(_mm_getcsr() | 0x8000); // flush denormals to zero
#endif
t_schedulerFiber = Fiber::PrepareCurrentThread();
// create scheduler idle fiber and switch to it

View file

@ -1,5 +1,7 @@
add_library(CemuCommon
betype.h
cpu_features.cpp
cpu_features.h
enumFlags.h
ExceptionHandler/ExceptionHandler.h
FileStream.h

101
src/Common/cpu_features.cpp Normal file
View file

@ -0,0 +1,101 @@
#include "cpu_features.h"
// wrappers with uniform prototype for implementation-specific x86 CPU id
#if defined(ARCH_X86_64)
#ifdef __GNUC__
#include <cpuid.h>
#endif
inline void cpuid(int cpuInfo[4], int functionId) {
#if defined(_MSC_VER)
__cpuid(cpuInfo, functionId);
#elif defined(__GNUC__)
__cpuid(functionId, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]);
#else
#error No definition for cpuid
#endif
}
inline void cpuidex(int cpuInfo[4], int functionId, int subFunctionId) {
#if defined(_MSC_VER)
__cpuidex(cpuInfo, functionId, subFunctionId);
#elif defined(__GNUC__)
__cpuid_count(functionId, subFunctionId, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]);
#else
#error No definition for cpuidex
#endif
}
#endif
CPUFeaturesImpl::CPUFeaturesImpl()
{
#if defined(ARCH_X86_64)
int cpuInfo[4];
cpuid(cpuInfo, 0x80000001);
x86.lzcnt = ((cpuInfo[2] >> 5) & 1) != 0;
cpuid(cpuInfo, 0x1);
x86.movbe = ((cpuInfo[2] >> 22) & 1) != 0;
x86.avx = ((cpuInfo[2] >> 28) & 1) != 0;
x86.aesni = ((cpuInfo[2] >> 25) & 1) != 0;
x86.ssse3 = ((cpuInfo[2] >> 9) & 1) != 0;
x86.sse4_1 = ((cpuInfo[2] >> 19) & 1) != 0;
cpuidex(cpuInfo, 0x7, 0);
x86.avx2 = ((cpuInfo[1] >> 5) & 1) != 0;
x86.bmi2 = ((cpuInfo[1] >> 8) & 1) != 0;
cpuid(cpuInfo, 0x80000007);
x86.invariant_tsc = ((cpuInfo[3] >> 8) & 1);
// get CPU brand name
uint32_t nExIds, i = 0;
memset(m_cpuBrandName, 0, sizeof(m_cpuBrandName));
cpuid(cpuInfo, 0x80000000);
nExIds = (uint32_t)cpuInfo[0];
for (uint32_t i = 0x80000000; i <= nExIds; ++i)
{
cpuid(cpuInfo, i);
if (i == 0x80000002)
memcpy(m_cpuBrandName, cpuInfo, sizeof(cpuInfo));
else if (i == 0x80000003)
memcpy(m_cpuBrandName + 16, cpuInfo, sizeof(cpuInfo));
else if (i == 0x80000004)
memcpy(m_cpuBrandName + 32, cpuInfo, sizeof(cpuInfo));
}
#endif
}
std::string CPUFeaturesImpl::GetCPUName()
{
return { m_cpuBrandName };
}
std::string CPUFeaturesImpl::GetCommaSeparatedExtensionList()
{
std::string tmp;
auto appendExt = [&tmp](const char* str)
{
if (!tmp.empty())
tmp.append(", ");
tmp.append(str);
};
if (x86.ssse3)
appendExt("SSSE3");
if (x86.sse4_1)
appendExt("SSE4.1");
if (x86.avx)
appendExt("AVX");
if (x86.avx2)
appendExt("AVX2");
if (x86.lzcnt)
appendExt("LZCNT");
if (x86.movbe)
appendExt("MOVBE");
if (x86.bmi2)
appendExt("BMI2");
if (x86.aesni)
appendExt("AES-NI");
if(x86.invariant_tsc)
appendExt("INVARIANT-TSC");
return tmp;
}
CPUFeaturesImpl g_CPUFeatures;

26
src/Common/cpu_features.h Normal file
View file

@ -0,0 +1,26 @@
class CPUFeaturesImpl
{
public:
CPUFeaturesImpl();
std::string GetCPUName(); // empty if not available
std::string GetCommaSeparatedExtensionList();
struct
{
bool ssse3{ false };
bool sse4_1{ false };
bool avx{ false };
bool avx2{ false };
bool lzcnt{ false };
bool movbe{ false };
bool bmi2{ false };
bool aesni{ false };
bool invariant_tsc{ false };
}x86;
private:
char m_cpuBrandName[0x40]{ 0 };
};
extern CPUFeaturesImpl g_CPUFeatures;

View file

@ -24,13 +24,22 @@
// }
// #endif
// arch defines
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
#define ARCH_X86_64
#endif
// c includes
#include <cstdint>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <cassert>
#if defined(ARCH_X86_64)
#include <immintrin.h>
#endif
// c++ includes
#include <string>
@ -106,11 +115,6 @@ using uint8le = uint8_t;
#include "Cemu/Logging/CemuDebugLogging.h"
#include "Cemu/Logging/CemuLogging.h"
// CPU extensions
extern bool _cpuExtension_SSSE3;
extern bool _cpuExtension_SSE4_1;
extern bool _cpuExtension_AVX2;
// manual endian-swapping
#if _MSC_VER
@ -251,30 +255,35 @@ inline uint64 _udiv128(uint64 highDividend, uint64 lowDividend, uint64 divisor,
#define NOEXPORT __attribute__ ((visibility ("hidden")))
#endif
#ifdef __GNUC__
#include <cpuid.h>
#endif
// On aarch64 we handle some of the x86 intrinsics by implementing them as wrappers
#if defined(__aarch64__)
inline void cpuid(int cpuInfo[4], int functionId) {
#if defined(_MSC_VER)
__cpuid(cpuInfo, functionId);
#elif defined(__GNUC__)
__cpuid(functionId, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]);
#else
#error No definition for cpuid
#endif
inline void _mm_pause()
{
asm volatile("yield");
}
inline void cpuidex(int cpuInfo[4], int functionId, int subFunctionId) {
#if defined(_MSC_VER)
__cpuidex(cpuInfo, functionId, subFunctionId);
#elif defined(__GNUC__)
__cpuid_count(functionId, subFunctionId, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]);
#else
#error No definition for cpuidex
#endif
inline uint64 __rdtsc()
{
uint64 t;
asm volatile("mrs %0, cntvct_el0" : "=r" (t));
return t;
}
inline void _mm_mfence()
{
}
inline unsigned char _addcarry_u64(unsigned char carry, unsigned long long a, unsigned long long b, unsigned long long *result)
{
*result = a + b + (unsigned long long)carry;
if (*result < a)
return 1;
return 0;
}
#endif
// MEMPTR
#include "Common/MemPtr.h"

View file

@ -1,39 +1,47 @@
project(CemuAsm C)
if (WIN32)
if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
enable_language(C ASM_MASM)
if (WIN32)
add_library(CemuAsm x64util_masm.asm)
set_source_files_properties(x64util_masm.asm PROPERTIES LANGUAGE ASM_MASM)
enable_language(C ASM_MASM)
# workaround for cr flag being passed to LINK.exe which considers it an input file and thus fails
# doesn't always seem to happen. The Windows CI builds were fine, but locally I would run into this problem
# possibly related to https://gitlab.kitware.com/cmake/cmake/-/issues/18889
set(CMAKE_ASM_MASM_CREATE_STATIC_LIBRARY "<CMAKE_AR> /OUT:<TARGET> <LINK_FLAGS> <OBJECTS>")
add_library(CemuAsm x64util_masm.asm)
set_source_files_properties(x64util_masm.asm PROPERTIES LANGUAGE ASM_MASM)
# workaround for cr flag being passed to LINK.exe which considers it an input file and thus fails
# doesn't always seem to happen. The Windows CI builds were fine, but locally I would run into this problem
# possibly related to https://gitlab.kitware.com/cmake/cmake/-/issues/18889
set(CMAKE_ASM_MASM_CREATE_STATIC_LIBRARY "<CMAKE_AR> /OUT:<TARGET> <LINK_FLAGS> <OBJECTS>")
set_property(TARGET CemuAsm PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
else()
# NASM
if (APPLE)
set(CMAKE_ASM_NASM_COMPILE_OBJECT "<CMAKE_ASM_NASM_COMPILER> -g -Fdwarf -f macho64 --prefix _ -o <OBJECT> <SOURCE>")
else()
set(CMAKE_ASM_NASM_COMPILE_OBJECT "<CMAKE_ASM_NASM_COMPILER> -g -Fdwarf -f elf64 -o <OBJECT> <SOURCE>")
endif()
set(CMAKE_ASM_NASM_LINK_EXECUTABLE "ld <FLAGS> <CMAKE_ASM_NASM_LINK_FLAGS> <LINK_FLAGS> -fPIC <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
enable_language(C ASM_NASM)
add_library(CemuAsm x64util_nasm.asm)
set_source_files_properties(x64util_nasm.asm PROPERTIES LANGUAGE ASM_NASM)
if (APPLE)
set_target_properties(CemuAsm PROPERTIES NASM_OBJ_FORMAT macho64)
else()
set_target_properties(CemuAsm PROPERTIES NASM_OBJ_FORMAT elf64)
endif()
set_target_properties(CemuAsm PROPERTIES LINKER_LANGUAGE C)
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(AARCH64)")
add_library(CemuAsm stub.cpp)
else()
# NASM
if (APPLE)
set(CMAKE_ASM_NASM_COMPILE_OBJECT "<CMAKE_ASM_NASM_COMPILER> -g -Fdwarf -f macho64 --prefix _ -o <OBJECT> <SOURCE>")
else()
set(CMAKE_ASM_NASM_COMPILE_OBJECT "<CMAKE_ASM_NASM_COMPILER> -g -Fdwarf -f elf64 -o <OBJECT> <SOURCE>")
endif()
set(CMAKE_ASM_NASM_LINK_EXECUTABLE "ld <FLAGS> <CMAKE_ASM_NASM_LINK_FLAGS> <LINK_FLAGS> -fPIC <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
enable_language(C ASM_NASM)
add_library(CemuAsm x64util_nasm.asm)
set_source_files_properties(x64util_nasm.asm PROPERTIES LANGUAGE ASM_NASM)
if (APPLE)
set_target_properties(CemuAsm PROPERTIES NASM_OBJ_FORMAT macho64)
else()
set_target_properties(CemuAsm PROPERTIES NASM_OBJ_FORMAT elf64)
endif()
set_target_properties(CemuAsm PROPERTIES LINKER_LANGUAGE C)
message(STATUS "CemuAsm - Unsupported arch: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
set_property(TARGET CemuAsm PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")

1
src/asm/stub.cpp Normal file
View file

@ -0,0 +1 @@

View file

@ -1,4 +1,20 @@
#pragma once
#if defined(ARCH_X86_64)
extern "C" void recompiler_fres();
extern "C" void recompiler_frsqrte();
#else
// stubbed on non-x86 for now
static void recompiler_fres()
{
cemu_assert_unimplemented();
}
static void recompiler_frsqrte()
{
cemu_assert_unimplemented();
}
#endif

View file

@ -64,6 +64,7 @@ void CemuConfig::Load(XMLConfigParser& parser)
save_screenshot = parser.get("save_screenshot", save_screenshot);
did_show_vulkan_warning = parser.get("vk_warning", did_show_vulkan_warning);
did_show_graphic_pack_download = parser.get("gp_download", did_show_graphic_pack_download);
did_show_macos_disclaimer = parser.get("macos_disclaimer", did_show_macos_disclaimer);
fullscreen = parser.get("fullscreen", fullscreen);
proxy_server = parser.get("proxy_server", "");
@ -365,6 +366,7 @@ void CemuConfig::Save(XMLConfigParser& parser)
config.set<bool>("save_screenshot", save_screenshot);
config.set<bool>("vk_warning", did_show_vulkan_warning);
config.set<bool>("gp_download", did_show_graphic_pack_download);
config.set<bool>("macos_disclaimer", did_show_macos_disclaimer);
config.set<bool>("fullscreen", fullscreen);
config.set("proxy_server", proxy_server.GetValue().c_str());

View file

@ -407,6 +407,7 @@ struct CemuConfig
ConfigValue<bool> did_show_vulkan_warning{false};
ConfigValue<bool> did_show_graphic_pack_download{false};
ConfigValue<bool> did_show_macos_disclaimer{false};
int game_list_style = 0;
std::string game_list_column_order;

View file

@ -174,6 +174,23 @@ bool CemuApp::OnInit()
SetTopWindow(m_mainFrame);
m_mainFrame->Show();
// show warning on macOS about state of builds
#if BOOST_OS_MACOS
if (!GetConfig().did_show_macos_disclaimer)
{
const auto message = _(
"Thank you for testing the in-development build of Cemu for macOS.\n \n"
"The macOS port is currently purely experimental and should not be considered stable or ready for issue-free gameplay. "
"There are also known issues with degraded performance due to the use of MoltenVk and Rosetta for ARM Macs. We appreciate your patience while we improve Cemu for macOS.");
wxMessageDialog dialog(nullptr, message, "Preview version", wxCENTRE | wxOK | wxICON_WARNING);
dialog.SetOKLabel(_("I understand"));
dialog.ShowModal();
GetConfig().did_show_macos_disclaimer = true;
g_config.Save();
}
#endif
return true;
}

View file

@ -19,6 +19,7 @@
#include "Cafe/TitleList/SaveList.h"
#include "Common/ExceptionHandler/ExceptionHandler.h"
#include "Common/cpu_features.h"
#include <wx/setup.h>
#include "util/helpers/helpers.h"
@ -37,8 +38,13 @@
#define SDL_MAIN_HANDLED
#include <SDL.h>
#if BOOST_OS_LINUX || BOOST_OS_MACOS
#if BOOST_OS_LINUX
#define _putenv(__s) putenv((char*)(__s))
#include <sys/sysinfo.h>
#elif BOOST_OS_MACOS
#define _putenv(__s) putenv((char*)(__s))
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#if BOOST_OS_WINDOWS
@ -49,41 +55,32 @@ extern "C"
}
#endif
bool _cpuExtension_SSSE3 = false;
bool _cpuExtension_SSE4_1 = false;
bool _cpuExtension_AVX2 = false;
std::atomic_bool g_isGPUInitFinished = false;
std::wstring executablePath;
void logCPUAndMemoryInfo()
{
#if BOOST_OS_WINDOWS
int CPUInfo[4] = { -1 };
unsigned nExIds, i = 0;
char CPUBrandString[0x40];
// Get the information associated with each extended ID.
cpuid(CPUInfo, 0x80000000);
nExIds = CPUInfo[0];
for (i = 0x80000000; i <= nExIds; ++i)
{
cpuid(CPUInfo, i);
// Interpret CPU brand string
if (i == 0x80000002)
memcpy(CPUBrandString, CPUInfo, sizeof(CPUInfo));
else if (i == 0x80000003)
memcpy(CPUBrandString + 16, CPUInfo, sizeof(CPUInfo));
else if (i == 0x80000004)
memcpy(CPUBrandString + 32, CPUInfo, sizeof(CPUInfo));
}
forceLog_printf("CPU: %s", CPUBrandString);
std::string cpuName = g_CPUFeatures.GetCPUName();
if (!cpuName.empty())
cemuLog_log(LogType::Force, "CPU: {}", cpuName);
#if BOOST_OS_WINDOWS
MEMORYSTATUSEX statex;
statex.dwLength = sizeof(statex);
GlobalMemoryStatusEx(&statex);
uint32 memoryInMB = (uint32)(statex.ullTotalPhys / 1024LL / 1024LL);
forceLog_printf("RAM: %uMB", memoryInMB);
#elif BOOST_OS_LINUX
struct sysinfo info {};
sysinfo(&info);
cemuLog_log(LogType::Force, "RAM: {}MB", ((static_cast<uint64_t>(info.totalram) * info.mem_unit) / 1024LL / 1024LL));
#elif BOOST_OS_MACOS
int64_t totalRam;
size_t size = sizeof(totalRam);
int result = sysctlbyname("hw.memsize", &totalRam, &size, NULL, 0);
if (result == 0)
cemuLog_log(LogType::Force, "RAM: {}MB", (totalRam / 1024LL / 1024LL));
#endif
}
@ -120,32 +117,7 @@ void infoLog_cemuStartup()
checkForWine();
// CPU and RAM info
logCPUAndMemoryInfo();
// extensions that Cemu uses
char cpuExtensionStr[256];
strcpy(cpuExtensionStr, "");
if (_cpuExtension_SSSE3)
{
strcat(cpuExtensionStr, "SSSE3");
}
if (_cpuExtension_SSE4_1)
{
if (cpuExtensionStr[0] != '\0')
strcat(cpuExtensionStr, ", ");
strcat(cpuExtensionStr, "SSE4.1");
}
if (_cpuExtension_AVX2)
{
if (cpuExtensionStr[0] != '\0')
strcat(cpuExtensionStr, ", ");
strcat(cpuExtensionStr, "AVX2");
}
if (AES128_useAESNI())
{
if (cpuExtensionStr[0] != '\0')
strcat(cpuExtensionStr, ", ");
strcat(cpuExtensionStr, "AES-NI");
}
cemuLog_force("Used CPU extensions: {}", cpuExtensionStr);
cemuLog_log(LogType::Force, "Used CPU extensions: {}", g_CPUFeatures.GetCommaSeparatedExtensionList());
}
// some implementations of _putenv dont copy the string and instead only store a pointer
@ -194,14 +166,6 @@ void mainEmulatorCommonInit()
AES128_init();
// init PPC timer (call this as early as possible because it measures frequency of RDTSC using an asynchronous thread over 3 seconds)
PPCTimer_init();
// check available CPU extensions
int cpuInfo[4];
cpuid(cpuInfo, 0x1);
_cpuExtension_SSSE3 = ((cpuInfo[2] >> 9) & 1) != 0;
_cpuExtension_SSE4_1 = ((cpuInfo[2] >> 19) & 1) != 0;
cpuidex(cpuInfo, 0x7, 0);
_cpuExtension_AVX2 = ((cpuInfo[1] >> 5) & 1) != 0;
#if BOOST_OS_WINDOWS
executablePath.resize(4096);
@ -382,4 +346,4 @@ int main(int argc, char *argv[])
extern "C" DLLEXPORT uint64 gameMeta_getTitleId()
{
return CafeSystem::GetForegroundTitleId();
}
}

View file

@ -10,6 +10,7 @@
/* Includes: */
/*****************************************************************************/
#include "aes128.h"
#include "Common/cpu_features.h"
/*****************************************************************************/
/* Defines: */
@ -23,8 +24,6 @@
// The number of rounds in AES Cipher.
#define Nr 10
bool useAESNI = false;
typedef uint8 state_t[4][4];
typedef struct
@ -601,6 +600,7 @@ void AES128_CBC_decrypt_updateIV(uint8* output, uint8* input, uint32 length, con
memcpy(iv, newIv, KEYLEN);
}
#if defined(ARCH_X86_64)
inline __m128i AESNI128_ASSIST(
__m128i temp1,
__m128i temp2)
@ -792,6 +792,7 @@ void __aesni__AES128_ECB_encrypt(uint8* input, const uint8* key, uint8* output)
feedback = _mm_aesenclast_si128(feedback, ((__m128i*)expandedKey)[10]);
_mm_storeu_si128(&((__m128i*)output)[0], feedback);
}
#endif
void(*AES128_ECB_encrypt)(uint8* input, const uint8* key, uint8* output);
void (*AES128_CBC_decrypt)(uint8* output, uint8* input, uint32 length, const uint8* key, const uint8* iv) = nullptr;
@ -836,10 +837,8 @@ void AES128_init()
lookupTable_multiply[i] = (vE << 0) | (v9 << 8) | (vD << 16) | (vB << 24);
}
// check if AES-NI is available
int v[4];
cpuid(v, 1);
useAESNI = (v[2] & 0x2000000) != 0;
if (useAESNI)
#if defined(ARCH_X86_64)
if (g_CPUFeatures.x86.aesni)
{
// AES-NI implementation
AES128_CBC_decrypt = __aesni__AES128_CBC_decrypt;
@ -851,9 +850,8 @@ void AES128_init()
AES128_CBC_decrypt = __soft__AES128_CBC_decrypt;
AES128_ECB_encrypt = __soft__AES128_ECB_encrypt;
}
#else
AES128_CBC_decrypt = __soft__AES128_CBC_decrypt;
AES128_ECB_encrypt = __soft__AES128_ECB_encrypt;
#endif
}
bool AES128_useAESNI()
{
return useAESNI;
}

View file

@ -2,7 +2,6 @@
#define _AES_H_
void AES128_init();
bool AES128_useAESNI();
extern void(*AES128_ECB_encrypt)(uint8* input, const uint8* key, uint8* output);

View file

@ -19,7 +19,8 @@ public:
{
if (!m_lockBool.exchange(true, std::memory_order_acquire))
break;
while (m_lockBool.load(std::memory_order_relaxed)) _mm_pause();
while (m_lockBool.load(std::memory_order_relaxed))
_mm_pause();
}
}
@ -36,4 +37,4 @@ public:
private:
mutable std::atomic<bool> m_lockBool = false;
};
};