Skip to content

Runtime SSE4.2 detection: #20

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ CC= $(DEFAULT_CC)
# unwinding are not affected -- the assembler part has frame unwind
# information and GCC emits it where needed (x64) or with -g (see CCDEBUG).
CCOPT= -O2 -fomit-frame-pointer
# Only apply -msse4.2 to the target build; otherwise the host-built minilua fails on hardware without SSE4.2.
# TARGET_CFLAGS=-msse4.2
# Use this if you want to generate a smaller binary (but it's slower):
#CCOPT= -Os -fomit-frame-pointer
# Note: it's no longer recommended to use -O3 with GCC 4.x.
Expand Down
101 changes: 1 addition & 100 deletions src/lib_jit.c
Original file line number Diff line number Diff line change
Expand Up @@ -659,114 +659,15 @@ JIT_PARAMDEF(JIT_PARAMINIT)
};
#endif

#if LJ_TARGET_ARM && LJ_TARGET_LINUX
#include <sys/utsname.h>
#endif

/* Arch-dependent CPU detection.
**
** Returns a bitmask of JIT_F_* CPU feature flags for the architecture that
** was selected at compile time (x86/x64, ARM, ARM64, PPC or MIPS).
** On x86 (not x64) a Lua error is raised through L when SSE2 is not reported.
*/
static uint32_t jit_cpudetect(lua_State *L)
{
uint32_t flags = 0;
#if LJ_TARGET_X86ORX64
uint32_t vendor[4];
uint32_t features[4];
/* Query leaf 0 into vendor[] and leaf 1 into features[]; if either call
** returns zero, no x86 feature flag is set at all.
*/
if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
/* Without the JIT, JIT_F_SSE2 is defined locally so the SSE2 check below
** still compiles.
*/
#if !LJ_HASJIT
#define JIT_F_SSE2 2
#endif
/* Each feature bit (0 or 1) is multiplied by its flag value, which sets the
** flag without a branch.
*/
flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
#if LJ_HASJIT
flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
/* Vendor-specific tuning flags, selected by the third vendor word. */
if (vendor[2] == 0x6c65746e) { /* Intel. */
if ((features[0] & 0x0fff0ff0) == 0x000106c0) /* Atom. */
flags |= JIT_F_LEA_AGU;
} else if (vendor[2] == 0x444d4163) { /* AMD. */
uint32_t fam = (features[0] & 0x0ff00f00);
if (fam >= 0x00000f00) /* K8, K10. */
flags |= JIT_F_PREFER_IMUL;
}
/* NOTE(review): vendor[0] presumably holds the highest supported CPUID leaf;
** leaf 7 is only queried when it is at least 7 -- confirm against lj_vm_cpuid.
** The return value of this third call is not checked.
*/
if (vendor[0] >= 7) {
uint32_t xfeatures[4];
lj_vm_cpuid(7, xfeatures);
flags |= ((xfeatures[1] >> 8)&1) * JIT_F_BMI2;
}
#endif
}
/* Check for required instruction set support on x86 (unnecessary on x64). */
#if LJ_TARGET_X86
if (!(flags & JIT_F_SSE2))
luaL_error(L, "CPU with SSE2 required");
#endif
#elif LJ_TARGET_ARM
#if LJ_HASJIT
int ver = LJ_ARCH_VERSION; /* Compile-time ARM CPU detection. */
#if LJ_TARGET_LINUX
if (ver < 70) { /* Runtime ARM CPU detection. */
struct utsname ut;
/* NOTE(review): the result of uname() is not checked before ut.machine is
** read.
*/
uname(&ut);
/* A machine string of the form "armv<digit>..." may raise the version. */
if (strncmp(ut.machine, "armv", 4) == 0) {
if (ut.machine[4] >= '7')
ver = 70;
else if (ut.machine[4] == '6')
ver = 60;
}
}
#endif
/* Map the version number to exactly one architecture flag (or none). */
flags |= ver >= 70 ? JIT_F_ARMV7 :
ver >= 61 ? JIT_F_ARMV6T2_ :
ver >= 60 ? JIT_F_ARMV6_ : 0;
/* With an FPU configured, pick VFPv3 for version >= 70, else VFPv2. */
flags |= LJ_ARCH_HASFPU == 0 ? 0 : ver >= 70 ? JIT_F_VFPV3 : JIT_F_VFPV2;
#endif
#elif LJ_TARGET_ARM64
/* No optional CPU features to detect (for now). */
#elif LJ_TARGET_PPC
#if LJ_HASJIT
/* PPC flags are decided purely at compile time. */
#if LJ_ARCH_SQRT
flags |= JIT_F_SQRT;
#endif
#if LJ_ARCH_ROUND
flags |= JIT_F_ROUND;
#endif
#endif
#elif LJ_TARGET_MIPS
#if LJ_HASJIT
/* Compile-time MIPS CPU detection. */
#if LJ_ARCH_VERSION >= 20
flags |= JIT_F_MIPSXXR2;
#endif
/* Runtime MIPS CPU detection. */
#if defined(__GNUC__)
/* Only probe at runtime when the compile-time check did not set the flag. */
if (!(flags & JIT_F_MIPSXXR2)) {
int x;
#ifdef __mips16
x = 0; /* Runtime detection is difficult. Ensure optimal -march flags. */
#else
/* On MIPS32R1 rotr is treated as srl. rotr r2,r2,1 -> srl r2,r2,1. */
__asm__("li $2, 1\n\t.long 0x00221042\n\tmove %0, $2" : "=r"(x) : : "$2");
#endif
if (x) flags |= JIT_F_MIPSXXR2; /* Either 0x80000000 (R2) or 0 (R1). */
}
#endif
#endif
#else
#error "Missing CPU detection for this architecture"
#endif
/* L is only referenced on the x86 error path above. */
UNUSED(L);
return flags;
}

/* Initialize JIT compiler.
**
** Runs CPU detection for L. When the JIT is compiled in, the JIT state's
** flags get JIT_F_ON and JIT_F_OPT_DEFAULT, the default parameters are copied
** into J->param and lj_dispatch_update() is called for the global state.
** Without the JIT the detected flags are discarded.
*/
static void jit_init(lua_State *L)
{
uint32_t flags = jit_cpudetect(L);
#if LJ_HASJIT
jit_State *J = L2J(L);
/* NOTE(review): this listing is a rendered diff. The next two assignments
** look like the removed and the added version of the same statement; the
** second one keeps whatever is already in J->flags -- confirm which line
** the file actually contains.
*/
J->flags = flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
J->flags = J->flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
/* Start from the built-in default JIT parameters. */
memcpy(J->param, jit_param_default, sizeof(J->param));
lj_dispatch_update(G(L));
#else
UNUSED(flags);
#endif
}

Expand Down
1 change: 1 addition & 0 deletions src/lj_jit.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#define JIT_F_PREFER_IMUL 0x00000080
#define JIT_F_LEA_AGU 0x00000100
#define JIT_F_BMI2 0x00000200
#define JIT_F_SSE4_2 0x00000400

/* Names for the CPU-specific flags. Must match the order above. */
#define JIT_F_CPU_FIRST JIT_F_SSE2
Expand Down
116 changes: 116 additions & 0 deletions src/lj_state.c
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,106 @@ static void close_state(lua_State *L)
g->allocf(g->allocd, G2GG(g), sizeof(GG_State), 0);
}

#if LJ_TARGET_ARM && LJ_TARGET_LINUX
#include <sys/utsname.h>
#endif

/* Arch-dependent CPU detection.
**
** Returns a bitmask of JIT_F_* CPU feature flags for the architecture that
** was selected at compile time (x86/x64, ARM, ARM64, PPC or MIPS).
** On x86 (not x64) a Lua error is raised through L when SSE2 is not
** reported, so L must be usable for luaL_error() on that target.
**
** NOTE(review): a file-scope identifier starting with an underscore is
** reserved in C; the name is kept because the caller in this file uses it.
*/
static uint32_t _cpudetect(lua_State *L)
{
  uint32_t flags = 0;
#if LJ_TARGET_X86ORX64
  uint32_t vendor[4];
  uint32_t features[4];
  /* Query leaf 0 into vendor[] and leaf 1 into features[]; if either call
  ** returns zero, no x86 feature flag is set at all.
  */
  if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
    /* Without the JIT, JIT_F_SSE2 is defined locally so the SSE2 check
    ** below still compiles.
    */
#if !LJ_HASJIT
#define JIT_F_SSE2	2
#endif
    /* Each feature bit (0 or 1) is multiplied by its flag value, which sets
    ** the flag without a branch.
    */
    flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
#if LJ_HASJIT
    flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
    flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
    flags |= ((features[2] >> 20)&1) * JIT_F_SSE4_2;
    /* Vendor-specific tuning flags, selected by the third vendor word. */
    if (vendor[2] == 0x6c65746e) {  /* Intel. */
      if ((features[0] & 0x0fff0ff0) == 0x000106c0)  /* Atom. */
	flags |= JIT_F_LEA_AGU;
    } else if (vendor[2] == 0x444d4163) {  /* AMD. */
      uint32_t fam = (features[0] & 0x0ff00f00);
      if (fam >= 0x00000f00)  /* K8, K10. */
	flags |= JIT_F_PREFER_IMUL;
    }
    /* NOTE(review): vendor[0] presumably holds the highest supported CPUID
    ** leaf; leaf 7 is only queried when it is at least 7.
    */
    if (vendor[0] >= 7) {
      uint32_t xfeatures[4];
      lj_vm_cpuid(7, xfeatures);
      flags |= ((xfeatures[1] >> 8)&1) * JIT_F_BMI2;
    }
#endif
  }
  /* Check for required instruction set support on x86 (unnecessary on x64). */
#if LJ_TARGET_X86
  if (!(flags & JIT_F_SSE2))
    luaL_error(L, "CPU with SSE2 required");
#endif
#elif LJ_TARGET_ARM
#if LJ_HASJIT
  int ver = LJ_ARCH_VERSION;  /* Compile-time ARM CPU detection. */
#if LJ_TARGET_LINUX
  if (ver < 70) {  /* Runtime ARM CPU detection. */
    struct utsname ut;
    /* uname() can fail and then leaves ut indeterminate; in that case keep
    ** the compile-time version instead of reading ut.machine.
    */
    if (uname(&ut) == 0 && strncmp(ut.machine, "armv", 4) == 0) {
      if (ut.machine[4] >= '7')
	ver = 70;
      else if (ut.machine[4] == '6')
	ver = 60;
    }
  }
#endif
  /* Map the version number to exactly one architecture flag (or none). */
  flags |= ver >= 70 ? JIT_F_ARMV7 :
	   ver >= 61 ? JIT_F_ARMV6T2_ :
	   ver >= 60 ? JIT_F_ARMV6_ : 0;
  /* With an FPU configured, pick VFPv3 for version >= 70, else VFPv2. */
  flags |= LJ_ARCH_HASFPU == 0 ? 0 : ver >= 70 ? JIT_F_VFPV3 : JIT_F_VFPV2;
#endif
#elif LJ_TARGET_ARM64
  /* No optional CPU features to detect (for now). */
#elif LJ_TARGET_PPC
#if LJ_HASJIT
  /* PPC flags are decided purely at compile time. */
#if LJ_ARCH_SQRT
  flags |= JIT_F_SQRT;
#endif
#if LJ_ARCH_ROUND
  flags |= JIT_F_ROUND;
#endif
#endif
#elif LJ_TARGET_MIPS
#if LJ_HASJIT
  /* Compile-time MIPS CPU detection. */
#if LJ_ARCH_VERSION >= 20
  flags |= JIT_F_MIPSXXR2;
#endif
  /* Runtime MIPS CPU detection. */
#if defined(__GNUC__)
  /* Only probe at runtime when the compile-time check did not set the flag. */
  if (!(flags & JIT_F_MIPSXXR2)) {
    int x;
#ifdef __mips16
    x = 0;  /* Runtime detection is difficult. Ensure optimal -march flags. */
#else
    /* On MIPS32R1 rotr is treated as srl. rotr r2,r2,1 -> srl r2,r2,1. */
    __asm__("li $2, 1\n\t.long 0x00221042\n\tmove %0, $2" : "=r"(x) : : "$2");
#endif
    if (x) flags |= JIT_F_MIPSXXR2;  /* Either 0x80000000 (R2) or 0 (R1). */
  }
#endif
#endif
#else
#error "Missing CPU detection for this architecture"
#endif
  /* L is only referenced on the x86 error path above. */
  UNUSED(L);
  return flags;
}

/* Defined in x64/src/lj_str_hash_x64.h; declared with a full prototype so the
** declaration matches the definition's (void) parameter list.
*/
extern void x64_init_random(void);

#if LJ_64 && !LJ_GC64 && !(defined(LUAJIT_USE_VALGRIND) && defined(LUAJIT_USE_SYSMALLOC))
lua_State *lj_state_newstate(lua_Alloc f, void *ud)
#else
Expand All @@ -188,7 +288,20 @@ LUA_API lua_State *lua_newstate(lua_Alloc f, void *ud)
{
GG_State *GG = (GG_State *)f(ud, NULL, 0, sizeof(GG_State));
lua_State *L = &GG->L;

/* detect cpu features as early as possible */
/* and init random table if we have SSE4.2 support */
uint32_t flags = _cpudetect(L);

#if defined(__SSE4_2__)
if (flags & JIT_F_SSE4_2)
{
x64_init_random();
}
#endif

global_State *g = &GG->g;

if (GG == NULL || !checkptrGC(GG)) return NULL;
memset(GG, 0, sizeof(GG_State));
L->gct = ~LJ_TTHREAD;
Expand Down Expand Up @@ -219,6 +332,9 @@ LUA_API lua_State *lua_newstate(lua_Alloc f, void *ud)
g->gc.stepmul = LUAI_GCMUL;
lj_dispatch_init((GG_State *)L);
L->status = LUA_ERRERR+1; /* Avoid touching the stack upon memory error. */

G2J(g)->flags = flags; /* copy detected flags to jit state */

if (lj_vm_cpcall(L, NULL, NULL, cpluaopen) != 0) {
/* Memory allocation error: free partial state. */
close_state(L);
Expand Down
21 changes: 14 additions & 7 deletions src/lj_str.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "lj_err.h"
#include "lj_str.h"
#include "lj_char.h"
#include "lj_dispatch.h" /* for G2J */

/* -- String helpers ------------------------------------------------------ */

Expand Down Expand Up @@ -165,12 +166,6 @@ lj_str_indep_hash(GCstr *str) {

#include "x64/src/lj_str_hash_x64.h"

#if defined(LJ_ARCH_STR_HASH)
#define LJ_STR_HASH LJ_ARCH_STR_HASH
#else
#define LJ_STR_HASH lj_str_original_hash
#endif

/* Intern a string and return string object. */
GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
{
Expand All @@ -187,7 +182,19 @@ GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
return &g->strempty;
}

h = LJ_STR_HASH(str, lenx);
/* switch between sse and non-sse hash branches */
#if defined(__SSE4_2__)
if ((G2J(g)->flags & JIT_F_SSE4_2))
{
h = lj_str_sse_hash(str, lenx);
}
else
{
h = lj_str_original_hash(str, lenx);
}
#else
h = lj_str_original_hash(str, lenx);
#endif

/* Check if the string has already been interned. */
o = gcref(g->strhash[h & g->strmask]);
Expand Down
12 changes: 2 additions & 10 deletions src/x64/src/lj_str_hash_x64.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ static LJ_AINLINE uint32_t log2_floor(uint32_t n)
/* This function is to populate `random_pos` such that random_pos[i][*]
* contains random value in the range of [2**i, 2**(i+1)).
*/
static void x64_init_random(void)
void x64_init_random(void)
{
int i, seed, rml;

Expand Down Expand Up @@ -185,11 +185,6 @@ static void x64_init_random(void)
}
#undef POW2_MASK

/* Calls x64_init_random() from a function marked with the compiler's
** constructor attribute, so the table setup does not depend on an explicit
** call elsewhere.
*/
void __attribute__((constructor)) x64_init_random_constructor(void)
{
  x64_init_random();
}

/* Return a pre-computed random number in the range of [2**chunk_sz_order,
* 2**(chunk_sz_order+1)). It is "unsafe" in the sense that the return value
* may be greater than chunk-size; it is up to the caller to make sure
Expand Down Expand Up @@ -246,7 +241,7 @@ static LJ_NOINLINE uint32_t lj_str_hash_128_above(const char* str,
}

/* NOTE: the "len" should not be zero */
static LJ_AINLINE uint32_t lj_str_hash(const char* str, size_t len)
static LJ_AINLINE uint32_t lj_str_sse_hash(const char* str, size_t len)
{
if (len < 128) {
if (len >= 16) { /* [16, 128) */
Expand All @@ -264,8 +259,5 @@ static LJ_AINLINE uint32_t lj_str_hash(const char* str, size_t len)
return lj_str_hash_128_above(str, len);
}

#define LJ_ARCH_STR_HASH lj_str_hash
#else
#undef LJ_ARCH_STR_HASH
#endif
#endif /*_LJ_STR_HASH_X64_H_*/
Empty file modified src/x64/test/unit_test.sh
100644 → 100755
Empty file.