the-argus
diff --git a/‎.gdbinit
+1 b/‎.gdbinit
+1
diff --git a/‎build.zig
+31-10 b/‎build.zig
+31-10
diff --git a/‎include/vmath/client_query.h
+25 b/‎include/vmath/client_query.h
+25
diff --git a/‎include/vmath/decl/vec16_f32.h
+78 b/‎include/vmath/decl/vec16_f32.h
+78
@@ -0,0 +1 @@
+set environment CK_FORK=no
@@ -3,16 +3,25 @@ const builtin = @import("builtin");
 const zcc = @import("compile_commands");
 const app_name = "vmath";
 
-const test_flags = &[_][]const u8{
+const lib_flags = &[_][]const u8{
     "-std=c99", // need inline and restrict
     "-pedantic",
     "-Wall",
+    "-Werror",
+    "-march=znver1",
+
+    // flag only for lib
     "-Iinclude/",
-    "-march=znver1", // my pc architecture
 };
 
+// test flags: lib_flags minus its last entry ("-Iinclude/"), which must stay last
+const test_flags = lib_flags[0..(lib_flags.len - 1)];
+
 const test_source_files = &[_][]const u8{
     "vec2_f32.c",
+    "vec4_f32.c",
+    "vec8_f32.c",
+    "vec16_f32.c",
 };
 
 pub fn build(b: *std.Build) !void {
@@ -23,14 +32,24 @@ pub fn build(b: *std.Build) !void {
     var tests = std.ArrayList(*std.Build.Step.Compile).init(b.allocator);
     defer tests.deinit();
 
-    b.installDirectory(.{
-        .source_dir = .{ .src_path = .{
-            .sub_path = "include/vmath/",
-            .owner = b,
-        } },
-        .install_dir = .header,
-        .install_subdir = "vmath/",
+    var lib = b.addStaticLibrary(.{
+        .name = "vmath",
+        .optimize = optimize,
+        .target = target,
+        // TODO: figure out how to avoid linking libc. It is needed for mm_malloc
+        // in xmmintrin, but I'm pretty sure it's not needed in theory.
+        .link_libc = true,
+    });
+    lib.addCSourceFiles(.{
+        .root = b.path("src/"),
+        .files = &.{
+            "impl.c",
+            "memutil.c",
+        },
+        .flags = lib_flags,
     });
+    lib.installHeadersDirectory(b.path("include/"), "", .{});
+    b.installArtifact(lib);
 
     for (test_source_files) |source_file| {
         var test_exe = b.addExecutable(.{
@@ -45,8 +64,9 @@ pub fn build(b: *std.Build) !void {
             } },
             .flags = test_flags,
         });
-        test_exe.linkLibCpp();
+        test_exe.linkLibC();
         test_exe.linkSystemLibrary("check");
+        test_exe.linkLibrary(lib);
         try tests.append(test_exe);
     }
 
@@ -65,5 +85,6 @@ pub fn build(b: *std.Build) !void {
 
     try @import("templates/build.zig").generate(b, "code");
 
+    try tests.append(lib); // get intellisense for tests + lib
     zcc.createStep(b, "cdb", try tests.toOwnedSlice());
 }
@@ -0,0 +1,25 @@
+/*
+ * Header which defines functions related to querying information about the
+ * running program: runtime known cpu features, compiled features, etc
+ */
+#ifndef __VMATH_CLIENT_QUERY_H
+#define __VMATH_CLIENT_QUERY_H
+
+typedef enum
+{
+	VM_FEATURE_SCALAR = 0x0,
+	VM_FEATURE_SSE41 = 0x1,
+	VM_FEATURE_AVX = 0x2,
+	VM_FEATURE_AVX512 = 0x4,
+	VM_FEATURE_ARM_NEON = 0x8,
+	VM_FEATURE_RISCV_V1 = 0x10,
+} vm_feature_flags;
+
+vm_feature_flags vm_get_features(void);
+
+/// Get a string describing the feature which grants the current largest
+/// available SIMD register size. On ARM, this is always "Arm Neon". On x86 with
+/// AVX2 but not AVX512, this will return "x86 AVX2".
+const char* vm_get_feature_string(void);
+
+#endif
@@ -0,0 +1,78 @@
+#ifndef __VMATH_DECL_VEC16_F32_H
+#define __VMATH_DECL_VEC16_F32_H
+
+#include "vmath/internal/intrinsics.h"
+#include "vmath/internal/stdfloat.h"
+
+typedef struct VMATH_ALIGNED(64)
+{
+	vm_float32_t buffer[16];
+} vm_v16fs_t;
+
+#if defined(VMATH_AVX512_GENERIC_ENABLE)
+
+typedef __m512 vm_v16f_t;
+
+#elif defined(VMATH_AVX256_GENERIC_ENABLE)
+
+// emulate 512 bits with 2x256
+typedef struct VMATH_ALIGNED(64)
+{
+	__m256 buffer[2];
+} vm_v16f_t;
+
+#elif defined(VMATH_SSE41_ENABLE)
+
+// emulate 512 bits with 4x128
+typedef struct VMATH_ALIGNED(64)
+{
+	__m128 buffer[4];
+} vm_v16f_t;
+
+#elif defined(VMATH_ARM_ENABLE) || defined(VMATH_ARM64_ENABLE)
+#error ARM SIMD not implemented
+#elif defined(VMATH_RISCV_V1_ENABLE)
+#error RISCV vector extensions not implemented
+#else
+
+typedef struct
+{
+	vm_float32_t buffer[16];
+} vm_v16f_t;
+
+#endif
+
+/// Load 16 contiguous floats from memory. Memory must be 64 byte aligned.
+VMATH_INLINE_DECL vm_v16f_t vm_load_v16f(const vm_v16fs_t* vec);
+/// Load 16 contiguous floats from memory as a buffer of floats. Memory must be
+/// 64 byte aligned.
+VMATH_INLINE_DECL vm_v16f_t vm_loadb_v16f(const vm_float32_t vec[16]);
+/// Store 16 contiguous floats to memory. Memory must be 64 byte aligned.
+VMATH_INLINE_DECL void vm_store_v16f(vm_v16fs_t* output, vm_v16f_t vec);
+/// Store 16 contiguous floats to memory as a buffer of floats. Memory must be
+/// 64 byte aligned.
+VMATH_INLINE_DECL void vm_storeb_v16f(vm_float32_t output[16], vm_v16f_t vec);
+
+/// Load a float32 into all elements of a 16 element vector
+VMATH_INLINE_DECL vm_v16f_t vm_splat_v16f(vm_float32_t fill);
+
+/// Add two 16 element float32 vectors together, componentwise
+VMATH_INLINE_DECL vm_v16f_t vm_add_v16f(vm_v16f_t a, vm_v16f_t b);
+/// Subtract a 16 element float32 vector from another, componentwise
+VMATH_INLINE_DECL vm_v16f_t vm_sub_v16f(vm_v16f_t a, vm_v16f_t b);
+/// Multiply two 16 element float32 vectors together, componentwise
+VMATH_INLINE_DECL vm_v16f_t vm_mul_v16f(vm_v16f_t a, vm_v16f_t b);
+/// Divide a 16 element float32 vector by another, componentwise
+VMATH_INLINE_DECL vm_v16f_t vm_div_v16f(vm_v16f_t a, vm_v16f_t b);
+
+/// Add a constant float32 value to all the elements of a 16 element vector
+VMATH_INLINE_DECL vm_v16f_t vm_addc_v16f(vm_v16f_t a, vm_float32_t b);
+/// Subtract a constant float32 value from all the elements of a 16 element
+/// vector
+VMATH_INLINE_DECL vm_v16f_t vm_subc_v16f(vm_v16f_t a, vm_float32_t b);
+/// Multiply all the elements of a 16 element vector by a constant float32 value
+VMATH_INLINE_DECL vm_v16f_t vm_mulc_v16f(vm_v16f_t a, vm_float32_t b);
+/// Divide all the elements of a 16 element vector by a constant float32 value
+VMATH_INLINE_DECL vm_v16f_t vm_divc_v16f(vm_v16f_t a, vm_float32_t b);
+
+#endif // ifndef __VMATH_DECL_VEC16_F32_H