diff --git a/doc/manual/calling-c-and-fortran-code.rst b/doc/manual/calling-c-and-fortran-code.rst
index b3d214a7d8164..7e9505e62a9cd 100644
--- a/doc/manual/calling-c-and-fortran-code.rst
+++ b/doc/manual/calling-c-and-fortran-code.rst
@@ -564,7 +564,8 @@ In the future, some of these restrictions may be reduced or eliminated.
 SIMD Values
 ~~~~~~~~~~~
 
-Note: This feature is currently implemented on 64-bit x86 platforms only.
+Note: This feature is currently implemented on 64-bit x86
+and AArch64 platforms only.
 
 If a C/C++ routine has an argument or return value that is a native SIMD
 type, the corresponding Julia type is a homogeneous tuple
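As context for the manual change above, the mapping it documents looks like this from the Julia side. This is a minimal sketch: the C routine `add4f` and the library name `libvec` are hypothetical and not part of this patch.

```julia
# Hypothetical C routine: __m128 add4f(__m128 a, __m128 b);
# A native 128-bit SIMD argument corresponds to a homogeneous tuple
# of VecElement with matching element type and width (4 x Float32).
typealias m128 NTuple{4,VecElement{Float32}}

a = (VecElement(1.0f0), VecElement(2.0f0),
     VecElement(3.0f0), VecElement(4.0f0))
b = ntuple(i -> VecElement(Float32(10i)), 4)

# On 64-bit x86 and (with this patch) AArch64, the tuples are passed
# and returned directly in SIMD registers.
r = ccall((:add4f, "libvec"), m128, (m128, m128), a, b)
```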
diff --git a/src/abi_aarch64.cpp b/src/abi_aarch64.cpp
index b71328d274e9b..c53b02a2c9cb0 100644
--- a/src/abi_aarch64.cpp
+++ b/src/abi_aarch64.cpp
@@ -16,11 +16,53 @@ namespace {
 typedef bool AbiState;
 static const AbiState default_abi_state = 0;
 
+static Type *get_llvm_vectype(jl_datatype_t *dt)
+{
+    // Assume jl_is_datatype(dt) && !jl_is_abstracttype(dt)
+    // `!dt->mutabl && dt->pointerfree && !dt->haspadding && dt->nfields > 0`
+    size_t nfields = dt->nfields;
+    assert(nfields > 0);
+    if (nfields < 2)
+        return nullptr;
+    static Type *T_vec64 = VectorType::get(T_int32, 2);
+    static Type *T_vec128 = VectorType::get(T_int32, 4);
+    Type *lltype;
+    // Short vector should be either 8 bytes or 16 bytes.
+    // Note that there are only two distinct fundamental types for
+    // short vectors so we normalize them to <2 x i32> and <4 x i32>
+    switch (dt->size) {
+    case 8:
+        lltype = T_vec64;
+        break;
+    case 16:
+        lltype = T_vec128;
+        break;
+    default:
+        return nullptr;
+    }
+    // Since `dt` is pointer free and has no padding and is 8 or 16 in size,
+    // `ft0` must be concrete, immutable with no padding and we don't need
+    // to check if its size is legal since it is included in
+    // the homogeneity check.
+    jl_datatype_t *ft0 = (jl_datatype_t*)jl_field_type(dt, 0);
+    // `ft0` should be a `VecElement` type and the true element type
+    // should be a `bitstype`
+    if (ft0->name != jl_vecelement_typename ||
+        ((jl_datatype_t*)jl_field_type(ft0, 0))->nfields)
+        return nullptr;
+    for (int i = 1; i < nfields; i++) {
+        if (jl_field_type(dt, i) != (jl_value_t*)ft0) {
+            // Not homogeneous
+            return nullptr;
+        }
+    }
+    return lltype;
+}
+
 static Type *get_llvm_fptype(jl_datatype_t *dt)
 {
     // Assume jl_is_datatype(dt) && !jl_is_abstracttype(dt)
-    if (dt->mutabl || jl_datatype_nfields(dt) != 0)
-        return NULL;
+    // `!dt->mutabl && dt->pointerfree && !dt->haspadding && dt->nfields == 0`
     Type *lltype;
     // Check size first since it's cheaper.
     switch (dt->size) {
@@ -37,9 +79,17 @@ static Type *get_llvm_fptype(jl_datatype_t *dt)
         lltype = T_float128;
         break;
     default:
-        return NULL;
+        return nullptr;
     }
-    return jl_is_floattype((jl_value_t*)dt) ? lltype : NULL;
+    return jl_is_floattype((jl_value_t*)dt) ? lltype : nullptr;
+}
+
+static Type *get_llvm_fp_or_vectype(jl_datatype_t *dt)
+{
+    // Assume jl_is_datatype(dt) && !jl_is_abstracttype(dt)
+    if (dt->mutabl || !dt->pointerfree || dt->haspadding)
+        return nullptr;
+    return dt->nfields ? get_llvm_vectype(dt) : get_llvm_fptype(dt);
 }
 
 struct ElementType {
@@ -50,8 +100,6 @@ struct ElementType {
 
 // Whether a type is a homogeneous floating-point aggregates (HFA) or a
 // homogeneous short-vector aggregates (HVA). Returns the element type.
-// We only handle HFA of HP, SP, DP and QP here since these are the only ones we
-// have (no vectors).
 // An Homogeneous Aggregate is a Composite Type where all of the Fundamental
 // Data Types of the members that compose the type are the same.
 // Note that it is the fundamental types that are important and not the member
@@ -62,6 +110,7 @@ static bool isHFAorHVA(jl_datatype_t *dt, size_t dsz, size_t &nele, ElementType
     // dt is a pointerfree type, (all members are isbits)
     // dsz == dt->size > 0
     // 0 <= nele <= 3
+    // dt has no padding
 
     // We ignore zero sized member here. This isn't really consistent with
     // GCC for zero-sized array members. GCC seems to treat structs with
@@ -83,6 +132,14 @@ static bool isHFAorHVA(jl_datatype_t *dt, size_t dsz, size_t &nele, ElementType
             dt = (jl_datatype_t*)jl_field_type(dt, i);
             continue;
         }
+        if (Type *vectype = get_llvm_vectype(dt)) {
+            if ((ele.sz && dsz != ele.sz) || (ele.type && ele.type != vectype))
+                return false;
+            ele.type = vectype;
+            ele.sz = dsz;
+            nele++;
+            return true;
+        }
         // Otherwise, process each members
         for (;i < nfields;i++) {
             size_t fieldsz = jl_field_size(dt, i);
@@ -183,9 +240,7 @@ static Type *classify_arg(jl_value_t *ty, bool *fpreg, bool *onstack,
     // the argument is allocated to the least significant bits of register
     // v[NSRN]. The NSRN is incremented by one. The argument has now been
     // allocated.
-    // Note that this is missing QP float as well as short vector types since we
-    // don't really have those types.
-    if (get_llvm_fptype(dt)) {
+    if (get_llvm_fp_or_vectype(dt)) {
         *fpreg = true;
         return NULL;
     }
@@ -323,7 +378,7 @@ Type *preferred_llvm_type(jl_value_t *ty, bool)
     if (!jl_is_datatype(ty) || jl_is_abstracttype(ty))
         return NULL;
     jl_datatype_t *dt = (jl_datatype_t*)ty;
-    if (Type *fptype = get_llvm_fptype(dt))
+    if (Type *fptype = get_llvm_fp_or_vectype(dt))
         return fptype;
     bool fpreg = false;
     bool onstack = false;
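To make the classification above concrete, here are illustrative Julia counterparts of the AAPCS64 rules that `get_llvm_vectype` and `isHFAorHVA` implement. The type names are invented for this sketch and do not appear in the patch.

```julia
# HFA: every fundamental member is the same floating-point type,
# so the aggregate travels in consecutive FP registers (d0, d1).
immutable HFA2
    x::Float64
    y::Float64
end

# Not an HFA: the members have different fundamental types.
immutable NotHFA
    x::Float64
    y::Float32
end

# An 8-byte short vector; get_llvm_vectype normalizes it to <2 x i32>.
typealias V2xF32 NTuple{2,VecElement{Float32}}

# HVA: two identical short vectors, passed in v0 and v1.
immutable HVA2
    a::V2xF32
    b::V2xF32
end
```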
diff --git a/src/alloc.c b/src/alloc.c
index fda70983dd795..f3993cfc582ab 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -843,7 +843,7 @@ JL_DLLEXPORT jl_datatype_t *jl_new_uninitialized_datatype(size_t nfields, int8_t
 // For sake of Ahead-Of-Time (AOT) compilation, this routine has to work
 // without LLVM being available.
 unsigned jl_special_vector_alignment(size_t nfields, jl_value_t *t) {
-    if (!is_vecelement_type(t))
+    if (!jl_is_vecelement_type(t))
         return 0;
     // LLVM 3.7 and 3.8 either crash or generate wrong code for many
     // SIMD vector sizes N. It seems the rule is that N can have at
@@ -859,7 +859,7 @@ unsigned jl_special_vector_alignment(size_t nfields, jl_value_t *t) {
         return 0; // nfields has more than two 1s
     assert(jl_datatype_nfields(t)==1);
     jl_value_t *ty = jl_field_type(t, 0);
-    if( !jl_is_bitstype(ty) )
+    if (!jl_is_bitstype(ty))
         // LLVM requires that a vector element be a primitive type.
         // LLVM allows pointer types as vector elements, but until a
         // motivating use case comes up for Julia, we reject pointers.
diff --git a/src/ccalltest.c b/src/ccalltest.c
index 8f286f6f8fb6d..7533aafc4490e 100644
--- a/src/ccalltest.c
+++ b/src/ccalltest.c
@@ -344,6 +344,10 @@ JL_DLLEXPORT struct16 test_16(struct16 a, float b) {
     return a;
 }
 
+// Note for AArch64:
+// `i128` is a native type on aarch64 so the type here is wrong.
+// However, it happens to have the same calling convention as `[2 x i64]`
+// when used as first argument or return value.
 #define int128_t struct3b
 JL_DLLEXPORT int128_t test_128(int128_t a, int64_t b) {
     //Unpack a Int128
@@ -393,7 +397,8 @@ JL_DLLEXPORT void *test_echo_p(void *p) {
 
 #include <emmintrin.h>
 
-JL_DLLEXPORT __m128i test_m128i(__m128i a, __m128i b, __m128i c, __m128i d ) {
+JL_DLLEXPORT __m128i test_m128i(__m128i a, __m128i b, __m128i c, __m128i d )
+{
     // 64-bit x86 has only level 2 SSE, which does not have a <4 x int32> multiplication,
     // so we use floating-point instead, and assume caller knows about the hack.
     return _mm_add_epi32(a,
@@ -401,8 +406,73 @@ JL_DLLEXPORT __m128i test_m128i(__m128i a, __m128i b, __m128i c, __m128i d ) {
                          _mm_cvtepi32_ps(_mm_sub_epi32(c,d)))));
 }
 
-JL_DLLEXPORT __m128 test_m128(__m128 a, __m128 b, __m128 c, __m128 d ) {
+JL_DLLEXPORT __m128 test_m128(__m128 a, __m128 b, __m128 c, __m128 d )
+{
     return _mm_add_ps(a, _mm_mul_ps(b, _mm_sub_ps(c, d)));
 }
 
 #endif
+
+#ifdef _CPU_AARCH64_
+
+JL_DLLEXPORT __int128 test_aa64_i128_1(int64_t v1, __int128 v2)
+{
+    return v1 * 2 - v2;
+}
+
+typedef struct {
+    int32_t v1;
+    __int128 v2;
+} struct_aa64_1;
+
+JL_DLLEXPORT struct_aa64_1 test_aa64_i128_2(int64_t v1, __int128 v2,
+                                            struct_aa64_1 v3)
+{
+    struct_aa64_1 x = {(int32_t)v1 / 2 + 1 - v3.v1, v2 * 2 - 1 - v3.v2};
+    return x;
+}
+
+typedef struct {
+    __fp16 v1;
+    double v2;
+} struct_aa64_2;
+
+JL_DLLEXPORT __fp16 test_aa64_fp16_1(int v1, float v2, double v3, __fp16 v4)
+{
+    return (__fp16)(v1 + v2 * 2 + v3 * 3 + v4 * 4);
+}
+
+JL_DLLEXPORT struct_aa64_2 test_aa64_fp16_2(int v1, float v2,
+                                            double v3, __fp16 v4)
+{
+    struct_aa64_2 x = {v4 / 2 + 1, v1 * 2 + v2 * 4 - v3};
+    return x;
+}
+
+#include <arm_neon.h>
+
+JL_DLLEXPORT int64x2_t test_aa64_vec_1(int32x2_t v1, float _v2, int32x2_t v3)
+{
+    int v2 = (int)_v2;
+    return vmovl_s32(v1 * v2 + v3);
+}
+
+// This is a homogeneous short vector aggregate
+typedef struct {
+    int8x8_t v1;
+    float32x2_t v2;
+} struct_aa64_3;
+
+// This is NOT a homogeneous short vector aggregate
+typedef struct {
+    float32x2_t v2;
+    int16x8_t v1;
+} struct_aa64_4;
+
+JL_DLLEXPORT struct_aa64_3 test_aa64_vec_2(struct_aa64_3 v1, struct_aa64_4 v2)
+{
+    struct_aa64_3 x = {v1.v1 + vmovn_s16(v2.v1), v1.v2 - v2.v2};
+    return x;
+}
+
+#endif
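The `i128` note above is exactly what the new tests below exercise. As a quick illustration, a single call to `test_aa64_i128_1` would look like this on AArch64, assuming `libccalltest` has been loaded as in test/ccall.jl:

```julia
# Mirrors __int128 test_aa64_i128_1(int64_t v1, __int128 v2).
# Julia's Int128 happens to use the same registers as [2 x i64]
# in this position, so the call round-trips correctly.
v = ccall((:test_aa64_i128_1, libccalltest), Int128,
          (Int64, Int128), Int64(7), Int128(-3))
@test v == 7 * 2 - (-3)   # == 17, matching the C body above
```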
diff --git a/src/cgutils.cpp b/src/cgutils.cpp
index c0ceda7fcb1aa..62c70f327e1d0 100644
--- a/src/cgutils.cpp
+++ b/src/cgutils.cpp
@@ -385,7 +385,7 @@ static Type *julia_struct_to_llvm(jl_value_t *jt, bool *isboxed)
             latypes.push_back(lty);
         }
         if (!isTuple) {
-            if (is_vecelement_type(jt))
+            if (jl_is_vecelement_type(jt))
                 // VecElement type is unwrapped in LLVM
                 jst->struct_decl = latypes[0];
             else
@@ -1101,7 +1101,7 @@ static jl_cgval_t emit_getfield_knownidx(const jl_cgval_t &strct, unsigned idx,
     }
     else if (strct.ispointer()) { // something stack allocated
         Value *addr;
-        if (is_vecelement_type((jl_value_t*)jt))
+        if (jl_is_vecelement_type((jl_value_t*)jt))
             // VecElement types are unwrapped in LLVM.
             addr = strct.V;
         else
@@ -1678,7 +1678,7 @@ static jl_cgval_t emit_new_struct(jl_value_t *ty, size_t nargs, jl_value_t **arg
     // or instead initialize the stack buffer with stores
     bool init_as_value = false;
     if (lt->isVectorTy() ||
-        is_vecelement_type(ty) ||
+        jl_is_vecelement_type(ty) ||
        type_is_ghost(lt)) // maybe also check the size ?
         init_as_value = true;
 
@@ -1714,7 +1714,7 @@ static jl_cgval_t emit_new_struct(jl_value_t *ty, size_t nargs, jl_value_t **arg
                 strct = builder.CreateInsertValue(strct, fval, ArrayRef<unsigned>(&idx,1));
             else {
                 // Must be a VecElement type, which comes unwrapped in LLVM.
-                assert(is_vecelement_type(ty));
+                assert(jl_is_vecelement_type(ty));
                 strct = fval;
             }
         }
diff --git a/src/julia.h b/src/julia.h
index f7e5ec96e8dfe..adcf356ca3c1b 100644
--- a/src/julia.h
+++ b/src/julia.h
@@ -954,7 +954,7 @@ STATIC_INLINE int jl_is_tuple_type(void *t)
             ((jl_datatype_t*)(t))->name == jl_tuple_typename);
 }
 
-STATIC_INLINE int is_vecelement_type(jl_value_t* t)
+STATIC_INLINE int jl_is_vecelement_type(jl_value_t* t)
 {
     return (jl_is_datatype(t) &&
             ((jl_datatype_t*)(t))->name == jl_vecelement_typename);
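Since the renamed `jl_is_vecelement_type` drives the `VecElement` special cases in codegen, a brief sketch of the user-visible behavior may help: `VecElement{T}` is a single-field immutable that LLVM sees unwrapped, so homogeneous tuples of it become LLVM vector types. The size below assumes a typical 64-bit build.

```julia
# VecElement is a thin wrapper; its lone field is `value`.
e = VecElement(1.5f0)
@assert e.value === 1.5f0

# A homogeneous 4 x Float32 tuple maps to LLVM's <4 x float>,
# occupying 16 bytes like the __m128 it interoperates with.
typealias V4xF32 NTuple{4,VecElement{Float32}}
@assert sizeof(V4xF32) == 16
```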
diff --git a/test/ccall.jl b/test/ccall.jl
index 1788c994d6e52..1ffe0edcc70f9 100644
--- a/test/ccall.jl
+++ b/test/ccall.jl
@@ -531,7 +531,27 @@ typealias VecReg{N,T} NTuple{N,VecElement{T}}
 typealias V4xF32 VecReg{4,Float32}
 typealias V4xI32 VecReg{4,Int32}
 
-if Sys.ARCH==:x86_64
+immutable Struct_AA64_1
+    v1::Int32
+    v2::Int128
+end
+immutable Struct_AA64_2
+    v1::Float16
+    v2::Float64
+end
+
+# This is a homogeneous short vector aggregate
+immutable Struct_AA64_3
+    v1::VecReg{8,Int8}
+    v2::VecReg{2,Float32}
+end
+# This is NOT a homogeneous short vector aggregate
+immutable Struct_AA64_4
+    v2::VecReg{2,Float32}
+    v1::VecReg{8,Int16}
+end
+
+if Sys.ARCH === :x86_64
     function test_sse(a1::V4xF32,a2::V4xF32,a3::V4xF32,a4::V4xF32)
         ccall((:test_m128, libccalltest), V4xF32, (V4xF32,V4xF32,V4xF32,V4xF32), a1, a2, a3, a4)
     end
@@ -556,4 +576,63 @@ if Sys.ARCH==:x86_64
     # cfunction round-trip
     @test rt_sse(a1,a2,a3,a4) == r
     end
+elseif Sys.ARCH === :aarch64
+    for v1 in 1:99:1000, v2 in -100:-1999:-20000
+        @test ccall((:test_aa64_i128_1, libccalltest), Int128,
+                    (Int64, Int128), v1, v2) == v1 * 2 - v2
+    end
+    for v1 in 1:4, v2 in -4:-1, v3_1 in 3:5, v3_2 in 7:9
+        res = ccall((:test_aa64_i128_2, libccalltest), Struct_AA64_1,
+                    (Int64, Int128, Struct_AA64_1),
+                    v1, v2, Struct_AA64_1(v3_1, v3_2))
+        expected = Struct_AA64_1(v1 ÷ 2 + 1 - v3_1, v2 * 2 - 1 - v3_2)
+        @test res === expected
+    end
+    for v1 in 1:4, v2 in -4:-1, v3 in 3:5, v4 in -(1:3)
+        res = ccall((:test_aa64_fp16_1, libccalltest), Float16,
+                    (Cint, Float32, Float64, Float16),
+                    v1, v2, v3, v4)
+        expected = Float16(v1 + v2 * 2 + v3 * 3 + v4 * 4)
+        @test res === expected
+
+        res = ccall((:test_aa64_fp16_2, libccalltest), Struct_AA64_2,
+                    (Cint, Float32, Float64, Float16),
+                    v1, v2, v3, v4)
+        expected = Struct_AA64_2(v4 / 2 + 1, v1 * 2 + v2 * 4 - v3)
+        @test res === expected
+    end
+    for v1_1 in 1:4, v1_2 in -2:2, v2 in -4:-1, v3_1 in 3:5, v3_2 in 6:8
+        res = ccall((:test_aa64_vec_1, libccalltest),
+                    VecReg{2,Int64},
+                    (VecReg{2,Int32}, Float32, VecReg{2,Int32}),
+                    (VecElement(Int32(v1_1)), VecElement(Int32(v1_2))),
+                    v2, (VecElement(Int32(v3_1)), VecElement(Int32(v3_2))))
+        expected = (VecElement(v1_1 * v2 + v3_1), VecElement(v1_2 * v2 + v3_2))
+        @test res === expected
+    end
+    for v1_11 in 1:4, v1_12 in -2:2, v1_21 in 1:4, v1_22 in -2:2,
+        v2_11 in 1:4, v2_12 in -2:2, v2_21 in 1:4, v2_22 in -2:2
+
+        v1 = Struct_AA64_3((VecElement(Int8(v1_11)), VecElement(Int8(v1_12)),
+                            VecElement(Int8(0)), VecElement(Int8(0)),
+                            VecElement(Int8(0)), VecElement(Int8(0)),
+                            VecElement(Int8(0)), VecElement(Int8(0))),
+                           (VecElement(Float32(v1_21)),
+                            VecElement(Float32(v1_22))))
+        v2 = Struct_AA64_4((VecElement(Float32(v2_21)),
+                            VecElement(Float32(v2_22))),
+                           (VecElement(Int16(v2_11)), VecElement(Int16(v2_12)),
+                            VecElement(Int16(0)), VecElement(Int16(0)),
+                            VecElement(Int16(0)), VecElement(Int16(0)),
+                            VecElement(Int16(0)), VecElement(Int16(0))))
+        res = ccall((:test_aa64_vec_2, libccalltest),
+                    Struct_AA64_3, (Struct_AA64_3, Struct_AA64_4), v1, v2)
+        expected = Struct_AA64_3((VecElement(Int8(v1_11 + v2_11)),
+                                  VecElement(Int8(v1_12 + v2_12)),
+                                  VecElement(Int8(0)), VecElement(Int8(0)),
+                                  VecElement(Int8(0)), VecElement(Int8(0)),
+                                  VecElement(Int8(0)), VecElement(Int8(0))),
+                                 (VecElement(Float32(v1_21 - v2_21)),
+                                  VecElement(Float32(v1_22 - v2_22))))
+        @test res === expected
+    end
 end